From a088b858f16af85e3db359b6c6aaa92dd3bc0921 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Fri, 10 Apr 2020 09:43:20 +0200 Subject: efi/x86: Revert struct layout change to fix kexec boot regression Commit 0a67361dcdaa29 ("efi/x86: Remove runtime table address from kexec EFI setup data") removed the code that retrieves the non-remapped UEFI runtime services pointer from the data structure provided by kexec, as it was never really needed on the kexec boot path: mapping the runtime services table at its non-remapped address is only needed when calling SetVirtualAddressMap(), which never happens during a kexec boot in the first place. However, dropping the 'runtime' member from struct efi_setup_data was a mistake. That struct is shared ABI between the kernel and the kexec tooling for x86, and so we cannot simply change its layout. So let's put back the removed field, but call it 'unused' to reflect the fact that we never look at its contents. While at it, add a comment to remind our future selves that the layout is external ABI. Fixes: 0a67361dcdaa29 ("efi/x86: Remove runtime table address from kexec EFI setup data") Reported-by: Theodore Ts'o Tested-by: Theodore Ts'o Reviewed-by: Dave Young Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar --- arch/x86/include/asm/efi.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index cdcf48d52a12..8391c115c0ec 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -178,8 +178,10 @@ extern void efi_free_boot_services(void); extern pgd_t * __init efi_uv1_memmap_phys_prolog(void); extern void __init efi_uv1_memmap_phys_epilog(pgd_t *save_pgd); +/* kexec external ABI */ struct efi_setup_data { u64 fw_vendor; + u64 __unused; u64 tables; u64 smbios; u64 reserved[8]; -- cgit v1.2.3 From f14eec0a32038f2d6c05b8ea91c7701f65ce7418 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 13 Apr 2020 03:17:58 -0400 Subject: KVM: SVM: move more vmentry code to assembly Manipulate IF around vmload/vmsave to remove the confusing usage of local_irq_enable where interrupts are actually disabled via GIF. And stuff the RSB immediately without waiting for a RET to avoid Spectre-v2 attacks. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/nospec-branch.h | 21 --------------------- arch/x86/kvm/svm/svm.c | 7 ------- arch/x86/kvm/svm/vmenter.S | 9 +++++++++ 3 files changed, 9 insertions(+), 28 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 07e95dcb40ad..7e9a281e2660 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -237,27 +237,6 @@ enum ssb_mitigation { extern char __indirect_thunk_start[]; extern char __indirect_thunk_end[]; -/* - * On VMEXIT we must ensure that no RSB predictions learned in the guest - * can be followed in the host, by overwriting the RSB completely. Both - * retpoline and IBRS mitigations for Spectre v2 need this; only on future - * CPUs with IBRS_ALL *might* it be avoided. 
- */ -static inline void vmexit_fill_RSB(void) -{ -#ifdef CONFIG_RETPOLINE - unsigned long loops; - - asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE("jmp 910f", - __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), - X86_FEATURE_RETPOLINE) - "910:" - : "=r" (loops), ASM_CALL_CONSTRAINT - : : "memory" ); -#endif -} - static __always_inline void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) { diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2be5bbae3a40..117bb0b28535 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3330,13 +3330,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) */ x86_spec_ctrl_set_guest(svm->spec_ctrl, svm->virt_spec_ctrl); - local_irq_enable(); - __svm_vcpu_run(svm->vmcb_pa, (unsigned long *)&svm->vcpu.arch.regs); - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else @@ -3366,8 +3361,6 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) reload_tss(vcpu); - local_irq_disable(); - x86_spec_ctrl_restore_host(svm->spec_ctrl, svm->virt_spec_ctrl); vcpu->arch.cr2 = svm->vmcb->save.cr2; diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S index fa1af90067e9..723887e35e95 100644 --- a/arch/x86/kvm/svm/vmenter.S +++ b/arch/x86/kvm/svm/vmenter.S @@ -3,6 +3,7 @@ #include #include #include +#include #define WORD_SIZE (BITS_PER_LONG / 8) @@ -78,6 +79,7 @@ SYM_FUNC_START(__svm_vcpu_run) pop %_ASM_AX /* Enter guest mode */ + sti 1: vmload %_ASM_AX jmp 3f 2: cmpb $0, kvm_rebooting @@ -99,6 +101,13 @@ SYM_FUNC_START(__svm_vcpu_run) ud2 _ASM_EXTABLE(5b, 6b) 7: + cli + +#ifdef CONFIG_RETPOLINE + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE +#endif + /* "POP" @regs to RAX. */ pop %_ASM_AX -- cgit v1.2.3 From 593309423cbad0fab659a685834416cf12d8f581 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 4 Apr 2020 01:33:05 +0200 Subject: x86/32: Remove CONFIG_DOUBLEFAULT Make the doublefault exception handler unconditional on 32-bit. Yes, it is important to be able to catch #DF exceptions instead of silent reboots. Yes, the code size increase is worth every byte. And one less CONFIG symbol is just the cherry on top. No functional changes. Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200404083646.8897-1-bp@alien8.de --- arch/x86/Kconfig.debug | 9 --------- arch/x86/entry/entry_32.S | 2 -- arch/x86/include/asm/doublefault.h | 2 +- arch/x86/include/asm/traps.h | 2 -- arch/x86/kernel/Makefile | 4 +--- arch/x86/kernel/dumpstack_32.c | 4 ---- arch/x86/kernel/traps.c | 2 -- arch/x86/mm/cpu_entry_area.c | 4 +--- tools/testing/selftests/wireguard/qemu/debug.config | 1 - 9 files changed, 3 insertions(+), 27 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 2e74690b028a..f909d3ce36e6 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -99,15 +99,6 @@ config DEBUG_WX If in doubt, say "Y". -config DOUBLEFAULT - default y - bool "Enable doublefault exception handler" if EXPERT && X86_32 - ---help--- - This option allows trapping of rare doublefault exceptions that - would otherwise cause a system to silently reboot. Disabling this - option saves about 4k and might cause you much additional grey - hair. 
- config DEBUG_TLBFLUSH bool "Set upper limit of TLB entries to flush one-by-one" depends on DEBUG_KERNEL diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b67bae7091d7..5c9c7eee6325 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1536,7 +1536,6 @@ SYM_CODE_START(debug) jmp common_exception SYM_CODE_END(debug) -#ifdef CONFIG_DOUBLEFAULT SYM_CODE_START(double_fault) 1: /* @@ -1576,7 +1575,6 @@ SYM_CODE_START(double_fault) hlt jmp 1b SYM_CODE_END(double_fault) -#endif /* * NMI is doubly nasty. It can happen on the first instruction of diff --git a/arch/x86/include/asm/doublefault.h b/arch/x86/include/asm/doublefault.h index af9a14ac8962..54a6e4a2e132 100644 --- a/arch/x86/include/asm/doublefault.h +++ b/arch/x86/include/asm/doublefault.h @@ -2,7 +2,7 @@ #ifndef _ASM_X86_DOUBLEFAULT_H #define _ASM_X86_DOUBLEFAULT_H -#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +#ifdef CONFIG_X86_32 extern void doublefault_init_cpu_tss(void); #else static inline void doublefault_init_cpu_tss(void) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c26a7e1d8a2c..70bd0f356e5d 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -69,9 +69,7 @@ dotraplinkage void do_overflow(struct pt_regs *regs, long error_code); dotraplinkage void do_bounds(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_op(struct pt_regs *regs, long error_code); dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code); -#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT) dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2); -#endif dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *regs, long error_code); dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code); dotraplinkage void do_segment_not_present(struct pt_regs *regs, long error_code); diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index ba89cabe5fcf..2a7c3afa62e2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,9 +102,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-y += kprobes/ obj-$(CONFIG_MODULES) += module.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o -endif +obj-$(CONFIG_X86_32) += doublefault_32.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 8e3a8fedfa4d..722fd712e1cf 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,6 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id()); struct doublefault_stack *ss = &cea->doublefault_stack; @@ -103,9 +102,6 @@ static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info) info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp); return true; -#else - return false; -#endif } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..e85561fc0dc8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -326,7 +326,6 @@ __visible void __noreturn handle_stack_overflow(const char *message, } #endif -#if defined(CONFIG_X86_64) || 
defined(CONFIG_DOUBLEFAULT) /* * Runs on an IST stack for x86_64 and on a special task stack for x86_32. * @@ -450,7 +449,6 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign die("double fault", regs, error_code); panic("Machine halted."); } -#endif dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 56f9189bbadb..5199d8a1daf1 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -17,7 +17,7 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif -#if defined(CONFIG_X86_32) && defined(CONFIG_DOUBLEFAULT) +#ifdef CONFIG_X86_32 DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack); #endif @@ -114,12 +114,10 @@ static void __init percpu_setup_exception_stacks(unsigned int cpu) #else static inline void percpu_setup_exception_stacks(unsigned int cpu) { -#ifdef CONFIG_DOUBLEFAULT struct cpu_entry_area *cea = get_cpu_entry_area(cpu); cea_map_percpu_pages(&cea->doublefault_stack, &per_cpu(doublefault_stack, cpu), 1, PAGE_KERNEL); -#endif } #endif diff --git a/tools/testing/selftests/wireguard/qemu/debug.config b/tools/testing/selftests/wireguard/qemu/debug.config index 5909e7ef2a5c..807fa7dc60b8 100644 --- a/tools/testing/selftests/wireguard/qemu/debug.config +++ b/tools/testing/selftests/wireguard/qemu/debug.config @@ -58,7 +58,6 @@ CONFIG_RCU_EQS_DEBUG=y CONFIG_USER_STACKTRACE_SUPPORT=y CONFIG_DEBUG_SG=y CONFIG_DEBUG_NOTIFIERS=y -CONFIG_DOUBLEFAULT=y CONFIG_X86_DEBUG_FPU=y CONFIG_DEBUG_SECTION_MISMATCH=y CONFIG_DEBUG_PAGEALLOC=y -- cgit v1.2.3 From bdf89df3c54518eed879d8fac7577fcfb220c67e Mon Sep 17 00:00:00 2001 From: John Allen Date: Thu, 9 Apr 2020 10:34:29 -0500 Subject: x86/microcode/AMD: Increase microcode PATCH_MAX_SIZE Future AMD CPUs will have microcode patches that exceed the default 4K patch size. Raise our limit. Signed-off-by: John Allen Signed-off-by: Borislav Petkov Cc: stable@vger.kernel.org # v4.14.. Link: https://lkml.kernel.org/r/20200409152931.GA685273@mojo.amd.com --- arch/x86/include/asm/microcode_amd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 6685e1218959..7063b5a43220 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -41,7 +41,7 @@ struct microcode_amd { unsigned int mpb[0]; }; -#define PATCH_MAX_SIZE PAGE_SIZE +#define PATCH_MAX_SIZE (3 * PAGE_SIZE) #ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); -- cgit v1.2.3 From 53b3d8e9d57753295b33065f80b1e2fb4fcb946d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:01 -0700 Subject: KVM: x86: Export kvm_propagate_fault() (as kvm_inject_emulated_page_fault) Export the page fault propagation helper so that VMX can use it to correctly emulate TLB invalidation on page faults in an upcoming patch. In the (hopefully) not-too-distant future, SGX virtualization will also want access to the helper for injecting page faults to the correct level (L1 vs. L2) when emulating ENCLS instructions. Rename the function to kvm_inject_emulated_page_fault() to clarify that it is (a) injecting a fault and (b) only for page faults. WARN if it's invoked with an exception other than PF_VECTOR. 
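As a sketch (not part of the patch): one way an emulation path might hand a guest #PF to the newly exported helper. The struct x86_exception field names used here are an assumption based on how the helper consumes the fault below, so treat this as illustrative only.

/*
 * Illustrative sketch only: report an emulated guest #PF through the
 * exported helper. Field names are assumed, not taken from this patch.
 */
static void example_report_emulated_page_fault(struct kvm_vcpu *vcpu,
						gva_t gva, u16 error_code)
{
	struct x86_exception fault = {
		.vector            = PF_VECTOR,
		.error_code_valid  = true,
		.error_code        = error_code,
		.address           = gva,
		.nested_page_fault = false,
	};

	/* Routes the fault to the L1 or L2 MMU as appropriate. */
	kvm_inject_emulated_page_fault(vcpu, &fault);
}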
Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-6-sean.j.christopherson@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 42a2d0d3984a..34ad7297b06b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1452,6 +1452,8 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 027dfd278a97..003e625367b7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -611,8 +611,11 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, + struct x86_exception *fault) { + WARN_ON_ONCE(fault->vector != PF_VECTOR); + if (mmu_is_nested(vcpu) && !fault->nested_page_fault) vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); else @@ -620,6 +623,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -6381,7 +6385,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, &ctxt->exception); + return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, -- cgit v1.2.3 From e9d7144597b10ff13ff2264c059f7d4a7fbc89ac Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:23:10 +0200 Subject: x86/cpu: Add a steppings field to struct x86_cpu_id Intel uses the same family/model for several CPUs. Sometimes the stepping must be checked to tell them apart. On x86 there can be at most 16 steppings. Add a steppings bitmask to x86_cpu_id and a X86_MATCH_VENDOR_FAMILY_MODEL_STEPPING_FEATURE macro and support for matching against family/model/stepping. [ bp: Massage. 
] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Reviewed-by: Josh Poimboeuf --- arch/x86/include/asm/cpu_device_id.h | 27 ++++++++++++++++++++++++--- arch/x86/kernel/cpu/match.c | 7 ++++++- include/linux/mod_devicetable.h | 2 ++ 3 files changed, 32 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index cf3d621c6892..10426cd56dca 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -20,12 +20,14 @@ #define X86_CENTAUR_FAM6_C7_D 0xd #define X86_CENTAUR_FAM6_NANO 0xf +#define X86_STEPPINGS(mins, maxs) GENMASK(maxs, mins) /** - * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Base macro for CPU matching + * X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE - Base macro for CPU matching * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY * The name is expanded to X86_VENDOR_@_vendor * @_family: The family number or X86_FAMILY_ANY * @_model: The model number, model constant or X86_MODEL_ANY + * @_steppings: Bitmask for steppings, stepping constant or X86_STEPPING_ANY * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY * @_data: Driver specific data or NULL. The internal storage * format is unsigned long. The supplied value, pointer @@ -37,15 +39,34 @@ * into another macro at the usage site for good reasons, then please * start this local macro with X86_MATCH to allow easy grepping. */ -#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(_vendor, _family, _model, \ - _feature, _data) { \ +#define X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(_vendor, _family, _model, \ + _steppings, _feature, _data) { \ .vendor = X86_VENDOR_##_vendor, \ .family = _family, \ .model = _model, \ + .steppings = _steppings, \ .feature = _feature, \ .driver_data = (unsigned long) _data \ } +/** + * X86_MATCH_VENDOR_FAM_MODEL_FEATURE - Macro for CPU matching + * @_vendor: The vendor name, e.g. INTEL, AMD, HYGON, ..., ANY + * The name is expanded to X86_VENDOR_@_vendor + * @_family: The family number or X86_FAMILY_ANY + * @_model: The model number, model constant or X86_MODEL_ANY + * @_feature: A X86_FEATURE bit or X86_FEATURE_ANY + * @_data: Driver specific data or NULL. The internal storage + * format is unsigned long. The supplied value, pointer + * etc. is casted to unsigned long internally. + * + * The steppings arguments of X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE() is + * set to wildcards. + */ +#define X86_MATCH_VENDOR_FAM_MODEL_FEATURE(vendor, family, model, feature, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(vendor, family, model, \ + X86_STEPPING_ANY, feature, data) + /** * X86_MATCH_VENDOR_FAM_FEATURE - Macro for matching vendor, family and CPU feature * @vendor: The vendor name, e.g. 
INTEL, AMD, HYGON, ..., ANY diff --git a/arch/x86/kernel/cpu/match.c b/arch/x86/kernel/cpu/match.c index d3482eb43ff3..ad6776081e60 100644 --- a/arch/x86/kernel/cpu/match.c +++ b/arch/x86/kernel/cpu/match.c @@ -39,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match) const struct x86_cpu_id *m; struct cpuinfo_x86 *c = &boot_cpu_data; - for (m = match; m->vendor | m->family | m->model | m->feature; m++) { + for (m = match; + m->vendor | m->family | m->model | m->steppings | m->feature; + m++) { if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor) continue; if (m->family != X86_FAMILY_ANY && c->x86 != m->family) continue; if (m->model != X86_MODEL_ANY && c->x86_model != m->model) continue; + if (m->steppings != X86_STEPPING_ANY && + !(BIT(c->x86_stepping) & m->steppings)) + continue; if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature)) continue; return m; diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 4c2ddd0941a7..0754b8d71262 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -663,6 +663,7 @@ struct x86_cpu_id { __u16 vendor; __u16 family; __u16 model; + __u16 steppings; __u16 feature; /* bit index */ kernel_ulong_t driver_data; }; @@ -671,6 +672,7 @@ struct x86_cpu_id { #define X86_VENDOR_ANY 0xffff #define X86_FAMILY_ANY 0 #define X86_MODEL_ANY 0 +#define X86_STEPPING_ANY 0 #define X86_FEATURE_ANY 0 /* Same as FPU, you can't test for that */ /* -- cgit v1.2.3 From 7e5b3c267d256822407a22fdce6afdf9cd13f9fb Mon Sep 17 00:00:00 2001 From: Mark Gross Date: Thu, 16 Apr 2020 17:54:04 +0200 Subject: x86/speculation: Add Special Register Buffer Data Sampling (SRBDS) mitigation SRBDS is an MDS-like speculative side channel that can leak bits from the random number generator (RNG) across cores and threads. New microcode serializes the processor access during the execution of RDRAND and RDSEED. This ensures that the shared buffer is overwritten before it is released for reuse. While it is present on all affected CPU models, the microcode mitigation is not needed on models that enumerate ARCH_CAPABILITIES[MDS_NO] in the cases where TSX is not supported or has been disabled with TSX_CTRL. The mitigation is activated by default on affected processors and it increases latency for RDRAND and RDSEED instructions. Among other effects this will reduce throughput from /dev/urandom. * Enable administrator to configure the mitigation off when desired using either mitigations=off or srbds=off. * Export vulnerability status via sysfs * Rename file-scoped macros to apply for non-whitelist table initializations. [ bp: Massage, - s/VULNBL_INTEL_STEPPING/VULNBL_INTEL_STEPPINGS/g, - do not read arch cap MSR a second time in tsx_fused_off() - just pass it in, - flip check in cpu_set_bug_bits() to save an indentation level, - reflow comments. 
jpoimboe: s/Mitigated/Mitigation/ in user-visible strings tglx: Dropped the fused off magic for now ] Signed-off-by: Mark Gross Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Reviewed-by: Pawan Gupta Reviewed-by: Josh Poimboeuf Tested-by: Neelima Krishnan --- Documentation/ABI/testing/sysfs-devices-system-cpu | 1 + Documentation/admin-guide/kernel-parameters.txt | 20 ++++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/msr-index.h | 4 + arch/x86/kernel/cpu/bugs.c | 106 +++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 31 ++++++ arch/x86/kernel/cpu/cpu.h | 1 + drivers/base/cpu.c | 8 ++ 8 files changed, 173 insertions(+) (limited to 'arch/x86/include') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 2e0e3b45d02a..b39531a3c5bc 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -492,6 +492,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/spec_store_bypass /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds + /sys/devices/system/cpu/vulnerabilities/srbds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f2a93c8679e8..f720463bd918 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4757,6 +4757,26 @@ the kernel will oops in either "warn" or "fatal" mode. + srbds= [X86,INTEL] + Control the Special Register Buffer Data Sampling + (SRBDS) mitigation. + + Certain CPUs are vulnerable to an MDS-like + exploit which can leak bits from the random + number generator. + + By default, this issue is mitigated by + microcode. However, the microcode fix can cause + the RDRAND and RDSEED instructions to become + much slower. Among other effects, this will + result in reduced throughput from /dev/urandom. 
+ + The microcode mitigation can be disabled with + the following option: + + off: Disable mitigation and remove + performance impact to RDRAND and RDSEED + srcutree.counter_wrap_check [KNL] Specifies how frequently to check for grace-period sequence counter wrap for the diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index db189945e9b0..02dabc9e77b0 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -362,6 +362,7 @@ #define X86_FEATURE_AVX512_4FMAPS (18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */ #define X86_FEATURE_FSRM (18*32+ 4) /* Fast Short Rep Mov */ #define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */ +#define X86_FEATURE_SRBDS_CTRL (18*32+ 9) /* "" SRBDS mitigation MSR available */ #define X86_FEATURE_MD_CLEAR (18*32+10) /* VERW clears CPU buffers */ #define X86_FEATURE_TSX_FORCE_ABORT (18*32+13) /* "" TSX_FORCE_ABORT */ #define X86_FEATURE_PCONFIG (18*32+18) /* Intel PCONFIG */ @@ -407,5 +408,6 @@ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ #define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ +#define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 12c9684d59ba..3efde600a674 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -128,6 +128,10 @@ #define TSX_CTRL_RTM_DISABLE BIT(0) /* Disable RTM feature */ #define TSX_CTRL_CPUID_CLEAR BIT(1) /* Disable TSX enumeration */ +/* SRBDS support */ +#define MSR_IA32_MCU_OPT_CTRL 0x00000123 +#define RNGDS_MITG_DIS BIT(0) + #define MSR_IA32_SYSENTER_CS 0x00000174 #define MSR_IA32_SYSENTER_ESP 0x00000175 #define MSR_IA32_SYSENTER_EIP 0x00000176 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ed54b3b21c39..56978cb06149 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init mds_print_mitigation(void); static void __init taa_select_mitigation(void); +static void __init srbds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR that always has to be preserved. 
*/ u64 x86_spec_ctrl_base; @@ -108,6 +109,7 @@ void __init check_bugs(void) l1tf_select_mitigation(); mds_select_mitigation(); taa_select_mitigation(); + srbds_select_mitigation(); /* * As MDS and TAA mitigations are inter-related, print MDS @@ -397,6 +399,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str) } early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); +#undef pr_fmt +#define pr_fmt(fmt) "SRBDS: " fmt + +enum srbds_mitigations { + SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_UCODE_NEEDED, + SRBDS_MITIGATION_FULL, + SRBDS_MITIGATION_TSX_OFF, + SRBDS_MITIGATION_HYPERVISOR, +}; + +static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; + +static const char * const srbds_strings[] = { + [SRBDS_MITIGATION_OFF] = "Vulnerable", + [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", + [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", + [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", + [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", +}; + +static bool srbds_off; + +void update_srbds_msr(void) +{ + u64 mcu_ctrl; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) + return; + + rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + + switch (srbds_mitigation) { + case SRBDS_MITIGATION_OFF: + case SRBDS_MITIGATION_TSX_OFF: + mcu_ctrl |= RNGDS_MITG_DIS; + break; + case SRBDS_MITIGATION_FULL: + mcu_ctrl &= ~RNGDS_MITG_DIS; + break; + default: + break; + } + + wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); +} + +static void __init srbds_select_mitigation(void) +{ + u64 ia32_cap; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return; + + /* + * Check to see if this is one of the MDS_NO systems supporting + * TSX that are only exposed to SRBDS when TSX is enabled. 
+ */ + ia32_cap = x86_read_arch_cap_msr(); + if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM)) + srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; + else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; + else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) + srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; + else if (cpu_mitigations_off() || srbds_off) + srbds_mitigation = SRBDS_MITIGATION_OFF; + + update_srbds_msr(); + pr_info("%s\n", srbds_strings[srbds_mitigation]); +} + +static int __init srbds_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + return 0; + + srbds_off = !strcmp(str, "off"); + return 0; +} +early_param("srbds", srbds_parse_cmdline); + #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt @@ -1528,6 +1621,11 @@ static char *ibpb_state(void) return ""; } +static ssize_t srbds_show_state(char *buf) +{ + return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]); +} + static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { @@ -1572,6 +1670,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); + case X86_BUG_SRBDS: + return srbds_show_state(buf); + default: break; } @@ -1618,4 +1719,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } + +ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1131ae032bf2..8293ee514975 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1075,6 +1075,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { {} }; +#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \ + INTEL_FAM6_##model, steppings, \ + X86_FEATURE_ANY, issues) + +#define SRBDS BIT(0) + +static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { + VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS), + {} +}; + static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which) { const struct x86_cpu_id *m = x86_match_cpu(table); @@ -1142,6 +1163,15 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) (ia32_cap & ARCH_CAP_TSX_CTRL_MSR))) setup_force_cpu_bug(X86_BUG_TAA); + /* + * SRBDS affects CPUs which support RDRAND or RDSEED and are listed + * in the vulnerability blacklist. 
+ */ + if ((cpu_has(c, X86_FEATURE_RDRAND) || + cpu_has(c, X86_FEATURE_RDSEED)) && + cpu_matches(cpu_vuln_blacklist, SRBDS)) + setup_force_cpu_bug(X86_BUG_SRBDS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1594,6 +1624,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); validate_apic_and_package_id(c); x86_spec_ctrl_setup_ap(); + update_srbds_msr(); } static __init int setup_noclflush(char *arg) diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 37fdefd14f28..fb538fccd24c 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c); unsigned int aperfmperf_get_khz(int cpu); extern void x86_spec_ctrl_setup_ap(void); +extern void update_srbds_msr(void); extern u64 x86_read_arch_cap_msr(void); diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 9a1c00fbbaef..d2136ab9b14a 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -562,6 +562,12 @@ ssize_t __weak cpu_show_itlb_multihit(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_srbds(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -570,6 +576,7 @@ static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); +static DEVICE_ATTR(srbds, 0444, cpu_show_srbds, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -580,6 +587,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_mds.attr, &dev_attr_tsx_async_abort.attr, &dev_attr_itlb_multihit.attr, + &dev_attr_srbds.attr, NULL }; -- cgit v1.2.3 From 1f6f655e01adebf5bd5e6c3da2e843c104ded051 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Apr 2020 17:27:42 +0200 Subject: x86/mm: Add a x86_has_pat_wp() helper Abstract the ioremap code away from the caching mode internals. 
Signed-off-by: Christoph Hellwig Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200408152745.1565832-2-hch@lst.de --- arch/x86/include/asm/memtype.h | 2 ++ arch/x86/mm/init.c | 6 ++++++ arch/x86/mm/ioremap.c | 8 ++------ 3 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h index 9c2447b3555d..1e4e99b40711 100644 --- a/arch/x86/include/asm/memtype.h +++ b/arch/x86/include/asm/memtype.h @@ -24,4 +24,6 @@ extern void memtype_free_io(resource_size_t start, resource_size_t end); extern bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn); +bool x86_has_pat_wp(void); + #endif /* _ASM_X86_MEMTYPE_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 1bba16c5742b..6005f83b8111 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -71,6 +71,12 @@ uint8_t __pte2cachemode_tbl[8] = { }; EXPORT_SYMBOL(__pte2cachemode_tbl); +/* Check that the write-protect PAT entry is set for write-protect */ +bool x86_has_pat_wp(void) +{ + return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP; +} + static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 18c637c0dc6f..41536f523a5f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -778,10 +778,8 @@ void __init *early_memremap_encrypted(resource_size_t phys_addr, void __init *early_memremap_encrypted_wp(resource_size_t phys_addr, unsigned long size) { - /* Be sure the write-protect PAT entry is set for write-protect */ - if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + if (!x86_has_pat_wp()) return NULL; - return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_ENC_WP); } @@ -799,10 +797,8 @@ void __init *early_memremap_decrypted(resource_size_t phys_addr, void __init *early_memremap_decrypted_wp(resource_size_t phys_addr, unsigned long size) { - /* Be sure the write-protect PAT entry is set for write-protect */ - if (__pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] != _PAGE_CACHE_MODE_WP) + if (!x86_has_pat_wp()) return NULL; - return early_memremap_prot(phys_addr, size, __PAGE_KERNEL_NOENC_WP); } #endif /* CONFIG_AMD_MEM_ENCRYPT */ -- cgit v1.2.3 From 7fa3e10f0f3646108a1018004d0f571c3222dc9f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Apr 2020 17:27:43 +0200 Subject: x86/mm: Move pgprot2cachemode out of line This helper is only used by x86 low-level MM code. Also remove the entirely pointless __pte2cachemode_tbl export as that symbol can be marked static now. 
Signed-off-by: Christoph Hellwig Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200408152745.1565832-3-hch@lst.de --- arch/x86/include/asm/memtype.h | 1 + arch/x86/include/asm/pgtable_types.h | 10 ---------- arch/x86/mm/init.c | 13 +++++++++++-- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/memtype.h b/arch/x86/include/asm/memtype.h index 1e4e99b40711..9ca760e430b9 100644 --- a/arch/x86/include/asm/memtype.h +++ b/arch/x86/include/asm/memtype.h @@ -25,5 +25,6 @@ extern void memtype_free_io(resource_size_t start, resource_size_t end); extern bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn); bool x86_has_pat_wp(void); +enum page_cache_mode pgprot2cachemode(pgprot_t pgprot); #endif /* _ASM_X86_MEMTYPE_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b6606fe6cfdf..75fe903124f8 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -468,7 +468,6 @@ static inline pteval_t pte_flags(pte_t pte) } extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; -extern uint8_t __pte2cachemode_tbl[8]; #define __pte2cm_idx(cb) \ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ @@ -489,15 +488,6 @@ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); } -static inline enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) -{ - unsigned long masked; - - masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; - if (likely(masked == 0)) - return 0; - return __pte2cachemode_tbl[__pte2cm_idx(masked)]; -} static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { pgprotval_t val = pgprot_val(pgprot); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6005f83b8111..4a55d687c246 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -59,7 +59,7 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { }; EXPORT_SYMBOL(__cachemode2pte_tbl); -uint8_t __pte2cachemode_tbl[8] = { +static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, [__pte2cm_idx(_PAGE_PWT | 0 | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx( 0 | _PAGE_PCD | 0 )] = _PAGE_CACHE_MODE_UC_MINUS, @@ -69,7 +69,6 @@ uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx(0 | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS, [__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC, }; -EXPORT_SYMBOL(__pte2cachemode_tbl); /* Check that the write-protect PAT entry is set for write-protect */ bool x86_has_pat_wp(void) @@ -77,6 +76,16 @@ bool x86_has_pat_wp(void) return __pte2cachemode_tbl[_PAGE_CACHE_MODE_WP] == _PAGE_CACHE_MODE_WP; } +enum page_cache_mode pgprot2cachemode(pgprot_t pgprot) +{ + unsigned long masked; + + masked = pgprot_val(pgprot) & _PAGE_CACHE_MASK; + if (likely(masked == 0)) + return 0; + return __pte2cachemode_tbl[__pte2cm_idx(masked)]; +} + static unsigned long __initdata pgt_buf_start; static unsigned long __initdata pgt_buf_end; static unsigned long __initdata pgt_buf_top; -- cgit v1.2.3 From 9d9e435f3f2492bfd196acacb61cc9a9212d8170 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 26 Mar 2020 23:48:15 -0700 Subject: x86/elf: Add table to document READ_IMPLIES_EXEC Add a table to document the current behavior of READ_IMPLIES_EXEC in preparation for changing the behavior. 
Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Reviewed-by: Jason Gunthorpe Link: https://lkml.kernel.org/r/20200327064820.12602-2-keescook@chromium.org --- arch/x86/include/asm/elf.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 69c0f892e310..ee459d4c3b45 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -281,6 +281,25 @@ extern u32 elf_hwcap2; /* * An executable for which elf_read_implies_exec() returns TRUE will * have the READ_IMPLIES_EXEC personality flag set automatically. + * + * The decision process for determining the results are: + * + *              CPU: | lacks NX*  | has NX, ia32     | has NX, x86_64 | + * ELF:              |            |                  |                | + * ---------------------|------------|------------------|----------------| + * missing PT_GNU_STACK | exec-all   | exec-all         | exec-all       | + * PT_GNU_STACK == RWX  | exec-all   | exec-all         | exec-all       | + * PT_GNU_STACK == RW   | exec-none  | exec-none        | exec-none      | + * + * exec-all : all PROT_READ user mappings are executable, except when + * backed by files on a noexec-filesystem. + * exec-none : only PROT_EXEC user mappings are executable. + * + * *this column has no architectural effect: NX markings are ignored by + * hardware, but may have behavioral effects when "wants X" collides with + * "cannot be X" constraints in memory permission flags, as in + * https://lkml.kernel.org/r/20190418055759.GA3155@mellanox.com + * */ #define elf_read_implies_exec(ex, executable_stack) \ (executable_stack != EXSTACK_DISABLE_X) -- cgit v1.2.3 From 122306117afe4ba202b5e57c61dfbeffc5c41387 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 26 Mar 2020 23:48:16 -0700 Subject: x86/elf: Split READ_IMPLIES_EXEC from executable PT_GNU_STACK The READ_IMPLIES_EXEC workaround was designed for old toolchains that lacked the ELF PT_GNU_STACK marking under the assumption that toolchains that couldn't specify executable permission flags for the stack may not know how to do it correctly for any memory region. This logic is sensible for having ancient binaries coexist in a system with possibly NX memory, but was implemented in a way that equated having a PT_GNU_STACK marked executable as being as "broken" as lacking the PT_GNU_STACK marking entirely. Things like unmarked assembly and stack trampolines may cause PT_GNU_STACK to need an executable bit, but they do not imply all mappings must be executable. This confusion has led to situations where modern programs with explicitly marked executable stacks are forced into the READ_IMPLIES_EXEC state when no such thing is needed. (And leads to unexpected failures when mmap()ing regions of device driver memory that wish to disallow VM_EXEC[1].) In looking for other reasons for the READ_IMPLIES_EXEC behavior, Jann Horn noted that glibc thread stacks have always been marked RWX (until 2003 when they started tracking the PT_GNU_STACK flag instead[2]). And musl doesn't support executable stacks at all[3]. As such, no breakage for multithreaded applications is expected from this change. 
[1] https://lkml.kernel.org/r/20190418055759.GA3155@mellanox.com [2] https://sourceware.org/git/?p=glibc.git;a=commitdiff;h=54ee14b3882 [3] https://lkml.kernel.org/r/20190423192534.GN23599@brightrain.aerifal.cx Suggested-by: Hector Marco-Gisbert Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Reviewed-by: Jason Gunthorpe Link: https://lkml.kernel.org/r/20200327064820.12602-3-keescook@chromium.org --- arch/x86/include/asm/elf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ee459d4c3b45..397a1c74433e 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -288,12 +288,13 @@ extern u32 elf_hwcap2; * ELF:              |            |                  |                | * ---------------------|------------|------------------|----------------| * missing PT_GNU_STACK | exec-all   | exec-all         | exec-all       | - * PT_GNU_STACK == RWX  | exec-all   | exec-all         | exec-all       | + * PT_GNU_STACK == RWX  | exec-stack | exec-stack       | exec-stack     | * PT_GNU_STACK == RW   | exec-none  | exec-none        | exec-none      | * * exec-all : all PROT_READ user mappings are executable, except when * backed by files on a noexec-filesystem. * exec-none : only PROT_EXEC user mappings are executable. + * exec-stack: only the stack and PROT_EXEC user mappings are executable. * * *this column has no architectural effect: NX markings are ignored by * hardware, but may have behavioral effects when "wants X" collides with @@ -302,7 +303,7 @@ extern u32 elf_hwcap2; * */ #define elf_read_implies_exec(ex, executable_stack) \ - (executable_stack != EXSTACK_DISABLE_X) + (executable_stack == EXSTACK_DEFAULT) struct task_struct; -- cgit v1.2.3 From 9fccc5c0c99f238aa1b0460fccbdb30a887e7036 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 26 Mar 2020 23:48:17 -0700 Subject: x86/elf: Disable automatic READ_IMPLIES_EXEC on 64-bit With modern x86 64-bit environments, there should never be a need for automatic READ_IMPLIES_EXEC, as the architecture is intended to always be execute-bit aware (as in, the default memory protection should be NX unless a region explicitly requests to be executable). There were very old x86_64 systems that lacked the NX bit, but for those, the NX bit is, obviously, unenforceable, so these changes should have no impact on them. 
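As a sketch (not an additional change): the policy this series converges on, written out as a plain predicate. It paraphrases the final elf_read_implies_exec() definition in the hunk below.

/*
 * Sketch of the resulting policy: only 32-bit binaries that lack a
 * PT_GNU_STACK program header (EXSTACK_DEFAULT) still get
 * READ_IMPLIES_EXEC; 64-bit binaries never do.
 */
static bool sketch_read_implies_exec(bool is_ia32_task, int executable_stack)
{
	if (!is_ia32_task)
		return false;

	return executable_stack == EXSTACK_DEFAULT;
}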
Suggested-by: Hector Marco-Gisbert Signed-off-by: Kees Cook Signed-off-by: Borislav Petkov Reviewed-by: Jason Gunthorpe Link: https://lkml.kernel.org/r/20200327064820.12602-4-keescook@chromium.org --- arch/x86/include/asm/elf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 397a1c74433e..452beed7892b 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -287,7 +287,7 @@ extern u32 elf_hwcap2; *              CPU: | lacks NX*  | has NX, ia32     | has NX, x86_64 | * ELF:              |            |                  |                | * ---------------------|------------|------------------|----------------| - * missing PT_GNU_STACK | exec-all   | exec-all         | exec-all       | + * missing PT_GNU_STACK | exec-all   | exec-all         | exec-none      | * PT_GNU_STACK == RWX  | exec-stack | exec-stack       | exec-stack     | * PT_GNU_STACK == RW   | exec-none  | exec-none        | exec-none      | * @@ -303,7 +303,7 @@ extern u32 elf_hwcap2; * */ #define elf_read_implies_exec(ex, executable_stack) \ - (executable_stack == EXSTACK_DEFAULT) + (mmap_is_ia32() && executable_stack == EXSTACK_DEFAULT) struct task_struct; -- cgit v1.2.3 From 5efac0741ce238e0844d3f7af00198f81e84926a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 23 Mar 2020 20:42:57 -0400 Subject: KVM: x86: introduce kvm_mmu_invalidate_gva Wrap the combination of mmu->invlpg and kvm_x86_ops->tlb_flush_gva into a new function. This function also lets us specify the host PGD to invalidate and also the MMU, both of which will be useful in fixing and simplifying kvm_inject_emulated_page_fault. A nested guest's MMU however has g_context->invlpg == NULL. Instead of setting it to nonpaging_invlpg, make kvm_mmu_invalidate_gva the only entry point to mmu->invlpg and make a NULL invlpg pointer equivalent to nonpaging_invlpg, saving a retpoline. 
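The message hints at how the helper will be consumed. A hypothetical future caller (an assumption based on the stated motivation, not code from this patch) could invalidate the faulting GVA in whichever MMU took an emulated fault, roughly:

/*
 * Hypothetical sketch of a follow-up user: sync the faulting GVA in the
 * MMU that took the emulated fault, using its current root. Not part of
 * this patch.
 */
static void sketch_sync_gva_on_emulated_fault(struct kvm_vcpu *vcpu,
					      struct kvm_mmu *fault_mmu,
					      gva_t gva)
{
	kvm_mmu_invalidate_gva(vcpu, fault_mmu, gva, fault_mmu->root_hpa);
}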
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/mmu/mmu.c | 71 ++++++++++++++++++++++++++--------------- 2 files changed, 47 insertions(+), 26 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 34ad7297b06b..8c1e094a639f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1511,6 +1511,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); +void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8071952e9cf2..84eeaa4ea149 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -2153,10 +2153,6 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 0; } -static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root) -{ -} - static void nonpaging_update_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte) @@ -4237,7 +4233,7 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->page_fault = nonpaging_page_fault; context->gva_to_gpa = nonpaging_gva_to_gpa; context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; + context->invlpg = NULL; context->update_pte = nonpaging_update_pte; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; @@ -4928,7 +4924,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->mmu_role.as_u64 = new_role.as_u64; context->page_fault = kvm_tdp_page_fault; context->sync_page = nonpaging_sync_page; - context->invlpg = nonpaging_invlpg; + context->invlpg = NULL; context->update_pte = nonpaging_update_pte; context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); context->direct_map = true; @@ -5096,6 +5092,12 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; + /* + * L2 page tables are never shadowed, so there is no need to sync + * SPTEs. + */ + g_context->invlpg = NULL; + /* * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using * L1's nested page tables (e.g. EPT12). The nested translation @@ -5497,37 +5499,54 @@ emulate: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); -void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + gva_t gva, hpa_t root_hpa) { - struct kvm_mmu *mmu = vcpu->arch.mmu; int i; - /* INVLPG on a * non-canonical address is a NOP according to the SDM. */ - if (is_noncanonical_address(gva, vcpu)) + /* It's actually a GPA for vcpu->arch.guest_mmu. */ + if (mmu != &vcpu->arch.guest_mmu) { + /* INVLPG on a non-canonical address is a NOP according to the SDM. */ + if (is_noncanonical_address(gva, vcpu)) + return; + + kvm_x86_ops.tlb_flush_gva(vcpu, gva); + } + + if (!mmu->invlpg) return; - mmu->invlpg(vcpu, gva, mmu->root_hpa); + if (root_hpa == INVALID_PAGE) { + mmu->invlpg(vcpu, gva, mmu->root_hpa); - /* - * INVLPG is required to invalidate any global mappings for the VA, - * irrespective of PCID. 
Since it would take us roughly similar amount - * of work to determine whether any of the prev_root mappings of the VA - * is marked global, or to just sync it blindly, so we might as well - * just always sync it. - * - * Mappings not reachable via the current cr3 or the prev_roots will be - * synced when switching to that cr3, so nothing needs to be done here - * for them. - */ - for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (VALID_PAGE(mmu->prev_roots[i].hpa)) - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + /* + * INVLPG is required to invalidate any global mappings for the VA, + * irrespective of PCID. Since it would take us roughly similar amount + * of work to determine whether any of the prev_root mappings of the VA + * is marked global, or to just sync it blindly, so we might as well + * just always sync it. + * + * Mappings not reachable via the current cr3 or the prev_roots will be + * synced when switching to that cr3, so nothing needs to be done here + * for them. + */ + for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) + if (VALID_PAGE(mmu->prev_roots[i].hpa)) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + } else { + mmu->invlpg(vcpu, gva, root_hpa); + } +} +EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_gva); - kvm_x86_ops.tlb_flush_gva(vcpu, gva); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE); ++vcpu->stat.invlpg; } EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) { struct kvm_mmu *mmu = vcpu->arch.mmu; -- cgit v1.2.3 From e64419d991ea212af087d3c57fcabb4d27db03fc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:10 -0700 Subject: KVM: x86: Move "flush guest's TLB" logic to separate kvm_x86_ops hook Add a dedicated hook to handle flushing TLB entries on behalf of the guest, i.e. for a paravirtualized TLB flush, and use it directly instead of bouncing through kvm_vcpu_flush_tlb(). For VMX, change the effective implementation to never do INVEPT and flush only the current context, i.e. to always flush via INVVPID(SINGLE_CONTEXT). The INVEPT performed by __vmx_flush_tlb() when @invalidate_gpa=false and enable_vpid=0 is unnecessary, as it will only flush guest-physical mappings; linear and combined mappings are flushed by VM-Enter when VPID is disabled, and changes in the guest page tables do not affect guest-physical mappings. When EPT and VPID are enabled, doing INVVPID is not required (by Intel's architecture) to invalidate guest-physical mappings, i.e. TLB entries that cache guest-physical mappings can live across INVVPID as the mappings are associated with an EPTP, not a VPID. The intent of @invalidate_gpa is to inform vmx_flush_tlb() that it must "invalidate gpa mappings", i.e. do INVEPT and not simply INVVPID. Other than nested VPID handling, which now calls vpid_sync_context() directly, the only scenario where KVM can safely do INVVPID instead of INVEPT (when EPT is enabled) is if KVM is flushing TLB entries from the guest's perspective, i.e. is only required to invalidate linear mappings. For SVM, flushing TLB entries from the guest's perspective can be done by flushing the current ASID, as changes to the guest's page tables are associated only with the current ASID.
Adding a dedicated ->tlb_flush_guest() paves the way toward removing @invalidate_gpa, which is a potentially dangerous control flag as its meaning is not exactly crystal clear, even for those who are familiar with the subtleties of what mappings Intel CPUs are/aren't allowed to keep across various invalidation scenarios. Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-15-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/svm/svm.c | 6 ++++++ arch/x86/kvm/vmx/vmx.c | 13 +++++++++++++ arch/x86/kvm/x86.c | 2 +- 4 files changed, 26 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8c1e094a639f..a7c1ea136c8c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1114,6 +1114,12 @@ struct kvm_x86_ops { */ void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr); + /* + * Flush any TLB entries created by the guest. Like tlb_flush_gva(), + * does not need to flush GPA->HPA mappings. + */ + void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); + void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2f379bacbb26..2c0fcb87a054 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3170,6 +3170,11 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) invlpga(gva, svm->vmcb->control.asid); } +static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + svm_flush_tlb(vcpu, false); +} + static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) { } @@ -3939,6 +3944,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .tlb_flush = svm_flush_tlb, .tlb_flush_gva = svm_flush_tlb_gva, + .tlb_flush_guest = svm_flush_tlb_guest, .run = svm_vcpu_run, .handle_exit = handle_exit, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d973ff095066..094acb53829b 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2851,6 +2851,18 @@ static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr) */ } +static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + /* + * vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0 + * or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit + * are required to flush GVA->{G,H}PA mappings from the TLB if vpid is + * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed), + * i.e. no explicit INVVPID is necessary. 
+ */ + vpid_sync_context(to_vmx(vcpu)->vpid); +} + static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; @@ -7718,6 +7730,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .tlb_flush = vmx_flush_tlb, .tlb_flush_gva = vmx_flush_tlb_gva, + .tlb_flush_guest = vmx_flush_tlb_guest, .run = vmx_vcpu_run, .handle_exit = vmx_handle_exit, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3984574e09bf..c7ad142b511f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2719,7 +2719,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_vcpu_flush_tlb(vcpu, false); + kvm_x86_ops.tlb_flush_guest(vcpu); vcpu->arch.st.preempted = 0; -- cgit v1.2.3 From 2ddddd0b4e89e1fc30ed257653239005d2f31f5b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 20 Apr 2020 09:49:26 -0700 Subject: Drivers: hv: Move AEOI determination to architecture dependent code Hyper-V on ARM64 doesn't provide a flag for the AEOI recommendation in ms_hyperv.hints, so having the test in architecture independent code doesn't work. Resolve this by moving the check of the flag to an architecture dependent helper function. No functionality is changed. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20200420164926.24471-1-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/include/asm/mshyperv.h | 2 ++ drivers/hv/hv.c | 6 +----- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 6b79515abb82..7c2bbd6675dc 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -34,6 +34,8 @@ typedef int (*hyperv_fill_flush_list_func)( rdmsrl(HV_X64_MSR_SINT0 + int_num, val) #define hv_set_synint_state(int_num, val) \ wrmsrl(HV_X64_MSR_SINT0 + int_num, val) +#define hv_recommend_using_aeoi() \ + (!(ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)) #define hv_get_crash_ctl(val) \ rdmsrl(HV_X64_MSR_CRASH_CTL, val) diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 6098e0cbdb4b..533c8b82b344 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -184,11 +184,7 @@ void hv_synic_enable_regs(unsigned int cpu) shared_sint.vector = HYPERVISOR_CALLBACK_VECTOR; shared_sint.masked = false; - if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED) - shared_sint.auto_eoi = false; - else - shared_sint.auto_eoi = true; - + shared_sint.auto_eoi = hv_recommend_using_aeoi(); hv_set_synint_state(VMBUS_MESSAGE_SINT, shared_sint.as_uint64); /* Enable the global synic bit */ -- cgit v1.2.3 From 0baedd792713063213f1e2060dc6a5d536638f0a Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 25 Mar 2020 12:28:24 -0400 Subject: KVM: x86: make Hyper-V PV TLB flush use tlb_flush_guest() Hyper-V PV TLB flush mechanism does TLB flush on behalf of the guest so doing tlb_flush_all() is an overkill, switch to using tlb_flush_guest() (just like KVM PV TLB flush mechanism) instead. Introduce KVM_REQ_HV_TLB_FLUSH to support the change. 
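In sketch form, the new request follows the usual KVM request-bit pattern; the producer and consumer sides (taken from the diff below) are roughly:

    /* producer: kvm_hv_flush_tlb() */
    kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH,
                                vcpu_mask, &hv_vcpu->tlb_flush);

    /* consumer: vcpu_enter_guest() */
    if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
            kvm_vcpu_flush_tlb_guest(vcpu);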
Signed-off-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/hyperv.c | 3 +-- arch/x86/kvm/x86.c | 10 +++++++++- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a7c1ea136c8c..9951b01df57c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -83,6 +83,8 @@ #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HV_TLB_FLUSH \ + KVM_ARCH_REQ_FLAGS(26, KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index bcefa9d4e57e..b850f676abe4 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1425,8 +1425,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, * vcpu->arch.cr3 may not be up-to-date for running vCPUs so we can't * analyze it here, flush TLB regardless of the specified address space. */ - kvm_make_vcpus_request_mask(kvm, - KVM_REQ_TLB_FLUSH | KVM_REQUEST_NO_WAKEUP, + kvm_make_vcpus_request_mask(kvm, KVM_REQ_HV_TLB_FLUSH, vcpu_mask, &hv_vcpu->tlb_flush); ret_success: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c7ad142b511f..a7df68af65e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2696,6 +2696,12 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); } +static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_guest(vcpu); +} + static void record_steal_time(struct kvm_vcpu *vcpu) { struct kvm_host_map map; @@ -2719,7 +2725,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st->preempted & KVM_VCPU_FLUSH_TLB); if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB) - kvm_x86_ops.tlb_flush_guest(vcpu); + kvm_vcpu_flush_tlb_guest(vcpu); vcpu->arch.st.preempted = 0; @@ -8218,6 +8224,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_load_pgd(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb(vcpu, true); + if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) + kvm_vcpu_flush_tlb_guest(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; -- cgit v1.2.3 From f55ac304ca47039368a5971fa61ebc8160c90659 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:12 -0700 Subject: KVM: x86: Drop @invalidate_gpa param from kvm_x86_ops' tlb_flush() Drop @invalidate_gpa from ->tlb_flush() and kvm_vcpu_flush_tlb() now that all callers pass %true for said param, or ignore the param (SVM has an internal call to svm_flush_tlb() in svm_flush_tlb_guest that somewhat arbitrarily passes %false). Remove __vmx_flush_tlb() as it is no longer used. No functional change intended. 
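After the cleanup, the hook and its common-code caller reduce to roughly the following (condensed from the diff below):

    void (*tlb_flush)(struct kvm_vcpu *vcpu);

    static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
    {
            ++vcpu->stat.tlb_flush;
            kvm_x86_ops.tlb_flush(vcpu);
    }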
Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-17-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 6 +++--- arch/x86/kvm/svm/svm.h | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/vmx/vmx.h | 42 ++++++++++++----------------------------- arch/x86/kvm/x86.c | 6 +++--- 8 files changed, 24 insertions(+), 42 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9951b01df57c..7156c749677a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1103,7 +1103,7 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); + void (*tlb_flush)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 32c9f4b2a281..081f6b220b6e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5179,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_pgd(vcpu); - kvm_x86_ops.tlb_flush(vcpu, true); + kvm_x86_ops.tlb_flush(vcpu); out: return r; } diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 90a1ca939627..c62893102777 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -279,7 +279,7 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; svm->nested.intercept = nested_vmcb->control.intercept; - svm_flush_tlb(&svm->vcpu, true); + svm_flush_tlb(&svm->vcpu); svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 2c0fcb87a054..aafd18589fda 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1603,7 +1603,7 @@ int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu, true); + svm_flush_tlb(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@ -3153,7 +3153,7 @@ static int svm_set_identity_map_addr(struct kvm *kvm, u64 ident_addr) return 0; } -void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +void svm_flush_tlb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3172,7 +3172,7 @@ static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) static void svm_flush_tlb_guest(struct kvm_vcpu *vcpu) { - svm_flush_tlb(vcpu, false); + svm_flush_tlb(vcpu); } static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index df3474f4fb02..ca95204f9dde 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -360,7 +360,7 @@ u32 svm_msrpm_offset(u32 msr); void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer); void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); -void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); +void svm_flush_tlb(struct kvm_vcpu *vcpu); void disable_nmi_singlestep(struct vcpu_svm *svm); /* nested.c */ diff --git 
a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 8b55f43433a4..d843ef34aafd 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6083,7 +6083,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) if (flexpriority_enabled) { sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; - vmx_flush_tlb(vcpu, true); + vmx_flush_tlb(vcpu); } break; case LAPIC_MODE_X2APIC: @@ -6101,7 +6101,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) { if (!is_guest_mode(vcpu)) { vmcs_write64(APIC_ACCESS_ADDR, hpa); - vmx_flush_tlb(vcpu, true); + vmx_flush_tlb(vcpu); } } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 1560296dde25..45978552524e 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -500,46 +500,28 @@ static inline struct vmcs *alloc_vmcs(bool shadow) u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa); -static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, - bool invalidate_gpa) -{ - if (enable_ept && (invalidate_gpa || !enable_vpid)) { - if (!VALID_PAGE(vcpu->arch.mmu->root_hpa)) - return; - ept_sync_context(construct_eptp(vcpu, - vcpu->arch.mmu->root_hpa)); - } else { - vpid_sync_context(vpid); - } -} - -static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * Flush all EPTP/VPID contexts if the TLB flush _may_ have been - * invoked via kvm_flush_remote_tlbs(), which always passes %true for - * @invalidate_gpa. Flushing remote TLBs requires all contexts to be - * flushed, not just the active context. + * Flush all EPTP/VPID contexts, as the TLB flush _may_ have been + * invoked via kvm_flush_remote_tlbs(). Flushing remote TLBs requires + * all contexts to be flushed, not just the active context. * * Note, this also ensures a deferred TLB flush with VPID enabled and * EPT disabled invalidates the "correct" VPID, by nuking both L1 and * L2's VPIDs. 
*/ - if (invalidate_gpa) { - if (enable_ept) { - ept_sync_global(); - } else if (enable_vpid) { - if (cpu_has_vmx_invvpid_global()) { - vpid_sync_vcpu_global(); - } else { - vpid_sync_vcpu_single(vmx->vpid); - vpid_sync_vcpu_single(vmx->nested.vpid02); - } + if (enable_ept) { + ept_sync_global(); + } else if (enable_vpid) { + if (cpu_has_vmx_invvpid_global()) { + vpid_sync_vcpu_global(); + } else { + vpid_sync_vcpu_single(vmx->vpid); + vpid_sync_vcpu_single(vmx->nested.vpid02); } - } else { - __vmx_flush_tlb(vcpu, vmx->vpid, false); } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a7df68af65e5..daf16247870c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2690,10 +2690,10 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.time = 0; } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush(vcpu, invalidate_gpa); + kvm_x86_ops.tlb_flush(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -8223,7 +8223,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu, true); + kvm_vcpu_flush_tlb(vcpu); if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { -- cgit v1.2.3 From 7780938cc70b848650722762fa4c7496fa68f9ec Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:18 -0700 Subject: KVM: x86: Rename ->tlb_flush() to ->tlb_flush_all() Rename ->tlb_flush() to ->tlb_flush_all() in preparation for adding a new hook to flush only the current ASID/context. Opportunstically replace the comment in vmx_flush_tlb() that explains why it flushes all EPTP/VPID contexts with a comment explaining why it unconditionally uses INVEPT when EPT is enabled. I.e. rely on the "all" part of the name to clarify why it does global INVEPT/INVVPID. No functional change intended. 
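For reference, the renamed VMX full-flush path is, in condensed form (the fallback details are in the diff below):

    static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
    {
            if (enable_ept)
                    ept_sync_global();
            else if (enable_vpid)
                    vpid_sync_vcpu_global();        /* or per-VPID INVVPID for L1 and
                                                       L2 if global INVVPID is absent */
    }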
Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-23-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 16 +++++++--------- arch/x86/kvm/x86.c | 6 +++--- 5 files changed, 13 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7156c749677a..e6305d5e28fa 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1103,7 +1103,7 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*tlb_flush_all)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 081f6b220b6e..85e17a057094 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5179,7 +5179,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) if (r) goto out; kvm_mmu_load_pgd(vcpu); - kvm_x86_ops.tlb_flush(vcpu); + kvm_x86_ops.tlb_flush_all(vcpu); out: return r; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index fa5bb1b62059..72b976e64df9 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3944,7 +3944,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, - .tlb_flush = svm_flush_tlb, + .tlb_flush_all = svm_flush_tlb, .tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_guest = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0f1dfbae649f..da868846d96d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2838,18 +2838,16 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * Flush all EPTP/VPID contexts, as the TLB flush _may_ have been - * invoked via kvm_flush_remote_tlbs(). Flushing remote TLBs requires - * all contexts to be flushed, not just the active context. - * - * Note, this also ensures a deferred TLB flush with VPID enabled and - * EPT disabled invalidates the "correct" VPID, by nuking both L1 and - * L2's VPIDs. + * INVEPT must be issued when EPT is enabled, irrespective of VPID, as + * the CPU is not required to invalidate guest-physical mappings on + * VM-Entry, even if VPID is disabled. Guest-physical mappings are + * associated with the root EPT structure and not any particular VPID + * (INVVPID also isn't required to invalidate guest-physical mappings). 
*/ if (enable_ept) { ept_sync_global(); @@ -7765,7 +7763,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, - .tlb_flush = vmx_flush_tlb, + .tlb_flush_all = vmx_flush_tlb_all, .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index daf16247870c..cd2a3d01bffb 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2690,10 +2690,10 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.time = 0; } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) { ++vcpu->stat.tlb_flush; - kvm_x86_ops.tlb_flush(vcpu); + kvm_x86_ops.tlb_flush_all(vcpu); } static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) @@ -8223,7 +8223,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu); + kvm_vcpu_flush_tlb_all(vcpu); if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { -- cgit v1.2.3 From eeeb4f67a6cd437da1f5d1a20596cdc2d7b50551 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:20 -0700 Subject: KVM: x86: Introduce KVM_REQ_TLB_FLUSH_CURRENT to flush current ASID Add KVM_REQ_TLB_FLUSH_CURRENT to allow optimized TLB flushing of VMX's EPTP/VPID contexts[*] from the KVM MMU and/or in a deferred manner, e.g. to flush L2's context during nested VM-Enter. Convert KVM_REQ_TLB_FLUSH to KVM_REQ_TLB_FLUSH_CURRENT in flows where the flush is directly associated with vCPU-scoped instruction emulation, i.e. MOV CR3 and INVPCID. Add a comment in vmx_vcpu_load_vmcs() above its KVM_REQ_TLB_FLUSH to make it clear that it deliberately requests a flush of all contexts. Service any pending flush request on nested VM-Exit as it's possible a nested VM-Exit could occur after requesting a flush for L2. Add the same logic for nested VM-Enter even though it's _extremely_ unlikely for flush to be pending on nested VM-Enter, but theoretically possible (in the future) due to RSM (SMM) emulation. [*] Intel also has an Address Space Identifier (ASID) concept, e.g. EPTP+VPID+PCID == ASID, it's just not documented in the SDM because the rules of invalidation are different based on which piece of the ASID is being changed, i.e. whether the EPTP, VPID, or PCID context must be invalidated. 
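The resulting request handling in vcpu_enter_guest() is, in condensed form (matching the diff below):

    if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
            kvm_vcpu_flush_tlb_all(vcpu);

            /* Flushing all ASIDs also flushes the current ASID. */
            kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
    }
    if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
            kvm_vcpu_flush_tlb_current(vcpu);
    if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu))
            kvm_vcpu_flush_tlb_guest(vcpu);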
Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-25-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 +++- arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/vmx/nested.c | 7 +++++++ arch/x86/kvm/vmx/vmx.c | 7 ++++++- arch/x86/kvm/x86.c | 11 +++++++++-- arch/x86/kvm/x86.h | 6 ++++++ 6 files changed, 32 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e6305d5e28fa..72e9c4492f47 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -83,8 +83,9 @@ #define KVM_REQ_GET_VMCS12_PAGES KVM_ARCH_REQ(24) #define KVM_REQ_APICV_UPDATE \ KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_HV_TLB_FLUSH \ - KVM_ARCH_REQ_FLAGS(26, KVM_REQUEST_NO_WAKEUP) + KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -1104,6 +1105,7 @@ struct kvm_x86_ops { void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); void (*tlb_flush_all)(struct kvm_vcpu *vcpu); + void (*tlb_flush_current)(struct kvm_vcpu *vcpu); int (*tlb_remote_flush)(struct kvm *kvm); int (*tlb_remote_flush_with_range)(struct kvm *kvm, struct kvm_tlb_range *range); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 72b976e64df9..66123848448d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3945,6 +3945,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_rflags = svm_set_rflags, .tlb_flush_all = svm_flush_tlb, + .tlb_flush_current = svm_flush_tlb, .tlb_flush_gva = svm_flush_tlb_gva, .tlb_flush_guest = svm_flush_tlb, diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index ba1aedb535ef..567a7a0f30e0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3208,6 +3208,9 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, u32 exit_reason = EXIT_REASON_INVALID_STATE; u32 exit_qual; + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + evaluate_pending_interrupts = exec_controls_get(vmx) & (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING); if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu)) @@ -4274,6 +4277,10 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); + /* Service the TLB flush request for L2 before switching to L1. */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); + leave_guest_mode(vcpu); if (nested_cpu_has_preemption_timer(vmcs12)) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index da868846d96d..d958cee52f41 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1338,6 +1338,10 @@ void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu) void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; + /* + * Flush all EPTP/VPID contexts, the new pCPU may have stale + * TLB entries from its previous association with the vCPU. 
+ */ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); /* @@ -5468,7 +5472,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) if (kvm_get_active_pcid(vcpu) == operand.pcid) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) @@ -7764,6 +7768,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_rflags = vmx_set_rflags, .tlb_flush_all = vmx_flush_tlb_all, + .tlb_flush_current = vmx_flush_tlb_current, .tlb_flush_gva = vmx_flush_tlb_gva, .tlb_flush_guest = vmx_flush_tlb_guest, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd2a3d01bffb..ebbe34d89469 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1019,7 +1019,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) { if (!skip_tlb_flush) { kvm_mmu_sync_roots(vcpu); - kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); } return 0; } @@ -8222,10 +8222,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) kvm_mmu_load_pgd(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { kvm_vcpu_flush_tlb_all(vcpu); + + /* Flushing all ASIDs flushes the current ASID... */ + kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); + } + if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) + kvm_vcpu_flush_tlb_current(vcpu); if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu)) kvm_vcpu_flush_tlb_guest(vcpu); + if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index b968acc0516f..7b5ed8ed628e 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -125,6 +125,12 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; } +static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops.tlb_flush_current(vcpu); +} + static inline int is_pae(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); -- cgit v1.2.3 From a4148b7ca2a5afe1295a41b5e30048cabcb74f8d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:24 -0700 Subject: KVM: VMX: Retrieve APIC access page HPA only when necessary Move the retrieval of the HPA associated with L1's APIC access page into VMX code to avoid unnecessarily calling gfn_to_page(), e.g. when the vCPU is in guest mode (L2). Alternatively, the optimization logic in VMX could be mirrored into the common x86 code, but that will get ugly fast when further optimizations are introduced. 
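The common-code side shrinks to roughly the following sketch; the gfn_to_page() lookup now lives entirely in the VMX callback and is skipped while the vCPU is in guest mode (L2):

    void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
    {
            if (!lapic_in_kernel(vcpu) || !kvm_x86_ops.set_apic_access_page_addr)
                    return;

            /* VMX resolves the HPA itself, deferring if vmcs01 isn't current. */
            kvm_x86_ops.set_apic_access_page_addr(vcpu);
    }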
Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-29-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/vmx/vmx.c | 16 ++++++++++++++-- arch/x86/kvm/x86.c | 13 +------------ 3 files changed, 16 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 72e9c4492f47..541e2df8fc6e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1152,7 +1152,7 @@ struct kvm_x86_ops { bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); - void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); + void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); int (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4739b780c74e..8550da629a61 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6140,16 +6140,28 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) vmx_update_msr_bitmap(vcpu); } -static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) +static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) { + struct page *page; + /* Defer reload until vmcs01 is the current VMCS. */ if (is_guest_mode(vcpu)) { to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true; return; } - vmcs_write64(APIC_ACCESS_ADDR, hpa); + page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); + if (is_error_page(page)) + return; + + vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page)); vmx_flush_tlb_current(vcpu); + + /* + * Do not pin apic access page in memory, the MMU notifier + * will call us again if it is migrated or swapped out. + */ + put_page(page); } static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ebbe34d89469..90aa4abbc0a6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8157,24 +8157,13 @@ int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) { - struct page *page = NULL; - if (!lapic_in_kernel(vcpu)) return; if (!kvm_x86_ops.set_apic_access_page_addr) return; - page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); - if (is_error_page(page)) - return; - kvm_x86_ops.set_apic_access_page_addr(vcpu, page_to_phys(page)); - - /* - * Do not pin apic access page in memory, the MMU notifier - * will call us again if it is migrated or swapped out. - */ - put_page(page); + kvm_x86_ops.set_apic_access_page_addr(vcpu); } void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 4a632ac6ca66fb29b94a16495624c58f4d313f2f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:27 -0700 Subject: KVM: x86/mmu: Add separate override for MMU sync during fast CR3 switch Add a separate "skip" override for MMU sync, a future change to avoid TLB flushes on nested VMX transitions may need to sync the MMU even if the TLB flush is unnecessary. 
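The updated helper takes the two overrides separately; existing callers simply pass the same value for both, e.g. (condensed from the diff below):

    void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
                         bool skip_tlb_flush, bool skip_mmu_sync);

    /* kvm_set_cr3(): previous behaviour, both skips follow the PCID hint */
    kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);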
Suggested-by: Paolo Bonzini Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-32-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/mmu/mmu.c | 13 +++++++------ arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/x86.c | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 541e2df8fc6e..ee5886a5006a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1524,7 +1524,8 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush); +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush, + bool skip_mmu_sync); void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 6a939204d467..97d1e9b80b8e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4303,7 +4303,7 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, union kvm_mmu_page_role new_role, - bool skip_tlb_flush) + bool skip_tlb_flush, bool skip_mmu_sync) { if (!fast_cr3_switch(vcpu, new_cr3, new_role)) { kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); @@ -4318,10 +4318,10 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu); - if (!skip_tlb_flush) { + if (!skip_mmu_sync) kvm_make_request(KVM_REQ_MMU_SYNC, vcpu); + if (!skip_tlb_flush) kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); - } /* * The last MMIO access's GVA and GPA are cached in the VCPU. 
When @@ -4334,10 +4334,11 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush) +void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush, + bool skip_mmu_sync) { __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), - skip_tlb_flush); + skip_tlb_flush, skip_mmu_sync); } EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); @@ -5030,7 +5031,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false); + __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, false, false); if (new_role.as_u64 == context->mmu_role.as_u64) return; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index a7a526a21292..bf5efb30cc21 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1101,7 +1101,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne } if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3, false); + kvm_mmu_new_cr3(vcpu, cr3, false, false); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 90aa4abbc0a6..7bf5193ed9b4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1031,7 +1031,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush); + kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); -- cgit v1.2.3 From be01e8e2c632c41c69bb30e7196661ec6e8fdc10 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 20 Mar 2020 14:28:32 -0700 Subject: KVM: x86: Replace "cr3" with "pgd" in "new cr3/pgd" related code Rename functions and variables in kvm_mmu_new_cr3() and related code to replace "cr3" with "pgd", i.e. continue the work started by commit 727a7e27cf88a ("KVM: x86: rename set_cr3 callback and related flags to load_mmu_pgd"). kvm_mmu_new_cr3() and company are not always loading a new CR3, e.g. when nested EPT is enabled "cr3" is actually an EPTP. 
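After the rename, the cached-root bookkeeping reads, in sketch form (taken from the diff below):

    struct kvm_mmu_root_info {
            gpa_t pgd;      /* guest CR3, or an EPTP when nested EPT is in use */
            hpa_t hpa;
    };

    void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush,
                         bool skip_mmu_sync);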
Signed-off-by: Sean Christopherson Message-Id: <20200320212833.3507-37-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +++--- arch/x86/kvm/mmu/mmu.c | 58 ++++++++++++++++++++--------------------- arch/x86/kvm/vmx/nested.c | 6 ++--- arch/x86/kvm/vmx/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- 5 files changed, 38 insertions(+), 38 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ee5886a5006a..d099749168f0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -375,12 +375,12 @@ struct rsvd_bits_validate { }; struct kvm_mmu_root_info { - gpa_t cr3; + gpa_t pgd; hpa_t hpa; }; #define KVM_MMU_ROOT_INFO_INVALID \ - ((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE }) + ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE }) #define KVM_MMU_NUM_PREV_ROOTS 3 @@ -406,7 +406,7 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - gpa_t root_cr3; + gpa_t root_pgd; union kvm_mmu_role mmu_role; u8 root_level; u8 shadow_root_level; @@ -1524,7 +1524,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gva_t gva, hpa_t root_hpa); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush, +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, bool skip_mmu_sync); void kvm_configure_mmu(bool enable_tdp, int tdp_page_level); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 10fb9e7f1eca..3385d6dde541 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3665,7 +3665,7 @@ void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, &invalid_list); mmu->root_hpa = INVALID_PAGE; } - mmu->root_cr3 = 0; + mmu->root_pgd = 0; } kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); @@ -3722,8 +3722,8 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu) } else BUG(); - /* root_cr3 is ignored for direct MMUs. */ - vcpu->arch.mmu->root_cr3 = 0; + /* root_pgd is ignored for direct MMUs. 
*/ + vcpu->arch.mmu->root_pgd = 0; return 0; } @@ -3732,11 +3732,11 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; u64 pdptr, pm_mask; - gfn_t root_gfn, root_cr3; + gfn_t root_gfn, root_pgd; int i; - root_cr3 = vcpu->arch.mmu->get_guest_pgd(vcpu); - root_gfn = root_cr3 >> PAGE_SHIFT; + root_pgd = vcpu->arch.mmu->get_guest_pgd(vcpu); + root_gfn = root_pgd >> PAGE_SHIFT; if (mmu_check_root(vcpu, root_gfn)) return 1; @@ -3761,7 +3761,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ++sp->root_count; spin_unlock(&vcpu->kvm->mmu_lock); vcpu->arch.mmu->root_hpa = root; - goto set_root_cr3; + goto set_root_pgd; } /* @@ -3827,8 +3827,8 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu->root_hpa = __pa(vcpu->arch.mmu->lm_root); } -set_root_cr3: - vcpu->arch.mmu->root_cr3 = root_cr3; +set_root_pgd: + vcpu->arch.mmu->root_pgd = root_pgd; return 0; } @@ -4244,49 +4244,49 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu, context->nx = false; } -static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t cr3, +static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd, union kvm_mmu_page_role role) { - return (role.direct || cr3 == root->cr3) && + return (role.direct || pgd == root->pgd) && VALID_PAGE(root->hpa) && page_header(root->hpa) && role.word == page_header(root->hpa)->role.word; } /* - * Find out if a previously cached root matching the new CR3/role is available. + * Find out if a previously cached root matching the new pgd/role is available. * The current root is also inserted into the cache. * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is * returned. * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and * false is returned. This root should now be freed by the caller. 
*/ -static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3, +static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { uint i; struct kvm_mmu_root_info root; struct kvm_mmu *mmu = vcpu->arch.mmu; - root.cr3 = mmu->root_cr3; + root.pgd = mmu->root_pgd; root.hpa = mmu->root_hpa; - if (is_root_usable(&root, new_cr3, new_role)) + if (is_root_usable(&root, new_pgd, new_role)) return true; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { swap(root, mmu->prev_roots[i]); - if (is_root_usable(&root, new_cr3, new_role)) + if (is_root_usable(&root, new_pgd, new_role)) break; } mmu->root_hpa = root.hpa; - mmu->root_cr3 = root.cr3; + mmu->root_pgd = root.pgd; return i < KVM_MMU_NUM_PREV_ROOTS; } -static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, +static bool fast_pgd_switch(struct kvm_vcpu *vcpu, gpa_t new_pgd, union kvm_mmu_page_role new_role) { struct kvm_mmu *mmu = vcpu->arch.mmu; @@ -4298,17 +4298,17 @@ static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3, */ if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL && mmu->root_level >= PT64_ROOT_4LEVEL) - return !mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT) && - cached_root_available(vcpu, new_cr3, new_role); + return !mmu_check_root(vcpu, new_pgd >> PAGE_SHIFT) && + cached_root_available(vcpu, new_pgd, new_role); return false; } -static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, +static void __kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, union kvm_mmu_page_role new_role, bool skip_tlb_flush, bool skip_mmu_sync) { - if (!fast_cr3_switch(vcpu, new_cr3, new_role)) { + if (!fast_pgd_switch(vcpu, new_pgd, new_role)) { kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, KVM_MMU_ROOT_CURRENT); return; } @@ -4337,13 +4337,13 @@ static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, __clear_sp_write_flooding_count(page_header(vcpu->arch.mmu->root_hpa)); } -void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush, +void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd, bool skip_tlb_flush, bool skip_mmu_sync) { - __kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu), + __kvm_mmu_new_pgd(vcpu, new_pgd, kvm_mmu_calc_root_page_role(vcpu), skip_tlb_flush, skip_mmu_sync); } -EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3); +EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd); static unsigned long get_cr3(struct kvm_vcpu *vcpu) { @@ -5034,7 +5034,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty, execonly, level); - __kvm_mmu_new_cr3(vcpu, new_eptp, new_role.base, true, true); + __kvm_mmu_new_pgd(vcpu, new_eptp, new_role.base, true, true); if (new_role.as_u64 == context->mmu_role.as_u64) return; @@ -5551,7 +5551,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && - pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) { + pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); tlb_flush = true; } @@ -5705,13 +5705,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu = &vcpu->arch.root_mmu; vcpu->arch.root_mmu.root_hpa = INVALID_PAGE; - vcpu->arch.root_mmu.root_cr3 = 0; + vcpu->arch.root_mmu.root_pgd = 0; vcpu->arch.root_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.root_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; vcpu->arch.guest_mmu.root_hpa = 
INVALID_PAGE; - vcpu->arch.guest_mmu.root_cr3 = 0; + vcpu->arch.guest_mmu.root_pgd = 0; vcpu->arch.guest_mmu.translate_gpa = translate_gpa; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) vcpu->arch.guest_mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7606d544ce9c..aca57d8da400 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1148,7 +1148,7 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne * nested_vmx_transition_mmu_sync for details on skipping the MMU sync. */ if (!nested_ept) - kvm_mmu_new_cr3(vcpu, cr3, true, + kvm_mmu_new_pgd(vcpu, cr3, true, !nested_vmx_transition_mmu_sync(vcpu)); vcpu->arch.cr3 = cr3; @@ -5228,13 +5228,13 @@ static int handle_invept(struct kvm_vcpu *vcpu) VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); roots_to_free = 0; - if (nested_ept_root_matches(mmu->root_hpa, mmu->root_cr3, + if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd, operand.eptp)) roots_to_free |= KVM_MMU_ROOT_CURRENT; for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (nested_ept_root_matches(mmu->prev_roots[i].hpa, - mmu->prev_roots[i].cr3, + mmu->prev_roots[i].pgd, operand.eptp)) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a87b8c1e8de6..c9c959fb8a66 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5476,7 +5476,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) - if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3) + if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd) == operand.pcid) roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bf5193ed9b4..de77bc9bd0d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1031,7 +1031,7 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) return 1; - kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); + kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush); vcpu->arch.cr3 = cr3; kvm_register_mark_available(vcpu, VCPU_EXREG_CR3); -- cgit v1.2.3 From 5addc235199f15ae964e7ac6b20cf43f4a661573 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Apr 2020 13:34:53 -0700 Subject: KVM: VMX: Cache vmcs.EXIT_QUALIFICATION using arch avail_reg flags Introduce a new "extended register" type, EXIT_INFO_1 (to pair with the nomenclature in .get_exit_info()), and use it to cache VMX's vmcs.EXIT_QUALIFICATION. 
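The cached accessor reads the VMCS field at most once per exit and is invalidated by vmx_register_cache_reset(); in condensed form (matching the diff below):

    static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
    {
            struct vcpu_vmx *vmx = to_vmx(vcpu);

            if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
                    kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
                    vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
            }
            return vmx->exit_qualification;
    }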
Signed-off-by: Sean Christopherson Message-Id: <20200415203454.8296-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/nested.c | 20 ++++++++++---------- arch/x86/kvm/vmx/vmx.c | 28 ++++++++++++++-------------- arch/x86/kvm/vmx/vmx.h | 15 ++++++++++++++- 4 files changed, 39 insertions(+), 25 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d099749168f0..9673ec700cd0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -170,6 +170,7 @@ enum kvm_reg { VCPU_EXREG_CR3, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, + VCPU_EXREG_EXIT_INFO_1, }; enum { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index dad6b05e35ae..dc16b2f8ce97 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4604,7 +4604,7 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer) gva_t gva; struct x86_exception e; - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmcs_read32(VMX_INSTRUCTION_INFO), false, sizeof(*vmpointer), &gva)) return 1; @@ -4869,7 +4869,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@ -4955,7 +4955,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) { struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu) : get_vmcs12(vcpu); - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); struct vcpu_vmx *vmx = to_vmx(vcpu); struct x86_exception e; @@ -5140,7 +5140,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) /* Emulate the VMPTRST instruction */ static int handle_vmptrst(struct kvm_vcpu *vcpu) { - unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qual = vmx_get_exit_qual(vcpu); u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO); gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr; struct x86_exception e; @@ -5208,7 +5208,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) /* According to the Intel VMX instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { @@ -5290,7 +5290,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) /* according to the intel vmx instruction reference, the memory * operand is read even if it isn't needed (e.g., for type==global) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) { @@ -5420,7 +5420,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) fail: nested_vmx_vmexit(vcpu, vmx->exit_reason, vmcs_read32(VM_EXIT_INTR_INFO), - vmcs_readl(EXIT_QUALIFICATION)); + 
vmx_get_exit_qual(vcpu)); return 1; } @@ -5471,7 +5471,7 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu, if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS)) return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; @@ -5525,7 +5525,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu, static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int cr = exit_qualification & 15; int reg; unsigned long val; @@ -5849,7 +5849,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) } exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - exit_qual = vmcs_readl(EXIT_QUALIFICATION); + exit_qual = vmx_get_exit_qual(vcpu); trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, vmx->idt_vectoring_info, exit_intr_info, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e0587f532b64..b7aab2284a9c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4698,7 +4698,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) } if (is_page_fault(intr_info)) { - cr2 = vmcs_readl(EXIT_QUALIFICATION); + cr2 = vmx_get_exit_qual(vcpu); /* EPT won't cause page fault directly */ WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); @@ -4714,7 +4714,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); return 1; case DB_VECTOR: - dr6 = vmcs_readl(EXIT_QUALIFICATION); + dr6 = vmx_get_exit_qual(vcpu); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~DR_TRAP_BITS; @@ -4769,7 +4769,7 @@ static int handle_io(struct kvm_vcpu *vcpu) int size, in, string; unsigned port; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); string = (exit_qualification & 16) != 0; ++vcpu->stat.io_exits; @@ -4860,7 +4860,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) int err; int ret; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); cr = exit_qualification & 15; reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { @@ -4937,7 +4937,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) unsigned long exit_qualification; int dr, dr7, reg; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); dr = exit_qualification & DEBUG_REG_ACCESS_NUM; /* First, if DR does not exist, trigger UD */ @@ -5050,7 +5050,7 @@ static int handle_invd(struct kvm_vcpu *vcpu) static int handle_invlpg(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); kvm_mmu_invlpg(vcpu, exit_qualification); return kvm_skip_emulated_instruction(vcpu); @@ -5082,7 +5082,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { if (likely(fasteoi)) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int access_type, offset; access_type = exit_qualification & APIC_ACCESS_TYPE; @@ -5103,7 +5103,7 @@ static int 
handle_apic_access(struct kvm_vcpu *vcpu) static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); int vector = exit_qualification & 0xff; /* EOI-induced VM exit is trap-like and thus no need to adjust IP */ @@ -5113,7 +5113,7 @@ static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu) static int handle_apic_write(struct kvm_vcpu *vcpu) { - unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + unsigned long exit_qualification = vmx_get_exit_qual(vcpu); u32 offset = exit_qualification & 0xfff; /* APIC-write VM exit is trap-like and thus no need to adjust IP */ @@ -5134,7 +5134,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); reason = (u32)exit_qualification >> 30; if (reason == TASK_SWITCH_GATE && idt_v) { @@ -5184,7 +5184,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) gpa_t gpa; u64 error_code; - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * EPT violation happened while executing iret from NMI, @@ -5444,7 +5444,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) */ - if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION), + if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu), vmx_instruction_info, false, sizeof(operand), &gva)) return 1; @@ -5520,7 +5520,7 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) trace_kvm_pml_full(vcpu->vcpu_id); - exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + exit_qualification = vmx_get_exit_qual(vcpu); /* * PML buffer FULL happened while executing iret from NMI, @@ -5634,7 +5634,7 @@ static const int kvm_vmx_max_exit_handlers = static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { - *info1 = vmcs_readl(EXIT_QUALIFICATION); + *info1 = vmx_get_exit_qual(vcpu); *info2 = vmcs_read32(VM_EXIT_INTR_INFO); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 78c99a60fa49..a13eafec67fc 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -210,6 +210,7 @@ struct vcpu_vmx { */ bool guest_state_loaded; + unsigned long exit_qualification; u32 exit_intr_info; u32 idt_vectoring_info; ulong rflags; @@ -449,7 +450,8 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_RFLAGS) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) - | (1 << VCPU_EXREG_CR3)); + | (1 << VCPU_EXREG_CR3) + | (1 << VCPU_EXREG_EXIT_INFO_1)); vcpu->arch.regs_dirty = 0; } @@ -493,6 +495,17 @@ static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1); + vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + } + return vmx->exit_qualification; +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); -- cgit v1.2.3 From 
8791585837f659943936b8e1cce9d039436ad1ca Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 15 Apr 2020 13:34:54 -0700 Subject: KVM: VMX: Cache vmcs.EXIT_INTR_INFO using arch avail_reg flags Introduce a new "extended register" type, EXIT_INFO_2 (to pair with the nomenclature in .get_exit_info()), and use it to cache VMX's vmcs.EXIT_INTR_INFO. Drop a comment in vmx_recover_nmi_blocking() that is obsoleted by the generic caching mechanism. Signed-off-by: Sean Christopherson Message-Id: <20200415203454.8296-6-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/nested.c | 10 +++++----- arch/x86/kvm/vmx/vmx.c | 20 ++++++++------------ arch/x86/kvm/vmx/vmx.h | 14 +++++++++++++- 4 files changed, 27 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9673ec700cd0..ceba205d32a2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -171,6 +171,7 @@ enum kvm_reg { VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, VCPU_EXREG_EXIT_INFO_1, + VCPU_EXREG_EXIT_INFO_2, }; enum { diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index dc16b2f8ce97..78744c6e4a04 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5419,7 +5419,7 @@ static int handle_vmfunc(struct kvm_vcpu *vcpu) fail: nested_vmx_vmexit(vcpu, vmx->exit_reason, - vmcs_read32(VM_EXIT_INTR_INFO), + vmx_get_intr_info(vcpu), vmx_get_exit_qual(vcpu)); return 1; } @@ -5652,7 +5652,7 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) @@ -5708,12 +5708,12 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) */ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) { + u32 intr_info = vmx_get_intr_info(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - u32 intr_info; switch (exit_reason) { case EXIT_REASON_EXCEPTION_NMI: - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + intr_info = vmx_get_intr_info(vcpu); if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) @@ -5848,7 +5848,7 @@ bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu) goto reflect_vmexit; } - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + exit_intr_info = vmx_get_intr_info(vcpu); exit_qual = vmx_get_exit_qual(vcpu); trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason, exit_qual, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index b7aab2284a9c..50951cdbe76f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5635,7 +5635,7 @@ static const int kvm_vmx_max_exit_handlers = static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) { *info1 = vmx_get_exit_qual(vcpu); - *info2 = vmcs_read32(VM_EXIT_INTR_INFO); + *info2 = vmx_get_intr_info(vcpu); } static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx) @@ -6298,16 +6298,16 @@ static void vmx_apicv_post_state_restore(struct kvm_vcpu *vcpu) static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) { - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + u32 intr_info = vmx_get_intr_info(&vmx->vcpu); /* if exit due to PF check for async PF */ - if (is_page_fault(vmx->exit_intr_info)) { + if (is_page_fault(intr_info)) { 
vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); /* Handle machine checks before interrupts are enabled */ - } else if (is_machine_check(vmx->exit_intr_info)) { + } else if (is_machine_check(intr_info)) { kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ - } else if (is_nmi(vmx->exit_intr_info)) { + } else if (is_nmi(intr_info)) { kvm_before_interrupt(&vmx->vcpu); asm("int $2"); kvm_after_interrupt(&vmx->vcpu); @@ -6322,9 +6322,8 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned long tmp; #endif gate_desc *desc; - u32 intr_info; + u32 intr_info = vmx_get_intr_info(vcpu); - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (WARN_ONCE(!is_external_intr(intr_info), "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6405,11 +6404,8 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) if (enable_vnmi) { if (vmx->loaded_vmcs->nmi_known_unmasked) return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + + exit_intr_info = vmx_get_intr_info(&vmx->vcpu); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a13eafec67fc..edfb739e5907 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -451,7 +451,8 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3) - | (1 << VCPU_EXREG_EXIT_INFO_1)); + | (1 << VCPU_EXREG_EXIT_INFO_1) + | (1 << VCPU_EXREG_EXIT_INFO_2)); vcpu->arch.regs_dirty = 0; } @@ -506,6 +507,17 @@ static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu) return vmx->exit_qualification; } +static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) { + kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2); + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + } + return vmx->exit_intr_info; +} + struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags); void free_vmcs(struct vmcs *vmcs); int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs); -- cgit v1.2.3 From a9ab13ff6e844ad5b3ed39339e6db9a76bb539ad Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 10 Apr 2020 10:47:03 -0700 Subject: KVM: X86: Improve latency for single target IPI fastpath IPI and Timer cause the main MSRs write vmexits in cloud environment observation, let's optimize virtual IPI latency more aggressively to inject target IPI as soon as possible. Running kvm-unit-tests/vmexit.flat IPI testing on SKX server, disable adaptive advance lapic timer and adaptive halt-polling to avoid the interference, this patch can give another 7% improvement. 
w/o fastpath -> x86.c fastpath 4238 -> 3543 16.4% x86.c fastpath -> vmx.c fastpath 3543 -> 3293 7% w/o fastpath -> vmx.c fastpath 4238 -> 3293 22.3% Cc: Haiwei Li Signed-off-by: Wanpeng Li Signed-off-by: Sean Christopherson Message-Id: <20200410174703.1138-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 ++--- arch/x86/kvm/svm/svm.c | 24 ++++++++++++++++-------- arch/x86/kvm/vmx/vmx.c | 22 +++++++++++++--------- arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 34 insertions(+), 23 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ceba205d32a2..f26df2cb0591 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1126,7 +1126,7 @@ struct kvm_x86_ops { */ void (*tlb_flush_guest)(struct kvm_vcpu *vcpu); - void (*run)(struct kvm_vcpu *vcpu); + enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); @@ -1176,8 +1176,7 @@ struct kvm_x86_ops { struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception); - void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath); + void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 747e60c65c98..a6f4e1bdb045 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3299,10 +3299,21 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } +static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +{ + if (!is_guest_mode(vcpu) && + to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && + to_svm(vcpu)->vmcb->control.exit_info_1) + return handle_fastpath_set_msr_irqoff(vcpu); + + return EXIT_FASTPATH_NONE; +} + void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); -static void svm_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) { + enum exit_fastpath_completion exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; @@ -3314,7 +3325,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) * again. */ if (unlikely(svm->nested.exit_required)) - return; + return EXIT_FASTPATH_NONE; /* * Disable singlestep if we're injecting an interrupt/exception. 
@@ -3398,6 +3409,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) stgi(); /* Any pending NMI will happen here */ + exit_fastpath = svm_exit_handlers_fastpath(vcpu); if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_after_interrupt(&svm->vcpu); @@ -3426,6 +3438,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm_handle_mce(svm); mark_all_clean(svm->vmcb); + return exit_fastpath; } static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long root) @@ -3727,13 +3740,8 @@ out: return ret; } -static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { - if (!is_guest_mode(vcpu) && - to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && - to_svm(vcpu)->vmcb->control.exit_info_1) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 1fc10b821b78..766303b31949 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6350,8 +6350,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) } STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff); -static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion *exit_fastpath) +static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6359,9 +6358,6 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu, handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI) handle_exception_nmi_irqoff(vmx); - else if (!is_guest_mode(vcpu) && - vmx->exit_reason == EXIT_REASON_MSR_WRITE) - *exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); } static bool vmx_has_emulated_msr(int index) @@ -6565,8 +6561,9 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); -static void vmx_vcpu_run(struct kvm_vcpu *vcpu) +static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) { + enum exit_fastpath_completion exit_fastpath; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; @@ -6578,7 +6575,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) - return; + return EXIT_FASTPATH_NONE; if (vmx->ple_window_dirty) { vmx->ple_window_dirty = false; @@ -6726,7 +6723,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(vmx->fail)) { vmx->exit_reason = 0xdead; - return; + return EXIT_FASTPATH_NONE; } vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); @@ -6734,13 +6731,20 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_machine_check(); if (unlikely(vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) - return; + return EXIT_FASTPATH_NONE; + + if (!is_guest_mode(vcpu) && vmx->exit_reason == EXIT_REASON_MSR_WRITE) + exit_fastpath = handle_fastpath_set_msr_irqoff(vcpu); + else + exit_fastpath = EXIT_FASTPATH_NONE; vmx->loaded_vmcs->launched = 1; vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); vmx_recover_nmi_blocking(vmx); vmx_complete_interrupts(vmx); + + return exit_fastpath; } static void vmx_free_vcpu(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8104dc9ff5b2..8d9bcd5faac5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8179,7 
+8179,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); - enum exit_fastpath_completion exit_fastpath = EXIT_FASTPATH_NONE; + enum exit_fastpath_completion exit_fastpath; bool req_immediate_exit = false; @@ -8406,7 +8406,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } - kvm_x86_ops.run(vcpu); + exit_fastpath = kvm_x86_ops.run(vcpu); /* * Do this here before restoring debug registers on the host. And @@ -8438,7 +8438,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_x86_ops.handle_exit_irqoff(vcpu, &exit_fastpath); + kvm_x86_ops.handle_exit_irqoff(vcpu); /* * Consume any pending interrupts, including the possible source of -- cgit v1.2.3 From b746046238bb99b8f703c79f6d95357428fb6476 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2020 10:15:51 +0200 Subject: objtool: Better handle IRET Teach objtool a little more about IRET so that we can avoid using the SAVE/RESTORE annotation. In particular, make the weird corner case in insn->restore go away. The purpose of that corner case is to deal with the fact that UNWIND_HINT_RESTORE lands on the instruction after IRET, but that instruction can end up being outside the basic block, consider: if (cond) sync_core() foo(); Then the hint will land on foo(), and we'll encounter the restore hint without ever having seen the save hint. By teaching objtool about the arch specific exception frame size, and assuming that any IRET in an STT_FUNC symbol is an exception frame sized POP, we can remove the use of save/restore hints for this code. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Reviewed-by: Alexandre Chartre Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200416115118.631224674@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 -- tools/objtool/arch.h | 1 + tools/objtool/arch/x86/decode.c | 14 ++++++++++++-- tools/objtool/check.c | 29 ++++++++++++++++------------- 4 files changed, 29 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3bcf27caf6c9..3eeaaeb75638 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -727,7 +727,6 @@ static inline void sync_core(void) unsigned int tmp; asm volatile ( - UNWIND_HINT_SAVE "mov %%ss, %0\n\t" "pushq %q0\n\t" "pushq %%rsp\n\t" @@ -737,7 +736,6 @@ static inline void sync_core(void) "pushq %q0\n\t" "pushq $1f\n\t" "iretq\n\t" - UNWIND_HINT_RESTORE "1:" : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif diff --git a/tools/objtool/arch.h b/tools/objtool/arch.h index f9883c431949..55396dfe0d07 100644 --- a/tools/objtool/arch.h +++ b/tools/objtool/arch.h @@ -19,6 +19,7 @@ enum insn_type { INSN_CALL, INSN_CALL_DYNAMIC, INSN_RETURN, + INSN_EXCEPTION_RETURN, INSN_CONTEXT_SWITCH, INSN_STACK, INSN_BUG, diff --git a/tools/objtool/arch/x86/decode.c b/tools/objtool/arch/x86/decode.c index 199b4084a13c..32736383ead1 100644 --- a/tools/objtool/arch/x86/decode.c +++ b/tools/objtool/arch/x86/decode.c @@ -446,9 +446,19 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, *type = INSN_RETURN; break; + case 0xcf: /* iret */ + *type = INSN_EXCEPTION_RETURN; + + /* add $40, %rsp */ + op->src.type = OP_SRC_ADD; + op->src.reg = CFI_SP; + op->src.offset = 5*8; + op->dest.type = OP_DEST_REG; + 
op->dest.reg = CFI_SP; + break; + case 0xca: /* retf */ case 0xcb: /* retf */ - case 0xcf: /* iret */ *type = INSN_CONTEXT_SWITCH; break; @@ -494,7 +504,7 @@ int arch_decode_instruction(struct elf *elf, struct section *sec, *immediate = insn.immediate.nbytes ? insn.immediate.value : 0; - if (*type == INSN_STACK) + if (*type == INSN_STACK || *type == INSN_EXCEPTION_RETURN) list_add_tail(&op->list, ops_list); else free(op); diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 9e854fd128d4..781b3a3c2ba6 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -2065,15 +2065,14 @@ static int validate_return(struct symbol *func, struct instruction *insn, struct * tools/objtool/Documentation/stack-validation.txt. */ static int validate_branch(struct objtool_file *file, struct symbol *func, - struct instruction *first, struct insn_state state) + struct instruction *insn, struct insn_state state) { struct alternative *alt; - struct instruction *insn, *next_insn; + struct instruction *next_insn; struct section *sec; u8 visited; int ret; - insn = first; sec = insn->sec; if (insn->alt_group && list_empty(&insn->alts)) { @@ -2126,16 +2125,6 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, } if (!save_insn->visited) { - /* - * Oops, no state to copy yet. - * Hopefully we can reach this - * instruction from another branch - * after the save insn has been - * visited. - */ - if (insn == first) - return 0; - WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", sec, insn->offset); return 1; @@ -2228,6 +2217,20 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; + case INSN_EXCEPTION_RETURN: + if (handle_insn_ops(insn, &state)) + return 1; + + /* + * This handles x86's sync_core() case, where we use an + * IRET to self. All 'normal' IRET instructions are in + * STT_NOTYPE entry symbols. + */ + if (func) + break; + + return 0; + case INSN_CONTEXT_SWITCH: if (func && (!next_insn || !next_insn->hint)) { WARN_FUNC("unsupported instruction in callable function", -- cgit v1.2.3 From e25eea89bb8853763a22fa2547199cf96b571ba1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 16:38:19 +0200 Subject: objtool: Introduce HINT_RET_OFFSET Normally objtool ensures a function keeps the stack layout invariant. But there is a useful exception, it is possible to stuff the return stack in order to 'inject' a 'call': push $fun ret In this case the invariant mentioned above is violated. Add an objtool HINT to annotate this and allow a function exit with a modified stack frame. 
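As a rough illustration of the pattern this hint is meant to annotate, a minimal sketch follows; the function and target symbol are invented for this example and are not taken from the tree:

	SYM_FUNC_START(example_inject_call)
		/* "Inject" a call by stuffing the return stack. */
		push	$example_target
		/*
		 * The stack is now one word deeper than at function entry,
		 * which would normally trip objtool's modified-stack-frame
		 * check on the RET below; the hint (default sp_offset=8)
		 * tells objtool that this offset is expected here.
		 */
		UNWIND_HINT_RET_OFFSET
		ret
	SYM_FUNC_END(example_inject_call)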
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Reviewed-by: Alexandre Chartre Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200416115118.690601403@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/orc_types.h | 1 + arch/x86/include/asm/unwind_hints.h | 10 ++++++++++ tools/arch/x86/include/asm/orc_types.h | 1 + tools/objtool/check.c | 24 ++++++++++++++++-------- tools/objtool/check.h | 4 +++- 5 files changed, 31 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 6e060907c163..5f18ca7ac51a 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -60,6 +60,7 @@ #define ORC_TYPE_REGS_IRET 2 #define UNWIND_HINT_TYPE_SAVE 3 #define UNWIND_HINT_TYPE_RESTORE 4 +#define UNWIND_HINT_TYPE_RET_OFFSET 5 #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index f5e2eb12cb71..aabf7ace0476 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -94,6 +94,16 @@ UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE .endm + +/* + * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN + * and sibling calls. On these, sp_offset denotes the expected offset from + * initial_func_cfi. + */ +.macro UNWIND_HINT_RET_OFFSET sp_offset=8 + UNWIND_HINT type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset +.endm + #else /* !__ASSEMBLY__ */ #define UNWIND_HINT(sp_reg, sp_offset, type, end) \ diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index 6e060907c163..5f18ca7ac51a 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -60,6 +60,7 @@ #define ORC_TYPE_REGS_IRET 2 #define UNWIND_HINT_TYPE_SAVE 3 #define UNWIND_HINT_TYPE_RESTORE 4 +#define UNWIND_HINT_TYPE_RET_OFFSET 5 #ifndef __ASSEMBLY__ /* diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 781b3a3c2ba6..93c88ac51f0f 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1261,6 +1261,9 @@ static int read_unwind_hints(struct objtool_file *file) } else if (hint->type == UNWIND_HINT_TYPE_RESTORE) { insn->restore = true; insn->hint = true; + + } else if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) { + insn->ret_offset = hint->sp_offset; continue; } @@ -1424,20 +1427,25 @@ static bool is_fentry_call(struct instruction *insn) return false; } -static bool has_modified_stack_frame(struct insn_state *state) +static bool has_modified_stack_frame(struct instruction *insn, struct insn_state *state) { + u8 ret_offset = insn->ret_offset; int i; - if (state->cfa.base != initial_func_cfi.cfa.base || - state->cfa.offset != initial_func_cfi.cfa.offset || - state->stack_size != initial_func_cfi.cfa.offset || - state->drap) + if (state->cfa.base != initial_func_cfi.cfa.base || state->drap) + return true; + + if (state->cfa.offset != initial_func_cfi.cfa.offset + ret_offset) return true; - for (i = 0; i < CFI_NUM_REGS; i++) + if (state->stack_size != initial_func_cfi.cfa.offset + ret_offset) + return true; + + for (i = 0; i < CFI_NUM_REGS; i++) { if (state->regs[i].base != initial_func_cfi.regs[i].base || state->regs[i].offset != initial_func_cfi.regs[i].offset) return true; + } return false; } @@ -2014,7 +2022,7 @@ static int validate_call(struct instruction *insn, struct insn_state *state) static int validate_sibling_call(struct instruction *insn, struct insn_state *state) { - if 
(has_modified_stack_frame(state)) { + if (has_modified_stack_frame(insn, state)) { WARN_FUNC("sibling call from callable instruction with modified stack frame", insn->sec, insn->offset); return 1; @@ -2043,7 +2051,7 @@ static int validate_return(struct symbol *func, struct instruction *insn, struct return 1; } - if (func && has_modified_stack_frame(state)) { + if (func && has_modified_stack_frame(insn, state)) { WARN_FUNC("return with modified stack frame", insn->sec, insn->offset); return 1; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 2c55f7591443..81ce27e62c04 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -33,9 +33,11 @@ struct instruction { unsigned int len; enum insn_type type; unsigned long immediate; - bool alt_group, dead_end, ignore, hint, save, restore, ignore_alts; + bool alt_group, dead_end, ignore, ignore_alts; + bool hint, save, restore; bool retpoline_safe; u8 visited; + u8 ret_offset; struct symbol *call_dest; struct instruction *jump_dest; struct instruction *first_jump_src; -- cgit v1.2.3 From c536ed2fffd5dbf81fe2dede8ef294e0cbb08f72 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 1 Apr 2020 16:54:26 +0200 Subject: objtool: Remove SAVE/RESTORE hints The SAVE/RESTORE hints are now unused; remove them. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Reviewed-by: Alexandre Chartre Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200416115118.926738768@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/orc_types.h | 4 +--- arch/x86/include/asm/unwind_hints.h | 27 ---------------------- tools/arch/x86/include/asm/orc_types.h | 4 +--- tools/objtool/check.c | 42 +++------------------------------- tools/objtool/check.h | 2 +- 5 files changed, 6 insertions(+), 73 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/orc_types.h b/arch/x86/include/asm/orc_types.h index 5f18ca7ac51a..d25534940bde 100644 --- a/arch/x86/include/asm/orc_types.h +++ b/arch/x86/include/asm/orc_types.h @@ -58,9 +58,7 @@ #define ORC_TYPE_CALL 0 #define ORC_TYPE_REGS 1 #define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_SAVE 3 -#define UNWIND_HINT_TYPE_RESTORE 4 -#define UNWIND_HINT_TYPE_RET_OFFSET 5 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 #ifndef __ASSEMBLY__ /* diff --git a/arch/x86/include/asm/unwind_hints.h b/arch/x86/include/asm/unwind_hints.h index aabf7ace0476..7d903fdb3f43 100644 --- a/arch/x86/include/asm/unwind_hints.h +++ b/arch/x86/include/asm/unwind_hints.h @@ -86,15 +86,6 @@ UNWIND_HINT sp_offset=\sp_offset .endm -.macro UNWIND_HINT_SAVE - UNWIND_HINT type=UNWIND_HINT_TYPE_SAVE -.endm - -.macro UNWIND_HINT_RESTORE - UNWIND_HINT type=UNWIND_HINT_TYPE_RESTORE -.endm - - /* * RET_OFFSET: Used on instructions that terminate a function; mostly RETURN * and sibling calls. 
On these, sp_offset denotes the expected offset from @@ -104,24 +95,6 @@ UNWIND_HINT type=UNWIND_HINT_TYPE_RET_OFFSET sp_offset=\sp_offset .endm -#else /* !__ASSEMBLY__ */ - -#define UNWIND_HINT(sp_reg, sp_offset, type, end) \ - "987: \n\t" \ - ".pushsection .discard.unwind_hints\n\t" \ - /* struct unwind_hint */ \ - ".long 987b - .\n\t" \ - ".short " __stringify(sp_offset) "\n\t" \ - ".byte " __stringify(sp_reg) "\n\t" \ - ".byte " __stringify(type) "\n\t" \ - ".byte " __stringify(end) "\n\t" \ - ".balign 4 \n\t" \ - ".popsection\n\t" - -#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE, 0) - -#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE, 0) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_UNWIND_HINTS_H */ diff --git a/tools/arch/x86/include/asm/orc_types.h b/tools/arch/x86/include/asm/orc_types.h index 5f18ca7ac51a..d25534940bde 100644 --- a/tools/arch/x86/include/asm/orc_types.h +++ b/tools/arch/x86/include/asm/orc_types.h @@ -58,9 +58,7 @@ #define ORC_TYPE_CALL 0 #define ORC_TYPE_REGS 1 #define ORC_TYPE_REGS_IRET 2 -#define UNWIND_HINT_TYPE_SAVE 3 -#define UNWIND_HINT_TYPE_RESTORE 4 -#define UNWIND_HINT_TYPE_RET_OFFSET 5 +#define UNWIND_HINT_TYPE_RET_OFFSET 3 #ifndef __ASSEMBLY__ /* diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 93c88ac51f0f..464f10c0a5ac 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1254,15 +1254,7 @@ static int read_unwind_hints(struct objtool_file *file) cfa = &insn->state.cfa; - if (hint->type == UNWIND_HINT_TYPE_SAVE) { - insn->save = true; - continue; - - } else if (hint->type == UNWIND_HINT_TYPE_RESTORE) { - insn->restore = true; - insn->hint = true; - - } else if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) { + if (hint->type == UNWIND_HINT_TYPE_RET_OFFSET) { insn->ret_offset = hint->sp_offset; continue; } @@ -2113,37 +2105,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; } - if (insn->hint) { - if (insn->restore) { - struct instruction *save_insn, *i; - - i = insn; - save_insn = NULL; - sym_for_each_insn_continue_reverse(file, func, i) { - if (i->save) { - save_insn = i; - break; - } - } - - if (!save_insn) { - WARN_FUNC("no corresponding CFI save for CFI restore", - sec, insn->offset); - return 1; - } - - if (!save_insn->visited) { - WARN_FUNC("objtool isn't smart enough to handle this CFI save/restore combo", - sec, insn->offset); - return 1; - } - - insn->state = save_insn->state; - } - + if (insn->hint) state = insn->state; - - } else + else insn->state = state; insn->visited |= visited; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index 81ce27e62c04..7c30760bee74 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -34,7 +34,7 @@ struct instruction { enum insn_type type; unsigned long immediate; bool alt_group, dead_end, ignore, ignore_alts; - bool hint, save, restore; + bool hint; bool retpoline_safe; u8 visited; u8 ret_offset; -- cgit v1.2.3 From 2a89b674fd6834dacf2a6edfbdf5607c163dd36e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 19 Mar 2020 13:19:34 -0400 Subject: get rid of csum_partial_copy_to_user() For historical reasons some architectures call their csum_and_copy_to_user() csum_partial_copy_to_user() instead (and supply a macro defining the former as the latter). That's the last remnants of old experiment that went nowhere; time to bury them. Rename those to csum_and_copy_to_user() and get rid of the macros. 
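As a hedged illustration of the renamed interface (the wrapper function, buffer names and error convention below are invented for this example rather than taken from an in-tree caller), a call site now reads roughly:

	static int example_copy_and_csum(void __user *dst, const void *src,
					 int len, __wsum partial, __wsum *csump)
	{
		int err = 0;

		/* Copy len bytes to user space, folding them into 'partial'. */
		*csump = csum_and_copy_to_user(src, dst, len, partial, &err);
		if (err)
			return -EFAULT;
		return 0;
	}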
Signed-off-by: Al Viro --- arch/sparc/include/asm/checksum_32.h | 7 +++---- arch/x86/include/asm/checksum_64.h | 4 +--- arch/x86/lib/csum-wrappers_64.c | 6 +++--- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/sparc/include/asm/checksum_32.h b/arch/sparc/include/asm/checksum_32.h index 5fc98d80b03b..450ddfb444c8 100644 --- a/arch/sparc/include/asm/checksum_32.h +++ b/arch/sparc/include/asm/checksum_32.h @@ -83,8 +83,10 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, return (__force __wsum)ret; } +#define HAVE_CSUM_COPY_USER + static inline __wsum -csum_partial_copy_to_user(const void *src, void __user *dst, int len, +csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, int *err) { if (!access_ok(dst, len)) { @@ -113,9 +115,6 @@ csum_partial_copy_to_user(const void *src, void __user *dst, int len, } } -#define HAVE_CSUM_COPY_USER -#define csum_and_copy_to_user csum_partial_copy_to_user - /* ihl is always 5 or greater, almost always is 5, and iph is word aligned * the majority of the time. */ diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 3ec6d3267cf9..ac9c06494827 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -141,13 +141,11 @@ extern __visible __wsum csum_partial_copy_generic(const void *src, const void *d extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp); -extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst, +extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp); extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum); -/* Old names. To be removed. */ -#define csum_and_copy_to_user csum_partial_copy_to_user #define csum_and_copy_from_user csum_partial_copy_from_user /** diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index c66c8b00f236..875c2f5968a0 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -71,7 +71,7 @@ out_err: EXPORT_SYMBOL(csum_partial_copy_from_user); /** - * csum_partial_copy_to_user - Copy and checksum to user space. + * csum_and_copy_to_user - Copy and checksum to user space. * @src: source address * @dst: destination address (user space) * @len: number of bytes to be copied. @@ -82,7 +82,7 @@ EXPORT_SYMBOL(csum_partial_copy_from_user); * src and dst are best aligned to 64bits. */ __wsum -csum_partial_copy_to_user(const void *src, void __user *dst, +csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp) { __wsum ret; @@ -116,7 +116,7 @@ csum_partial_copy_to_user(const void *src, void __user *dst, clac(); return ret; } -EXPORT_SYMBOL(csum_partial_copy_to_user); +EXPORT_SYMBOL(csum_and_copy_to_user); /** * csum_partial_copy_nocheck - Copy and checksum. -- cgit v1.2.3 From 62d0fd591db1f9dcf68fb963b3a94af085a6b166 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 22 Apr 2020 01:13:55 +0900 Subject: arch: split MODULE_ARCH_VERMAGIC definitions out to As the bug report [1] pointed out, must be included after . I believe we should not impose any include order restriction. We often sort include directives alphabetically, but it is just coding style convention. Technically, we can include header files in any order by making every header self-contained. 
Currently, arch-specific MODULE_ARCH_VERMAGIC is defined in <asm/module.h>, which is not included from <linux/vermagic.h>. Hence, the straight-forward fix-up would be as follows: |--- a/include/linux/vermagic.h |+++ b/include/linux/vermagic.h |@@ -1,5 +1,6 @@ | /* SPDX-License-Identifier: GPL-2.0 */ | #include <generated/utsrelease.h> |+#include <asm/module.h> | | /* Simply sanity version stamp for modules. */ | #ifdef CONFIG_SMP This works enough, but for further cleanups, I split MODULE_ARCH_VERMAGIC definitions into <asm/vermagic.h>. With this, <linux/module.h> and <linux/vermagic.h> will be orthogonal, and the location of MODULE_ARCH_VERMAGIC definitions will be consistent. For arc and ia64, MODULE_PROC_FAMILY is only used for defining MODULE_ARCH_VERMAGIC. I squashed it. For hexagon, nds32, and xtensa, I removed <asm/module.h> entirely because they contained nothing but MODULE_ARCH_VERMAGIC definition. Kbuild will automatically generate <asm/vermagic.h> at build-time, wrapping <asm-generic/vermagic.h>. [1] https://lore.kernel.org/lkml/20200411155623.GA22175@zn.tnic Reported-by: Borislav Petkov Signed-off-by: Masahiro Yamada Acked-by: Jessica Yu --- arch/arc/include/asm/module.h | 5 --- arch/arc/include/asm/vermagic.h | 8 +++++ arch/arm/include/asm/module.h | 24 ------------- arch/arm/include/asm/vermagic.h | 31 +++++++++++++++++ arch/arm64/include/asm/module.h | 2 -- arch/arm64/include/asm/vermagic.h | 10 ++++++ arch/hexagon/include/asm/module.h | 13 ------- arch/hexagon/include/asm/vermagic.h | 13 +++++++ arch/ia64/include/asm/module.h | 4 --- arch/ia64/include/asm/vermagic.h | 15 ++++++++ arch/mips/include/asm/module.h | 61 --------------------------------- arch/mips/include/asm/vermagic.h | 66 +++++++++++++++++++++++++++++++++++ arch/nds32/include/asm/module.h | 11 ------ arch/nds32/include/asm/vermagic.h | 9 +++++ arch/powerpc/include/asm/module.h | 18 ---------- arch/powerpc/include/asm/vermagic.h | 20 +++++++++++ arch/riscv/include/asm/module.h | 2 -- arch/riscv/include/asm/vermagic.h | 9 +++++ arch/sh/include/asm/module.h | 28 --------------- arch/sh/include/asm/vermagic.h | 34 +++++++++++++++++++ arch/x86/include/asm/module.h | 60 -------------------------------- arch/x86/include/asm/vermagic.h | 68 +++++++++++++++++++++++++++++++++++++ arch/xtensa/include/asm/module.h | 20 ----------- arch/xtensa/include/asm/vermagic.h | 17 ++++++++++ include/asm-generic/Kbuild | 1 + include/asm-generic/vermagic.h | 7 ++++ include/linux/vermagic.h | 8 +++-- 27 files changed, 313 insertions(+), 251 deletions(-) create mode 100644 arch/arc/include/asm/vermagic.h create mode 100644 arch/arm/include/asm/vermagic.h create mode 100644 arch/arm64/include/asm/vermagic.h delete mode 100644 arch/hexagon/include/asm/module.h create mode 100644 arch/hexagon/include/asm/vermagic.h create mode 100644 arch/ia64/include/asm/vermagic.h create mode 100644 arch/mips/include/asm/vermagic.h delete mode 100644 arch/nds32/include/asm/module.h create mode 100644 arch/nds32/include/asm/vermagic.h create mode 100644 arch/powerpc/include/asm/vermagic.h create mode 100644 arch/riscv/include/asm/vermagic.h create mode 100644 arch/sh/include/asm/vermagic.h create mode 100644 arch/x86/include/asm/vermagic.h delete mode 100644 arch/xtensa/include/asm/module.h create mode 100644 arch/xtensa/include/asm/vermagic.h create mode 100644 include/asm-generic/vermagic.h (limited to 'arch/x86/include') diff --git a/arch/arc/include/asm/module.h b/arch/arc/include/asm/module.h index 48f13a4ace4b..f534a1fef070 100644 --- a/arch/arc/include/asm/module.h +++ b/arch/arc/include/asm/module.h @@ -3,7 +3,6 @@ * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. 
(www.synopsys.com) * * Amit Bhor, Sameer Dhavale: Codito Technologies 2004 - */ #ifndef _ASM_ARC_MODULE_H @@ -19,8 +18,4 @@ struct mod_arch_specific { const char *secstr; }; -#define MODULE_PROC_FAMILY "ARC700" - -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY - #endif /* _ASM_ARC_MODULE_H */ diff --git a/arch/arc/include/asm/vermagic.h b/arch/arc/include/asm/vermagic.h new file mode 100644 index 000000000000..a10257d2c62c --- /dev/null +++ b/arch/arc/include/asm/vermagic.h @@ -0,0 +1,8 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#define MODULE_ARCH_VERMAGIC "ARC700" + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index 182163b55546..4b0df09cbe67 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -37,30 +37,6 @@ struct mod_arch_specific { struct module; u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val); -/* - * Add the ARM architecture version to the version magic string - */ -#define MODULE_ARCH_VERMAGIC_ARMVSN "ARMv" __stringify(__LINUX_ARM_ARCH__) " " - -/* Add __virt_to_phys patching state as well */ -#ifdef CONFIG_ARM_PATCH_PHYS_VIRT -#define MODULE_ARCH_VERMAGIC_P2V "p2v8 " -#else -#define MODULE_ARCH_VERMAGIC_P2V "" -#endif - -/* Add instruction set architecture tag to distinguish ARM/Thumb kernels */ -#ifdef CONFIG_THUMB2_KERNEL -#define MODULE_ARCH_VERMAGIC_ARMTHUMB "thumb2 " -#else -#define MODULE_ARCH_VERMAGIC_ARMTHUMB "" -#endif - -#define MODULE_ARCH_VERMAGIC \ - MODULE_ARCH_VERMAGIC_ARMVSN \ - MODULE_ARCH_VERMAGIC_ARMTHUMB \ - MODULE_ARCH_VERMAGIC_P2V - #ifdef CONFIG_THUMB2_KERNEL #define HAVE_ARCH_KALLSYMS_SYMBOL_VALUE static inline unsigned long kallsyms_symbol_value(const Elf_Sym *sym) diff --git a/arch/arm/include/asm/vermagic.h b/arch/arm/include/asm/vermagic.h new file mode 100644 index 000000000000..62ce94e26a63 --- /dev/null +++ b/arch/arm/include/asm/vermagic.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#include + +/* + * Add the ARM architecture version to the version magic string + */ +#define MODULE_ARCH_VERMAGIC_ARMVSN "ARMv" __stringify(__LINUX_ARM_ARCH__) " " + +/* Add __virt_to_phys patching state as well */ +#ifdef CONFIG_ARM_PATCH_PHYS_VIRT +#define MODULE_ARCH_VERMAGIC_P2V "p2v8 " +#else +#define MODULE_ARCH_VERMAGIC_P2V "" +#endif + +/* Add instruction set architecture tag to distinguish ARM/Thumb kernels */ +#ifdef CONFIG_THUMB2_KERNEL +#define MODULE_ARCH_VERMAGIC_ARMTHUMB "thumb2 " +#else +#define MODULE_ARCH_VERMAGIC_ARMTHUMB "" +#endif + +#define MODULE_ARCH_VERMAGIC \ + MODULE_ARCH_VERMAGIC_ARMVSN \ + MODULE_ARCH_VERMAGIC_ARMTHUMB \ + MODULE_ARCH_VERMAGIC_P2V + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 1e93de68c044..4e7fa2623896 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -7,8 +7,6 @@ #include -#define MODULE_ARCH_VERMAGIC "aarch64" - #ifdef CONFIG_ARM64_MODULE_PLTS struct mod_plt_sec { int plt_shndx; diff --git a/arch/arm64/include/asm/vermagic.h b/arch/arm64/include/asm/vermagic.h new file mode 100644 index 000000000000..a1eec6a000f1 --- /dev/null +++ b/arch/arm64/include/asm/vermagic.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2012 ARM Ltd. 
+ */ +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#define MODULE_ARCH_VERMAGIC "aarch64" + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/hexagon/include/asm/module.h b/arch/hexagon/include/asm/module.h deleted file mode 100644 index e8de4fe03543..000000000000 --- a/arch/hexagon/include/asm/module.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. - */ - -#ifndef _ASM_MODULE_H -#define _ASM_MODULE_H - -#include - -#define MODULE_ARCH_VERMAGIC __stringify(PROCESSOR_MODEL_NAME) " " - -#endif diff --git a/arch/hexagon/include/asm/vermagic.h b/arch/hexagon/include/asm/vermagic.h new file mode 100644 index 000000000000..0e8dedc8c486 --- /dev/null +++ b/arch/hexagon/include/asm/vermagic.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2010-2011, The Linux Foundation. All rights reserved. + */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#include + +#define MODULE_ARCH_VERMAGIC __stringify(PROCESSOR_MODEL_NAME) " " + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h index f319144260ce..5a29652e6def 100644 --- a/arch/ia64/include/asm/module.h +++ b/arch/ia64/include/asm/module.h @@ -26,10 +26,6 @@ struct mod_arch_specific { unsigned int next_got_entry; /* index of next available got entry */ }; -#define MODULE_PROC_FAMILY "ia64" -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY \ - "gcc-" __stringify(__GNUC__) "." __stringify(__GNUC_MINOR__) - #define ARCH_SHF_SMALL SHF_IA_64_SHORT #endif /* _ASM_IA64_MODULE_H */ diff --git a/arch/ia64/include/asm/vermagic.h b/arch/ia64/include/asm/vermagic.h new file mode 100644 index 000000000000..29c7424f4c25 --- /dev/null +++ b/arch/ia64/include/asm/vermagic.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2003 Hewlett-Packard Co + * David Mosberger-Tang + */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#include + +#define MODULE_ARCH_VERMAGIC "ia64" \ + "gcc-" __stringify(__GNUC__) "." 
__stringify(__GNUC_MINOR__) + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/mips/include/asm/module.h b/arch/mips/include/asm/module.h index 9846047b3d3d..724a0882576b 100644 --- a/arch/mips/include/asm/module.h +++ b/arch/mips/include/asm/module.h @@ -83,65 +83,4 @@ search_module_dbetables(unsigned long addr) } #endif -#ifdef CONFIG_CPU_BMIPS -#define MODULE_PROC_FAMILY "BMIPS " -#elif defined CONFIG_CPU_MIPS32_R1 -#define MODULE_PROC_FAMILY "MIPS32_R1 " -#elif defined CONFIG_CPU_MIPS32_R2 -#define MODULE_PROC_FAMILY "MIPS32_R2 " -#elif defined CONFIG_CPU_MIPS32_R6 -#define MODULE_PROC_FAMILY "MIPS32_R6 " -#elif defined CONFIG_CPU_MIPS64_R1 -#define MODULE_PROC_FAMILY "MIPS64_R1 " -#elif defined CONFIG_CPU_MIPS64_R2 -#define MODULE_PROC_FAMILY "MIPS64_R2 " -#elif defined CONFIG_CPU_MIPS64_R6 -#define MODULE_PROC_FAMILY "MIPS64_R6 " -#elif defined CONFIG_CPU_R3000 -#define MODULE_PROC_FAMILY "R3000 " -#elif defined CONFIG_CPU_TX39XX -#define MODULE_PROC_FAMILY "TX39XX " -#elif defined CONFIG_CPU_VR41XX -#define MODULE_PROC_FAMILY "VR41XX " -#elif defined CONFIG_CPU_R4X00 -#define MODULE_PROC_FAMILY "R4X00 " -#elif defined CONFIG_CPU_TX49XX -#define MODULE_PROC_FAMILY "TX49XX " -#elif defined CONFIG_CPU_R5000 -#define MODULE_PROC_FAMILY "R5000 " -#elif defined CONFIG_CPU_R5500 -#define MODULE_PROC_FAMILY "R5500 " -#elif defined CONFIG_CPU_NEVADA -#define MODULE_PROC_FAMILY "NEVADA " -#elif defined CONFIG_CPU_R10000 -#define MODULE_PROC_FAMILY "R10000 " -#elif defined CONFIG_CPU_RM7000 -#define MODULE_PROC_FAMILY "RM7000 " -#elif defined CONFIG_CPU_SB1 -#define MODULE_PROC_FAMILY "SB1 " -#elif defined CONFIG_CPU_LOONGSON32 -#define MODULE_PROC_FAMILY "LOONGSON32 " -#elif defined CONFIG_CPU_LOONGSON2EF -#define MODULE_PROC_FAMILY "LOONGSON2EF " -#elif defined CONFIG_CPU_LOONGSON64 -#define MODULE_PROC_FAMILY "LOONGSON64 " -#elif defined CONFIG_CPU_CAVIUM_OCTEON -#define MODULE_PROC_FAMILY "OCTEON " -#elif defined CONFIG_CPU_XLR -#define MODULE_PROC_FAMILY "XLR " -#elif defined CONFIG_CPU_XLP -#define MODULE_PROC_FAMILY "XLP " -#else -#error MODULE_PROC_FAMILY undefined for your processor configuration -#endif - -#ifdef CONFIG_32BIT -#define MODULE_KERNEL_TYPE "32BIT " -#elif defined CONFIG_64BIT -#define MODULE_KERNEL_TYPE "64BIT " -#endif - -#define MODULE_ARCH_VERMAGIC \ - MODULE_PROC_FAMILY MODULE_KERNEL_TYPE - #endif /* _ASM_MODULE_H */ diff --git a/arch/mips/include/asm/vermagic.h b/arch/mips/include/asm/vermagic.h new file mode 100644 index 000000000000..24dc3d35161c --- /dev/null +++ b/arch/mips/include/asm/vermagic.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#ifdef CONFIG_CPU_BMIPS +#define MODULE_PROC_FAMILY "BMIPS " +#elif defined CONFIG_CPU_MIPS32_R1 +#define MODULE_PROC_FAMILY "MIPS32_R1 " +#elif defined CONFIG_CPU_MIPS32_R2 +#define MODULE_PROC_FAMILY "MIPS32_R2 " +#elif defined CONFIG_CPU_MIPS32_R6 +#define MODULE_PROC_FAMILY "MIPS32_R6 " +#elif defined CONFIG_CPU_MIPS64_R1 +#define MODULE_PROC_FAMILY "MIPS64_R1 " +#elif defined CONFIG_CPU_MIPS64_R2 +#define MODULE_PROC_FAMILY "MIPS64_R2 " +#elif defined CONFIG_CPU_MIPS64_R6 +#define MODULE_PROC_FAMILY "MIPS64_R6 " +#elif defined CONFIG_CPU_R3000 +#define MODULE_PROC_FAMILY "R3000 " +#elif defined CONFIG_CPU_TX39XX +#define MODULE_PROC_FAMILY "TX39XX " +#elif defined CONFIG_CPU_VR41XX +#define MODULE_PROC_FAMILY "VR41XX " +#elif defined CONFIG_CPU_R4X00 +#define MODULE_PROC_FAMILY "R4X00 " +#elif defined CONFIG_CPU_TX49XX +#define MODULE_PROC_FAMILY "TX49XX " 
+#elif defined CONFIG_CPU_R5000 +#define MODULE_PROC_FAMILY "R5000 " +#elif defined CONFIG_CPU_R5500 +#define MODULE_PROC_FAMILY "R5500 " +#elif defined CONFIG_CPU_NEVADA +#define MODULE_PROC_FAMILY "NEVADA " +#elif defined CONFIG_CPU_R10000 +#define MODULE_PROC_FAMILY "R10000 " +#elif defined CONFIG_CPU_RM7000 +#define MODULE_PROC_FAMILY "RM7000 " +#elif defined CONFIG_CPU_SB1 +#define MODULE_PROC_FAMILY "SB1 " +#elif defined CONFIG_CPU_LOONGSON32 +#define MODULE_PROC_FAMILY "LOONGSON32 " +#elif defined CONFIG_CPU_LOONGSON2EF +#define MODULE_PROC_FAMILY "LOONGSON2EF " +#elif defined CONFIG_CPU_LOONGSON64 +#define MODULE_PROC_FAMILY "LOONGSON64 " +#elif defined CONFIG_CPU_CAVIUM_OCTEON +#define MODULE_PROC_FAMILY "OCTEON " +#elif defined CONFIG_CPU_XLR +#define MODULE_PROC_FAMILY "XLR " +#elif defined CONFIG_CPU_XLP +#define MODULE_PROC_FAMILY "XLP " +#else +#error MODULE_PROC_FAMILY undefined for your processor configuration +#endif + +#ifdef CONFIG_32BIT +#define MODULE_KERNEL_TYPE "32BIT " +#elif defined CONFIG_64BIT +#define MODULE_KERNEL_TYPE "64BIT " +#endif + +#define MODULE_ARCH_VERMAGIC \ + MODULE_PROC_FAMILY MODULE_KERNEL_TYPE + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/nds32/include/asm/module.h b/arch/nds32/include/asm/module.h deleted file mode 100644 index a3a08e993c65..000000000000 --- a/arch/nds32/include/asm/module.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -// Copyright (C) 2005-2017 Andes Technology Corporation - -#ifndef _ASM_NDS32_MODULE_H -#define _ASM_NDS32_MODULE_H - -#include - -#define MODULE_ARCH_VERMAGIC "NDS32v3" - -#endif /* _ASM_NDS32_MODULE_H */ diff --git a/arch/nds32/include/asm/vermagic.h b/arch/nds32/include/asm/vermagic.h new file mode 100644 index 000000000000..f772e7ba33f1 --- /dev/null +++ b/arch/nds32/include/asm/vermagic.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +// Copyright (C) 2005-2017 Andes Technology Corporation + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#define MODULE_ARCH_VERMAGIC "NDS32v3" + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h index 356658711a86..5398bfc465b4 100644 --- a/arch/powerpc/include/asm/module.h +++ b/arch/powerpc/include/asm/module.h @@ -3,28 +3,10 @@ #define _ASM_POWERPC_MODULE_H #ifdef __KERNEL__ -/* - */ - #include #include #include - -#ifdef CONFIG_MPROFILE_KERNEL -#define MODULE_ARCH_VERMAGIC_FTRACE "mprofile-kernel " -#else -#define MODULE_ARCH_VERMAGIC_FTRACE "" -#endif - -#ifdef CONFIG_RELOCATABLE -#define MODULE_ARCH_VERMAGIC_RELOCATABLE "relocatable " -#else -#define MODULE_ARCH_VERMAGIC_RELOCATABLE "" -#endif - -#define MODULE_ARCH_VERMAGIC MODULE_ARCH_VERMAGIC_FTRACE MODULE_ARCH_VERMAGIC_RELOCATABLE - #ifndef __powerpc64__ /* * Thanks to Paul M for explaining this. 
diff --git a/arch/powerpc/include/asm/vermagic.h b/arch/powerpc/include/asm/vermagic.h new file mode 100644 index 000000000000..b054a8576e5d --- /dev/null +++ b/arch/powerpc/include/asm/vermagic.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#ifdef CONFIG_MPROFILE_KERNEL +#define MODULE_ARCH_VERMAGIC_FTRACE "mprofile-kernel " +#else +#define MODULE_ARCH_VERMAGIC_FTRACE "" +#endif + +#ifdef CONFIG_RELOCATABLE +#define MODULE_ARCH_VERMAGIC_RELOCATABLE "relocatable " +#else +#define MODULE_ARCH_VERMAGIC_RELOCATABLE "" +#endif + +#define MODULE_ARCH_VERMAGIC \ + MODULE_ARCH_VERMAGIC_FTRACE MODULE_ARCH_VERMAGIC_RELOCATABLE + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/riscv/include/asm/module.h b/arch/riscv/include/asm/module.h index 46202dad365d..76aa96a9fc08 100644 --- a/arch/riscv/include/asm/module.h +++ b/arch/riscv/include/asm/module.h @@ -6,8 +6,6 @@ #include -#define MODULE_ARCH_VERMAGIC "riscv" - struct module; unsigned long module_emit_got_entry(struct module *mod, unsigned long val); unsigned long module_emit_plt_entry(struct module *mod, unsigned long val); diff --git a/arch/riscv/include/asm/vermagic.h b/arch/riscv/include/asm/vermagic.h new file mode 100644 index 000000000000..7b9441a57466 --- /dev/null +++ b/arch/riscv/include/asm/vermagic.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright (C) 2017 Andes Technology Corporation */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#define MODULE_ARCH_VERMAGIC "riscv" + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/sh/include/asm/module.h b/arch/sh/include/asm/module.h index 9f38fb35fe96..337663a028db 100644 --- a/arch/sh/include/asm/module.h +++ b/arch/sh/include/asm/module.h @@ -11,32 +11,4 @@ struct mod_arch_specific { }; #endif -#ifdef CONFIG_CPU_LITTLE_ENDIAN -# ifdef CONFIG_CPU_SH2 -# define MODULE_PROC_FAMILY "SH2LE " -# elif defined CONFIG_CPU_SH3 -# define MODULE_PROC_FAMILY "SH3LE " -# elif defined CONFIG_CPU_SH4 -# define MODULE_PROC_FAMILY "SH4LE " -# elif defined CONFIG_CPU_SH5 -# define MODULE_PROC_FAMILY "SH5LE " -# else -# error unknown processor family -# endif -#else -# ifdef CONFIG_CPU_SH2 -# define MODULE_PROC_FAMILY "SH2BE " -# elif defined CONFIG_CPU_SH3 -# define MODULE_PROC_FAMILY "SH3BE " -# elif defined CONFIG_CPU_SH4 -# define MODULE_PROC_FAMILY "SH4BE " -# elif defined CONFIG_CPU_SH5 -# define MODULE_PROC_FAMILY "SH5BE " -# else -# error unknown processor family -# endif -#endif - -#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY - #endif /* _ASM_SH_MODULE_H */ diff --git a/arch/sh/include/asm/vermagic.h b/arch/sh/include/asm/vermagic.h new file mode 100644 index 000000000000..13d8eaa9188e --- /dev/null +++ b/arch/sh/include/asm/vermagic.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#ifdef CONFIG_CPU_LITTLE_ENDIAN +# ifdef CONFIG_CPU_SH2 +# define MODULE_PROC_FAMILY "SH2LE " +# elif defined CONFIG_CPU_SH3 +# define MODULE_PROC_FAMILY "SH3LE " +# elif defined CONFIG_CPU_SH4 +# define MODULE_PROC_FAMILY "SH4LE " +# elif defined CONFIG_CPU_SH5 +# define MODULE_PROC_FAMILY "SH5LE " +# else +# error unknown processor family +# endif +#else +# ifdef CONFIG_CPU_SH2 +# define MODULE_PROC_FAMILY "SH2BE " +# elif defined CONFIG_CPU_SH3 +# define MODULE_PROC_FAMILY "SH3BE " +# elif defined CONFIG_CPU_SH4 +# define MODULE_PROC_FAMILY "SH4BE " +# elif defined CONFIG_CPU_SH5 +# define MODULE_PROC_FAMILY "SH5BE " +# else +# error unknown processor family +# 
endif +#endif + +#define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index c215d2762488..e988bac0a4a1 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -13,64 +13,4 @@ struct mod_arch_specific { #endif }; -#ifdef CONFIG_X86_64 -/* X86_64 does not define MODULE_PROC_FAMILY */ -#elif defined CONFIG_M486SX -#define MODULE_PROC_FAMILY "486SX " -#elif defined CONFIG_M486 -#define MODULE_PROC_FAMILY "486 " -#elif defined CONFIG_M586 -#define MODULE_PROC_FAMILY "586 " -#elif defined CONFIG_M586TSC -#define MODULE_PROC_FAMILY "586TSC " -#elif defined CONFIG_M586MMX -#define MODULE_PROC_FAMILY "586MMX " -#elif defined CONFIG_MCORE2 -#define MODULE_PROC_FAMILY "CORE2 " -#elif defined CONFIG_MATOM -#define MODULE_PROC_FAMILY "ATOM " -#elif defined CONFIG_M686 -#define MODULE_PROC_FAMILY "686 " -#elif defined CONFIG_MPENTIUMII -#define MODULE_PROC_FAMILY "PENTIUMII " -#elif defined CONFIG_MPENTIUMIII -#define MODULE_PROC_FAMILY "PENTIUMIII " -#elif defined CONFIG_MPENTIUMM -#define MODULE_PROC_FAMILY "PENTIUMM " -#elif defined CONFIG_MPENTIUM4 -#define MODULE_PROC_FAMILY "PENTIUM4 " -#elif defined CONFIG_MK6 -#define MODULE_PROC_FAMILY "K6 " -#elif defined CONFIG_MK7 -#define MODULE_PROC_FAMILY "K7 " -#elif defined CONFIG_MK8 -#define MODULE_PROC_FAMILY "K8 " -#elif defined CONFIG_MELAN -#define MODULE_PROC_FAMILY "ELAN " -#elif defined CONFIG_MCRUSOE -#define MODULE_PROC_FAMILY "CRUSOE " -#elif defined CONFIG_MEFFICEON -#define MODULE_PROC_FAMILY "EFFICEON " -#elif defined CONFIG_MWINCHIPC6 -#define MODULE_PROC_FAMILY "WINCHIPC6 " -#elif defined CONFIG_MWINCHIP3D -#define MODULE_PROC_FAMILY "WINCHIP3D " -#elif defined CONFIG_MCYRIXIII -#define MODULE_PROC_FAMILY "CYRIXIII " -#elif defined CONFIG_MVIAC3_2 -#define MODULE_PROC_FAMILY "VIAC3-2 " -#elif defined CONFIG_MVIAC7 -#define MODULE_PROC_FAMILY "VIAC7 " -#elif defined CONFIG_MGEODEGX1 -#define MODULE_PROC_FAMILY "GEODEGX1 " -#elif defined CONFIG_MGEODE_LX -#define MODULE_PROC_FAMILY "GEODE " -#else -#error unknown processor family -#endif - -#ifdef CONFIG_X86_32 -# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY -#endif - #endif /* _ASM_X86_MODULE_H */ diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h new file mode 100644 index 000000000000..75884d2cdec3 --- /dev/null +++ b/arch/x86/include/asm/vermagic.h @@ -0,0 +1,68 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#ifdef CONFIG_X86_64 +/* X86_64 does not define MODULE_PROC_FAMILY */ +#elif defined CONFIG_M486SX +#define MODULE_PROC_FAMILY "486SX " +#elif defined CONFIG_M486 +#define MODULE_PROC_FAMILY "486 " +#elif defined CONFIG_M586 +#define MODULE_PROC_FAMILY "586 " +#elif defined CONFIG_M586TSC +#define MODULE_PROC_FAMILY "586TSC " +#elif defined CONFIG_M586MMX +#define MODULE_PROC_FAMILY "586MMX " +#elif defined CONFIG_MCORE2 +#define MODULE_PROC_FAMILY "CORE2 " +#elif defined CONFIG_MATOM +#define MODULE_PROC_FAMILY "ATOM " +#elif defined CONFIG_M686 +#define MODULE_PROC_FAMILY "686 " +#elif defined CONFIG_MPENTIUMII +#define MODULE_PROC_FAMILY "PENTIUMII " +#elif defined CONFIG_MPENTIUMIII +#define MODULE_PROC_FAMILY "PENTIUMIII " +#elif defined CONFIG_MPENTIUMM +#define MODULE_PROC_FAMILY "PENTIUMM " +#elif defined CONFIG_MPENTIUM4 +#define MODULE_PROC_FAMILY "PENTIUM4 " +#elif defined CONFIG_MK6 +#define MODULE_PROC_FAMILY "K6 " +#elif defined CONFIG_MK7 +#define 
MODULE_PROC_FAMILY "K7 " +#elif defined CONFIG_MK8 +#define MODULE_PROC_FAMILY "K8 " +#elif defined CONFIG_MELAN +#define MODULE_PROC_FAMILY "ELAN " +#elif defined CONFIG_MCRUSOE +#define MODULE_PROC_FAMILY "CRUSOE " +#elif defined CONFIG_MEFFICEON +#define MODULE_PROC_FAMILY "EFFICEON " +#elif defined CONFIG_MWINCHIPC6 +#define MODULE_PROC_FAMILY "WINCHIPC6 " +#elif defined CONFIG_MWINCHIP3D +#define MODULE_PROC_FAMILY "WINCHIP3D " +#elif defined CONFIG_MCYRIXIII +#define MODULE_PROC_FAMILY "CYRIXIII " +#elif defined CONFIG_MVIAC3_2 +#define MODULE_PROC_FAMILY "VIAC3-2 " +#elif defined CONFIG_MVIAC7 +#define MODULE_PROC_FAMILY "VIAC7 " +#elif defined CONFIG_MGEODEGX1 +#define MODULE_PROC_FAMILY "GEODEGX1 " +#elif defined CONFIG_MGEODE_LX +#define MODULE_PROC_FAMILY "GEODE " +#else +#error unknown processor family +#endif + +#ifdef CONFIG_X86_32 +# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY +#else +# define MODULE_ARCH_VERMAGIC "" +#endif + +#endif /* _ASM_VERMAGIC_H */ diff --git a/arch/xtensa/include/asm/module.h b/arch/xtensa/include/asm/module.h deleted file mode 100644 index 488b40c6f9b9..000000000000 --- a/arch/xtensa/include/asm/module.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * include/asm-xtensa/module.h - * - * This file contains the module code specific to the Xtensa architecture. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ - -#ifndef _XTENSA_MODULE_H -#define _XTENSA_MODULE_H - -#define MODULE_ARCH_VERMAGIC "xtensa-" __stringify(XCHAL_CORE_ID) " " - -#include - -#endif /* _XTENSA_MODULE_H */ diff --git a/arch/xtensa/include/asm/vermagic.h b/arch/xtensa/include/asm/vermagic.h new file mode 100644 index 000000000000..6d9c670e4ba9 --- /dev/null +++ b/arch/xtensa/include/asm/vermagic.h @@ -0,0 +1,17 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2001 - 2005 Tensilica Inc. + */ + +#ifndef _ASM_VERMAGIC_H +#define _ASM_VERMAGIC_H + +#include +#include + +#define MODULE_ARCH_VERMAGIC "xtensa-" __stringify(XCHAL_CORE_ID) " " + +#endif /* _ASM_VERMAGIC_H */ diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index 36341dfded70..44ec80e70518 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -56,6 +56,7 @@ mandatory-y += topology.h mandatory-y += trace_clock.h mandatory-y += uaccess.h mandatory-y += unaligned.h +mandatory-y += vermagic.h mandatory-y += vga.h mandatory-y += word-at-a-time.h mandatory-y += xor.h diff --git a/include/asm-generic/vermagic.h b/include/asm-generic/vermagic.h new file mode 100644 index 000000000000..084274a1219e --- /dev/null +++ b/include/asm-generic/vermagic.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _ASM_GENERIC_VERMAGIC_H +#define _ASM_GENERIC_VERMAGIC_H + +#define MODULE_ARCH_VERMAGIC "" + +#endif /* _ASM_GENERIC_VERMAGIC_H */ diff --git a/include/linux/vermagic.h b/include/linux/vermagic.h index 9aced11e9000..dc236577b92f 100644 --- a/include/linux/vermagic.h +++ b/include/linux/vermagic.h @@ -1,5 +1,9 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_VERMAGIC_H +#define _LINUX_VERMAGIC_H + #include +#include /* Simply sanity version stamp for modules. 
*/ #ifdef CONFIG_SMP @@ -24,9 +28,6 @@ #else #define MODULE_VERMAGIC_MODVERSIONS "" #endif -#ifndef MODULE_ARCH_VERMAGIC -#define MODULE_ARCH_VERMAGIC "" -#endif #ifdef RANDSTRUCT_PLUGIN #include #define MODULE_RANDSTRUCT_PLUGIN "RANDSTRUCT_PLUGIN_" RANDSTRUCT_HASHED_SEED @@ -41,3 +42,4 @@ MODULE_ARCH_VERMAGIC \ MODULE_RANDSTRUCT_PLUGIN +#endif /* _LINUX_VERMAGIC_H */ -- cgit v1.2.3 From d073569363d9f076a568ce8c31250d332ccf33ce Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Apr 2020 17:27:44 +0200 Subject: x86/mm: Cleanup pgprot_4k_2_large() and pgprot_large_2_4k() Make use of lower level helpers that operate on the raw protection values to make the code a little easier to understand, and to also avoid extra conversions in a few callers. [ Qian: Fix a wrongly placed bracket in the original submission. Reported and fixed by Qian Cai . Details in second Link: below. ] Signed-off-by: Christoph Hellwig Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200408152745.1565832-4-hch@lst.de Link: https://lkml.kernel.org/r/1ED37D02-125F-4919-861A-371981581D9E@lca.pw --- arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++------------- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pgtable.c | 8 ++------ 3 files changed, 16 insertions(+), 20 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 75fe903124f8..a3b78d84b26a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -488,24 +488,24 @@ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) { return __pgprot(cachemode2protval(pcm)); } -static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) +static inline unsigned long protval_4k_2_large(unsigned long val) { - pgprotval_t val = pgprot_val(pgprot); - pgprot_t new; - - pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); - return new; +} +static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) +{ + return __pgprot(protval_4k_2_large(pgprot_val(pgprot))); +} +static inline unsigned long protval_large_2_4k(unsigned long val) +{ + return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | + ((val & _PAGE_PAT_LARGE) >> + (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); } static inline pgprot_t pgprot_large_2_4k(pgprot_t pgprot) { - pgprotval_t val = pgprot_val(pgprot); - pgprot_t new; - - pgprot_val(new) = (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | - ((val & _PAGE_PAT_LARGE) >> - (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); - return new; + return __pgprot(protval_large_2_4k(pgprot_val(pgprot))); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3b289c2f75cd..9a497ba02440 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -367,7 +367,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, pgprot_t prot; pgprot_val(prot) = pgprot_val(PAGE_KERNEL_LARGE) | - pgprot_val(pgprot_4k_2_large(cachemode2pgprot(cache))); + protval_4k_2_large(cachemode2protval(cache)); BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK)); for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7bd2c3a52297..c54d1d0a8e3b 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -706,11 +706,9 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot) if 
(pud_present(*pud) && !pud_huge(*pud)) return 0; - prot = pgprot_4k_2_large(prot); - set_pte((pte_t *)pud, pfn_pte( (u64)addr >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } @@ -738,11 +736,9 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) if (pmd_present(*pmd) && !pmd_huge(*pmd)) return 0; - prot = pgprot_4k_2_large(prot); - set_pte((pte_t *)pmd, pfn_pte( (u64)addr >> PAGE_SHIFT, - __pgprot(pgprot_val(prot) | _PAGE_PSE))); + __pgprot(protval_4k_2_large(pgprot_val(prot)) | _PAGE_PSE))); return 1; } -- cgit v1.2.3 From de17a37896e1ad9e17ebd5274a50c33e18c9cb90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 8 Apr 2020 17:27:45 +0200 Subject: x86/mm: Unexport __cachemode2pte_tbl Exporting the raw data for a table is generally a bad idea. Move cachemode2protval() out of line given that it isn't really used in the fast path, and then mark __cachemode2pte_tbl static. Signed-off-by: Christoph Hellwig Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200408152745.1565832-5-hch@lst.de --- arch/x86/include/asm/pgtable_types.h | 14 ++------------ arch/x86/mm/init.c | 11 +++++++++-- arch/x86/mm/pat/set_memory.c | 5 +++++ 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a3b78d84b26a..567abdbd64d3 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -467,8 +467,6 @@ static inline pteval_t pte_flags(pte_t pte) return native_pte_val(pte) & PTE_FLAGS_MASK; } -extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; - #define __pte2cm_idx(cb) \ ((((cb) >> (_PAGE_BIT_PAT - 2)) & 4) | \ (((cb) >> (_PAGE_BIT_PCD - 1)) & 2) | \ @@ -478,16 +476,8 @@ extern uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM]; (((i) & 2) << (_PAGE_BIT_PCD - 1)) | \ (((i) & 1) << _PAGE_BIT_PWT)) -static inline unsigned long cachemode2protval(enum page_cache_mode pcm) -{ - if (likely(pcm == 0)) - return 0; - return __cachemode2pte_tbl[pcm]; -} -static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) -{ - return __pgprot(cachemode2protval(pcm)); -} +unsigned long cachemode2protval(enum page_cache_mode pcm); + static inline unsigned long protval_4k_2_large(unsigned long val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 4a55d687c246..71720dd8f28a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -49,7 +49,7 @@ * Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2. 
*/ -uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { +static uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WB ] = 0 | 0 , [_PAGE_CACHE_MODE_WC ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_UC_MINUS] = 0 | _PAGE_PCD, @@ -57,7 +57,14 @@ uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = { [_PAGE_CACHE_MODE_WT ] = 0 | _PAGE_PCD, [_PAGE_CACHE_MODE_WP ] = 0 | _PAGE_PCD, }; -EXPORT_SYMBOL(__cachemode2pte_tbl); + +unsigned long cachemode2protval(enum page_cache_mode pcm) +{ + if (likely(pcm == 0)) + return 0; + return __cachemode2pte_tbl[pcm]; +} +EXPORT_SYMBOL(cachemode2protval); static uint8_t __pte2cachemode_tbl[8] = { [__pte2cm_idx( 0 | 0 | 0 )] = _PAGE_CACHE_MODE_WB, diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 59eca6a94ce7..a28f0c345303 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -68,6 +68,11 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_PAGES_ARRAY 4 #define CPA_NO_CHECK_ALIAS 8 /* Do not search for aliases */ +static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm) +{ + return __pgprot(cachemode2protval(pcm)); +} + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; -- cgit v1.2.3 From 325518e9b743686f471e7a4ef617b57c91386795 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 22 Apr 2020 18:53:08 +0200 Subject: x86/mm: Use pgprotval_t in protval_4k_2_large() and protval_large_2_4k() Use the proper type for "raw" page table values. Signed-off-by: Christoph Hellwig Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200422170116.GA28345@lst.de --- arch/x86/include/asm/pgtable_types.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 567abdbd64d3..7b6ddcf77d70 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -478,7 +478,7 @@ static inline pteval_t pte_flags(pte_t pte) unsigned long cachemode2protval(enum page_cache_mode pcm); -static inline unsigned long protval_4k_2_large(unsigned long val) +static inline pgprotval_t protval_4k_2_large(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT) << (_PAGE_BIT_PAT_LARGE - _PAGE_BIT_PAT)); @@ -487,7 +487,7 @@ static inline pgprot_t pgprot_4k_2_large(pgprot_t pgprot) { return __pgprot(protval_4k_2_large(pgprot_val(pgprot))); } -static inline unsigned long protval_large_2_4k(unsigned long val) +static inline pgprotval_t protval_large_2_4k(pgprotval_t val) { return (val & ~(_PAGE_PAT | _PAGE_PAT_LARGE)) | ((val & _PAGE_PAT_LARGE) >> -- cgit v1.2.3 From 33b22172452f05c351fd2fa24c28d2e76c7b0692 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 17 Apr 2020 10:24:18 -0400 Subject: KVM: x86: move nested-related kvm_x86_ops to a separate struct Clean up some of the patching of kvm_x86_ops, by moving kvm_x86_ops related to nested virtualization into a separate struct. As a result, these ops will always be non-NULL on VMX. This is not a problem: * check_nested_events is only called if is_guest_mode(vcpu) returns true * get_nested_state treats VMXOFF state the same as nested being disabled * set_nested_state fails if you attempt to set nested state while nesting is disabled * nested_enable_evmcs could already be called on a CPU without VMX enabled in CPUID. 
* nested_get_evmcs_version was fixed in the previous patch Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 29 ++++++++++++++++------------- arch/x86/kvm/hyperv.c | 4 ++-- arch/x86/kvm/svm/nested.c | 6 +++++- arch/x86/kvm/svm/svm.c | 13 +++++-------- arch/x86/kvm/svm/svm.h | 3 ++- arch/x86/kvm/vmx/nested.c | 16 +++++++++------- arch/x86/kvm/vmx/nested.h | 2 ++ arch/x86/kvm/vmx/vmx.c | 7 +------ arch/x86/kvm/x86.c | 28 ++++++++++++++-------------- 9 files changed, 56 insertions(+), 52 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f26df2cb0591..a239a297be33 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1178,7 +1178,6 @@ struct kvm_x86_ops { struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); - int (*check_nested_events)(struct kvm_vcpu *vcpu); void (*request_immediate_exit)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *kvm, int cpu); @@ -1211,6 +1210,7 @@ struct kvm_x86_ops { /* pmu operations of sub-arch */ const struct kvm_pmu_ops *pmu_ops; + const struct kvm_x86_nested_ops *nested_ops; /* * Architecture specific hooks for vCPU blocking due to @@ -1238,14 +1238,6 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); - int (*get_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - unsigned user_data_size); - int (*set_nested_state)(struct kvm_vcpu *vcpu, - struct kvm_nested_state __user *user_kvm_nested_state, - struct kvm_nested_state *kvm_state); - bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); - int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); @@ -1257,16 +1249,27 @@ struct kvm_x86_ops { int (*get_msr_feature)(struct kvm_msr_entry *entry); - int (*nested_enable_evmcs)(struct kvm_vcpu *vcpu, - uint16_t *vmcs_version); - uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); }; +struct kvm_x86_nested_ops { + int (*check_events)(struct kvm_vcpu *vcpu); + int (*get_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + unsigned user_data_size); + int (*set_state)(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state); + bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu); + + int (*enable_evmcs)(struct kvm_vcpu *vcpu, + uint16_t *vmcs_version); + uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); +}; + struct kvm_x86_init_ops { int (*cpu_has_kvm_support)(void); int (*disabled_by_bios)(void); diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index b850f676abe4..2f96ff9e60ee 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1799,8 +1799,8 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, }; int i, nent = ARRAY_SIZE(cpuid_entries); - if (kvm_x86_ops.nested_get_evmcs_version) - evmcs_ver = kvm_x86_ops.nested_get_evmcs_version(vcpu); + if (kvm_x86_ops.nested_ops->get_evmcs_version) + evmcs_ver = kvm_x86_ops.nested_ops->get_evmcs_version(vcpu); /* Skip NESTED_FEATURES if eVMCS is not supported */ if (!evmcs_ver) diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 3e5bd739a6f6..6ea047e6882e 
100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -784,7 +784,7 @@ static bool nested_exit_on_intr(struct vcpu_svm *svm) return (svm->nested.intercept & 1ULL); } -int svm_check_nested_events(struct kvm_vcpu *vcpu) +static int svm_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); bool block_nested_events = @@ -825,3 +825,7 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_CONTINUE; } + +struct kvm_x86_nested_ops svm_nested_ops = { + .check_events = svm_check_nested_events, +}; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index eb95283ab68d..c86f7278509b 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3902,9 +3902,9 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) /* * TODO: Last condition latch INIT signals on vCPU when * vCPU is in guest-mode and vmcb12 defines intercept on INIT. - * To properly emulate the INIT intercept, SVM should implement - * kvm_x86_ops.check_nested_events() and call nested_svm_vmexit() - * there if an INIT signal is pending. + * To properly emulate the INIT intercept, + * svm_check_nested_events() should call nested_svm_vmexit() + * if an INIT signal is pending. */ return !gif_set(svm) || (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT)); @@ -4032,6 +4032,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .sched_in = svm_sched_in, .pmu_ops = &amd_pmu_ops, + .nested_ops = &svm_nested_ops, + .deliver_posted_interrupt = svm_deliver_avic_intr, .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt, .update_pi_irte = svm_update_pi_irte, @@ -4046,14 +4048,9 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_reg_region = svm_register_enc_region, .mem_enc_unreg_region = svm_unregister_enc_region, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, .apic_init_signal_blocked = svm_apic_init_signal_blocked, - - .check_nested_events = svm_check_nested_events, }; static struct kvm_x86_init_ops svm_init_ops __initdata = { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index ca95204f9dde..98c2890d561d 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -398,9 +398,10 @@ int nested_svm_exit_handled(struct vcpu_svm *svm); int nested_svm_check_permissions(struct vcpu_svm *svm); int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -int svm_check_nested_events(struct kvm_vcpu *vcpu); int nested_svm_exit_special(struct vcpu_svm *svm); +extern struct kvm_x86_nested_ops svm_nested_ops; + /* avic.c */ #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index f228339cd0a0..56074d3443e0 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -6440,12 +6440,14 @@ __init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid; exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc; - ops->check_nested_events = vmx_check_nested_events; - ops->get_nested_state = vmx_get_nested_state; - ops->set_nested_state = vmx_set_nested_state; - ops->get_vmcs12_pages = nested_get_vmcs12_pages; - ops->nested_enable_evmcs = nested_enable_evmcs; - ops->nested_get_evmcs_version = nested_get_evmcs_version; - return 0; } + +struct kvm_x86_nested_ops vmx_nested_ops = { + .check_events = vmx_check_nested_events, + .get_state = 
vmx_get_nested_state, + .set_state = vmx_set_nested_state, + .get_vmcs12_pages = nested_get_vmcs12_pages, + .enable_evmcs = nested_enable_evmcs, + .get_evmcs_version = nested_get_evmcs_version, +}; diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h index 1514ff4db77f..7ce9572c3d3a 100644 --- a/arch/x86/kvm/vmx/nested.h +++ b/arch/x86/kvm/vmx/nested.h @@ -278,4 +278,6 @@ static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) #define nested_guest_cr4_valid nested_cr4_valid #define nested_host_cr4_valid nested_cr4_valid +extern struct kvm_x86_nested_ops vmx_nested_ops; + #endif /* __KVM_X86_VMX_NESTED_H */ diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 766303b31949..455cd2c8dbce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7862,6 +7862,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .post_block = vmx_post_block, .pmu_ops = &intel_pmu_ops, + .nested_ops = &vmx_nested_ops, .update_pi_irte = vmx_update_pi_irte, @@ -7877,12 +7878,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .pre_leave_smm = vmx_pre_leave_smm, .enable_smi_window = enable_smi_window, - .check_nested_events = NULL, - .get_nested_state = NULL, - .set_nested_state = NULL, - .get_vmcs12_pages = NULL, - .nested_enable_evmcs = NULL, - .nested_get_evmcs_version = NULL, .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0492baeb78ab..8c0b77ac8dc6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3442,14 +3442,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_X2APIC_API_VALID_FLAGS; break; case KVM_CAP_NESTED_STATE: - r = kvm_x86_ops.get_nested_state ? - kvm_x86_ops.get_nested_state(NULL, NULL, 0) : 0; + r = kvm_x86_ops.nested_ops->get_state ? 
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; break; case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: r = kvm_x86_ops.enable_direct_tlbflush != NULL; break; case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - r = kvm_x86_ops.nested_enable_evmcs != NULL; + r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; break; default: break; @@ -4235,9 +4235,9 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, return kvm_hv_activate_synic(vcpu, cap->cap == KVM_CAP_HYPERV_SYNIC2); case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: - if (!kvm_x86_ops.nested_enable_evmcs) + if (!kvm_x86_ops.nested_ops->enable_evmcs) return -ENOTTY; - r = kvm_x86_ops.nested_enable_evmcs(vcpu, &vmcs_version); + r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); if (!r) { user_ptr = (void __user *)(uintptr_t)cap->args[0]; if (copy_to_user(user_ptr, &vmcs_version, @@ -4552,7 +4552,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_data_size; r = -EINVAL; - if (!kvm_x86_ops.get_nested_state) + if (!kvm_x86_ops.nested_ops->get_state) break; BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); @@ -4560,8 +4560,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (get_user(user_data_size, &user_kvm_nested_state->size)) break; - r = kvm_x86_ops.get_nested_state(vcpu, user_kvm_nested_state, - user_data_size); + r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, + user_data_size); if (r < 0) break; @@ -4582,7 +4582,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, int idx; r = -EINVAL; - if (!kvm_x86_ops.set_nested_state) + if (!kvm_x86_ops.nested_ops->set_state) break; r = -EFAULT; @@ -4604,7 +4604,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, break; idx = srcu_read_lock(&vcpu->kvm->srcu); - r = kvm_x86_ops.set_nested_state(vcpu, user_kvm_nested_state, &kvm_state); + r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state); srcu_read_unlock(&vcpu->kvm->srcu, idx); break; } @@ -7700,7 +7700,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) * from L2 to L1. */ if (is_guest_mode(vcpu)) { - r = kvm_x86_ops.check_nested_events(vcpu); + r = kvm_x86_ops.nested_ops->check_events(vcpu); if (r != 0) return r; } @@ -7762,7 +7762,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) * KVM_REQ_EVENT only on certain events and not unconditionally? */ if (is_guest_mode(vcpu)) { - r = kvm_x86_ops.check_nested_events(vcpu); + r = kvm_x86_ops.nested_ops->check_events(vcpu); if (r != 0) return r; } @@ -8185,7 +8185,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_request_pending(vcpu)) { if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) { - if (unlikely(!kvm_x86_ops.get_vmcs12_pages(vcpu))) { + if (unlikely(!kvm_x86_ops.nested_ops->get_vmcs12_pages(vcpu))) { r = 0; goto out; } @@ -8528,7 +8528,7 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu) static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) { if (is_guest_mode(vcpu)) - kvm_x86_ops.check_nested_events(vcpu); + kvm_x86_ops.nested_ops->check_events(vcpu); return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && !vcpu->arch.apf.halted); -- cgit v1.2.3 From b4b89a02724245c4f4eda9dbfb7895ddf122ca7f Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Thu, 19 Mar 2020 22:00:24 -0400 Subject: efi/gop: Add prototypes for query_mode and set_mode Add prototypes and argmap for the Graphics Output Protocol's QueryMode and SetMode functions. 
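[ Illustration, not part of the patch: with the prototypes and the
  query_mode argmap in place, a libstub caller can go through
  efi_call_proto() and let the mixed-mode plumbing narrow the output
  pointers. A minimal sketch, assuming "gop" and "mode_num" were obtained
  earlier; the variable names are hypothetical:

	efi_graphics_output_mode_info_t *info;
	unsigned long info_size;
	efi_status_t status;

	/* Ask the firmware to describe one of the GOP's modes. */
	status = efi_call_proto(gop, query_mode, mode_num, &info_size, &info);
	if (status != EFI_SUCCESS)
		return status;

	/* Switch to that mode. */
	status = efi_call_proto(gop, set_mode, mode_num);
]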
Signed-off-by: Arvind Sankar Link: https://lore.kernel.org/r/20200320020028.1936003-11-nivedita@alum.mit.edu Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 4 ++++ drivers/firmware/efi/libstub/efistub.h | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 8391c115c0ec..f59cba117dcb 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -307,6 +307,10 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_load_file(protocol, path, policy, bufsize, buf) \ ((protocol), (path), (policy), efi64_zero_upper(bufsize), (buf)) +/* Graphics Output Protocol */ +#define __efi64_argmap_query_mode(gop, mode, size, info) \ + ((gop), (mode), efi64_zero_upper(size), efi64_zero_upper(info)) + /* * The macros below handle the plumbing for the argument mapping. To add a * mapping for a specific EFI method, simply define a macro diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index d9ad8582dbea..321244ed20a4 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -314,8 +314,10 @@ typedef union efi_graphics_output_protocol efi_graphics_output_protocol_t; union efi_graphics_output_protocol { struct { - void *query_mode; - void *set_mode; + efi_status_t (__efiapi *query_mode)(efi_graphics_output_protocol_t *, + u32, unsigned long *, + efi_graphics_output_mode_info_t **); + efi_status_t (__efiapi *set_mode) (efi_graphics_output_protocol_t *, u32); void *blt; efi_graphics_output_protocol_mode_t *mode; }; -- cgit v1.2.3 From 54b34aa0a7295c0fc82c72c15bf42c05ad720d5f Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:33 +0300 Subject: platform/x86: intel_scu_ipc: Split out SCU IPC functionality from the SCU driver The SCU IPC functionality is usable outside of Intel MID devices. For example modern Intel CPUs include the same thing but now it is called PMC (Power Management Controller) instead of SCU. To make the IPC available for those split the driver into core part (intel_scu_ipc.c) and the SCU PCI driver part (intel_scu_pcidrv.c) which then calls the former before it goes and creates rest of the SCU devices. The SCU IPC will also register a new class that gets assigned to the device that is created under the parent PCI device. We also split the Kconfig symbols so that INTEL_SCU_IPC enables the SCU IPC library and INTEL_SCU_PCI the SCU driver and convert the users accordingly. While there remove default y from the INTEL_SCU_PCI symbol as it is already selected by X86_INTEL_MID. 
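[ Illustration, not part of the patch: with the core split out, any bus
  driver that knows where the IPC registers live can bring up the library.
  A rough sketch, assuming a caller-provided "parent" device; the MMIO
  base and size are invented (DEFINE_RES_MEM and SZ_4K come from
  <linux/ioport.h> and <linux/sizes.h>):

	struct intel_scu_ipc_data scu_data = {
		.mem = DEFINE_RES_MEM(0xff103000, SZ_4K), /* hypothetical window */
		.irq = 0,				  /* 0 selects polled mode */
	};
	struct intel_scu_ipc_dev *scu;

	scu = intel_scu_ipc_register(parent, &scu_data);
	if (IS_ERR(scu))
		return PTR_ERR(scu);

  The PCI driver added below does essentially this with the BAR and IRQ
  it gets from the PCI core.
]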
Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/intel_scu_ipc.h | 18 ++++ drivers/mfd/Kconfig | 4 +- drivers/platform/x86/Kconfig | 24 +++-- drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_scu_ipc.c | 172 +++++++++++++++++++++----------- drivers/platform/x86/intel_scu_pcidrv.c | 55 ++++++++++ 7 files changed, 208 insertions(+), 68 deletions(-) create mode 100644 drivers/platform/x86/intel_scu_pcidrv.c (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d6104ea8af0..5947c7a16d78 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -595,7 +595,7 @@ config X86_INTEL_MID select I2C select DW_APB_TIMER select APB_TIMER - select INTEL_SCU_IPC + select INTEL_SCU_PCI select MFD_INTEL_MSIC ---help--- Select to build a kernel capable of supporting Intel MID (Mobile diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 2a1442ba6e78..c53d18f9440b 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -2,6 +2,7 @@ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ +#include #include #define IPCMSG_INDIRECT_READ 0x02 @@ -19,6 +20,23 @@ #define IPC_CMD_VRTC_SETTIME 1 /* Set time */ #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ +struct device; +struct intel_scu_ipc_dev; + +/** + * struct intel_scu_ipc_data - Data used to configure SCU IPC + * @mem: Base address of SCU IPC MMIO registers + * @irq: The IRQ number used for SCU (optional) + */ +struct intel_scu_ipc_data { + struct resource mem; + int irq; +}; + +struct intel_scu_ipc_dev * +intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data); + /* Read single register */ int intel_scu_ipc_ioread8(u16 addr, u8 *data); diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 0a59249198d3..5e21a78b6923 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -593,7 +593,7 @@ config INTEL_SOC_PMIC_MRFLD tristate "Support for Intel Merrifield Basin Cove PMIC" depends on GPIOLIB depends on ACPI - depends on INTEL_SCU_IPC + depends on INTEL_SCU select MFD_CORE select REGMAP_IRQ help @@ -625,7 +625,7 @@ config MFD_INTEL_LPSS_PCI config MFD_INTEL_MSIC bool "Intel MSIC" - depends on INTEL_SCU_IPC + depends on INTEL_SCU select MFD_CORE help Select this option to enable access to Intel MSIC (Avatele diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 0ad7ad8cf8e1..d1e4d49c35e2 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1295,7 +1295,7 @@ config INTEL_MFLD_THERMAL config INTEL_MID_POWER_BUTTON tristate "power button driver for Intel MID platforms" - depends on INTEL_SCU_IPC && INPUT + depends on INTEL_SCU && INPUT help This driver handles the power button on the Intel MID platforms. @@ -1342,17 +1342,25 @@ config INTEL_PUNIT_IPC which is used to bridge the communications between kernel and P-Unit. config INTEL_SCU_IPC - bool "Intel SCU IPC Support" + bool + +config INTEL_SCU + bool + select INTEL_SCU_IPC + +config INTEL_SCU_PCI + bool "Intel SCU PCI driver" depends on X86_INTEL_MID - default y - ---help--- - IPC is used to bridge the communications between kernel and SCU on - some embedded Intel x86 platforms. This is not needed for PC-type - machines. + select INTEL_SCU + help + This driver is used to bridge the communications between kernel + and SCU on some embedded Intel x86 platforms. 
It also creates + devices that are connected to the SoC through the SCU. This is + not needed for PC-type machines. config INTEL_SCU_IPC_UTIL tristate "Intel SCU IPC utility driver" - depends on INTEL_SCU_IPC + depends on INTEL_SCU ---help--- The IPC Util driver provides an interface with the SCU enabling low level access for debug work and updating the firmware. Say diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 53408d965874..ef995a0f04f0 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o obj-$(CONFIG_INTEL_PMC_IPC) += intel_pmc_ipc.o obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o +obj-$(CONFIG_INTEL_SCU_PCI) += intel_scu_pcidrv.o obj-$(CONFIG_INTEL_SCU_IPC_UTIL) += intel_scu_ipcutil.o obj-$(CONFIG_INTEL_TELEMETRY) += intel_telemetry_core.o \ intel_telemetry_pltdrv.o \ diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 3d7da5266136..584e33c37708 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -18,11 +18,10 @@ #include #include #include -#include -#include -#include +#include +#include +#include -#include #include /* IPC defines the following message types */ @@ -55,14 +54,13 @@ #define IPC_IOC 0x100 /* IPC command register IOC bit */ struct intel_scu_ipc_dev { - struct device *dev; + struct device dev; + struct resource mem; + int irq; void __iomem *ipc_base; struct completion cmd_complete; - u8 irq_mode; }; -static struct intel_scu_ipc_dev ipcdev; /* Only one for now */ - #define IPC_STATUS 0x04 #define IPC_STATUS_IRQ BIT(2) #define IPC_STATUS_ERR BIT(1) @@ -78,8 +76,14 @@ static struct intel_scu_ipc_dev ipcdev; /* Only one for now */ /* Timeout in jiffies */ #define IPC_TIMEOUT (3 * HZ) +static struct intel_scu_ipc_dev *ipcdev; /* Only one for now */ static DEFINE_MUTEX(ipclock); /* lock used to prevent multiple call to SCU */ +static struct class intel_scu_ipc_class = { + .name = "intel_scu_ipc", + .owner = THIS_MODULE, +}; + /* * Send ipc command * Command Register (Write Only): @@ -143,7 +147,7 @@ static inline int busy_loop(struct intel_scu_ipc_dev *scu) usleep_range(50, 100); } while (time_before(jiffies, end)); - dev_err(scu->dev, "IPC timed out"); + dev_err(&scu->dev, "IPC timed out"); return -ETIMEDOUT; } @@ -153,7 +157,7 @@ static inline int ipc_wait_for_interrupt(struct intel_scu_ipc_dev *scu) int status; if (!wait_for_completion_timeout(&scu->cmd_complete, IPC_TIMEOUT)) { - dev_err(scu->dev, "IPC timed out\n"); + dev_err(&scu->dev, "IPC timed out\n"); return -ETIMEDOUT; } @@ -166,13 +170,13 @@ static inline int ipc_wait_for_interrupt(struct intel_scu_ipc_dev *scu) static int intel_scu_ipc_check_status(struct intel_scu_ipc_dev *scu) { - return scu->irq_mode ? ipc_wait_for_interrupt(scu) : busy_loop(scu); + return scu->irq > 0 ? 
ipc_wait_for_interrupt(scu) : busy_loop(scu); } /* Read/Write power control(PMIC in Langwell, MSIC in PenWell) registers */ static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id) { - struct intel_scu_ipc_dev *scu = &ipcdev; + struct intel_scu_ipc_dev *scu; int nc; u32 offset = 0; int err; @@ -182,11 +186,11 @@ static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id) memset(cbuf, 0, sizeof(cbuf)); mutex_lock(&ipclock); - - if (scu->dev == NULL) { + if (!ipcdev) { mutex_unlock(&ipclock); return -ENODEV; } + scu = ipcdev; for (nc = 0; nc < count; nc++, offset += 2) { cbuf[offset] = addr[nc]; @@ -326,14 +330,15 @@ EXPORT_SYMBOL(intel_scu_ipc_update_register); */ int intel_scu_ipc_simple_command(int cmd, int sub) { - struct intel_scu_ipc_dev *scu = &ipcdev; + struct intel_scu_ipc_dev *scu; int err; mutex_lock(&ipclock); - if (scu->dev == NULL) { + if (!ipcdev) { mutex_unlock(&ipclock); return -ENODEV; } + scu = ipcdev; ipc_command(scu, sub << 12 | cmd); err = intel_scu_ipc_check_status(scu); mutex_unlock(&ipclock); @@ -356,14 +361,15 @@ EXPORT_SYMBOL(intel_scu_ipc_simple_command); int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, u32 *out, int outlen) { - struct intel_scu_ipc_dev *scu = &ipcdev; + struct intel_scu_ipc_dev *scu; int i, err; mutex_lock(&ipclock); - if (scu->dev == NULL) { + if (!ipcdev) { mutex_unlock(&ipclock); return -ENODEV; } + scu = ipcdev; for (i = 0; i < inlen; i++) ipc_data_writel(scu, *in++, 4 * i); @@ -399,61 +405,113 @@ static irqreturn_t ioc(int irq, void *dev_id) return IRQ_HANDLED; } +static void intel_scu_ipc_release(struct device *dev) +{ + struct intel_scu_ipc_dev *scu; + + scu = container_of(dev, struct intel_scu_ipc_dev, dev); + if (scu->irq > 0) + free_irq(scu->irq, scu); + iounmap(scu->ipc_base); + release_mem_region(scu->mem.start, resource_size(&scu->mem)); + kfree(scu); +} + /** - * ipc_probe - probe an Intel SCU IPC - * @pdev: the PCI device matching - * @id: entry in the match table + * intel_scu_ipc_register() - Register SCU IPC device + * @parent: Parent device + * @scu_data: Data used to configure SCU IPC * - * Enable and install an intel SCU IPC. This appears in the PCI space - * but uses some hard coded addresses as well. + * Call this function to register SCU IPC mechanism under @parent. + * Returns pointer to the new SCU IPC device or ERR_PTR() in case of + * failure. 
*/ -static int ipc_probe(struct pci_dev *pdev, const struct pci_device_id *id) +struct intel_scu_ipc_dev * +intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data) { int err; - struct intel_scu_ipc_dev *scu = &ipcdev; + struct intel_scu_ipc_dev *scu; + void __iomem *ipc_base; - if (scu->dev) /* We support only one SCU */ - return -EBUSY; + mutex_lock(&ipclock); + /* We support only one IPC */ + if (ipcdev) { + err = -EBUSY; + goto err_unlock; + } - err = pcim_enable_device(pdev); - if (err) - return err; + scu = kzalloc(sizeof(*scu), GFP_KERNEL); + if (!scu) { + err = -ENOMEM; + goto err_unlock; + } - err = pcim_iomap_regions(pdev, 1 << 0, pci_name(pdev)); - if (err) - return err; + scu->dev.parent = parent; + scu->dev.class = &intel_scu_ipc_class; + scu->dev.release = intel_scu_ipc_release; + dev_set_name(&scu->dev, "intel_scu_ipc"); + if (!request_mem_region(scu_data->mem.start, resource_size(&scu_data->mem), + "intel_scu_ipc")) { + err = -EBUSY; + goto err_free; + } + + ipc_base = ioremap(scu_data->mem.start, resource_size(&scu_data->mem)); + if (!ipc_base) { + err = -ENOMEM; + goto err_release; + } + + scu->ipc_base = ipc_base; + scu->mem = scu_data->mem; + scu->irq = scu_data->irq; init_completion(&scu->cmd_complete); - scu->ipc_base = pcim_iomap_table(pdev)[0]; + if (scu->irq > 0) { + err = request_irq(scu->irq, ioc, 0, "intel_scu_ipc", scu); + if (err) + goto err_unmap; + } - err = devm_request_irq(&pdev->dev, pdev->irq, ioc, 0, "intel_scu_ipc", - scu); - if (err) - return err; + /* + * After this point intel_scu_ipc_release() takes care of + * releasing the SCU IPC resources once refcount drops to zero. + */ + err = device_register(&scu->dev); + if (err) { + put_device(&scu->dev); + goto err_unlock; + } /* Assign device at last */ - scu->dev = &pdev->dev; + ipcdev = scu; + mutex_unlock(&ipclock); - intel_scu_devices_create(); + return scu; - pci_set_drvdata(pdev, scu); - return 0; +err_unmap: + iounmap(ipc_base); +err_release: + release_mem_region(scu_data->mem.start, resource_size(&scu_data->mem)); +err_free: + kfree(scu); +err_unlock: + mutex_unlock(&ipclock); + + return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(intel_scu_ipc_register); -static const struct pci_device_id pci_ids[] = { - { PCI_VDEVICE(INTEL, 0x080e) }, - { PCI_VDEVICE(INTEL, 0x08ea) }, - { PCI_VDEVICE(INTEL, 0x11a0) }, - {} -}; +static int __init intel_scu_ipc_init(void) +{ + return class_register(&intel_scu_ipc_class); +} +subsys_initcall(intel_scu_ipc_init); -static struct pci_driver ipc_driver = { - .driver = { - .suppress_bind_attrs = true, - }, - .name = "intel_scu_ipc", - .id_table = pci_ids, - .probe = ipc_probe, -}; -builtin_pci_driver(ipc_driver); +static void __exit intel_scu_ipc_exit(void) +{ + class_unregister(&intel_scu_ipc_class); +} +module_exit(intel_scu_ipc_exit); diff --git a/drivers/platform/x86/intel_scu_pcidrv.c b/drivers/platform/x86/intel_scu_pcidrv.c new file mode 100644 index 000000000000..b869ec2eda0e --- /dev/null +++ b/drivers/platform/x86/intel_scu_pcidrv.c @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * PCI driver for the Intel SCU. 
+ * + * Copyright (C) 2008-2010, 2015, 2020 Intel Corporation + * Authors: Sreedhara DS (sreedhara.ds@intel.com) + * Mika Westerberg + */ + +#include +#include +#include + +#include +#include + +static int intel_scu_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct intel_scu_ipc_data scu_data = {}; + struct intel_scu_ipc_dev *scu; + int ret; + + ret = pcim_enable_device(pdev); + if (ret) + return ret; + + scu_data.mem = pdev->resource[0]; + scu_data.irq = pdev->irq; + + scu = intel_scu_ipc_register(&pdev->dev, &scu_data); + if (IS_ERR(scu)) + return PTR_ERR(scu); + + intel_scu_devices_create(); + return 0; +} + +static const struct pci_device_id pci_ids[] = { + { PCI_VDEVICE(INTEL, 0x080e) }, + { PCI_VDEVICE(INTEL, 0x08ea) }, + { PCI_VDEVICE(INTEL, 0x11a0) }, + {} +}; + +static struct pci_driver intel_scu_pci_driver = { + .driver = { + .suppress_bind_attrs = true, + }, + .name = "intel_scu", + .id_table = pci_ids, + .probe = intel_scu_pci_probe, +}; + +builtin_pci_driver(intel_scu_pci_driver); -- cgit v1.2.3 From dd88564937390a9ebf4fb9c0ef21652ff6e35780 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:35 +0300 Subject: platform/x86: intel_scu_ipc: Move legacy SCU IPC API to a separate header In preparation for introducing a new API for SCU IPC, move the legacy API and constants to a separate header that is is subject to be removed eventually. Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel_scu_ipc.h | 56 +------------------------- arch/x86/include/asm/intel_scu_ipc_legacy.h | 62 +++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 55 deletions(-) create mode 100644 arch/x86/include/asm/intel_scu_ipc_legacy.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index c53d18f9440b..2d0e485842fd 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -3,22 +3,6 @@ #define _ASM_X86_INTEL_SCU_IPC_H_ #include -#include - -#define IPCMSG_INDIRECT_READ 0x02 -#define IPCMSG_INDIRECT_WRITE 0x05 - -#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ - -#define IPCMSG_WARM_RESET 0xF0 -#define IPCMSG_COLD_RESET 0xF1 -#define IPCMSG_SOFT_RESET 0xF2 -#define IPCMSG_COLD_BOOT 0xF3 - -#define IPCMSG_VRTC 0xFA /* Set vRTC device */ - /* Command id associated with message IPCMSG_VRTC */ - #define IPC_CMD_VRTC_SETTIME 1 /* Set time */ - #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ struct device; struct intel_scu_ipc_dev; @@ -37,44 +21,6 @@ struct intel_scu_ipc_dev * intel_scu_ipc_register(struct device *parent, const struct intel_scu_ipc_data *scu_data); -/* Read single register */ -int intel_scu_ipc_ioread8(u16 addr, u8 *data); - -/* Read a vector */ -int intel_scu_ipc_readv(u16 *addr, u8 *data, int len); - -/* Write single register */ -int intel_scu_ipc_iowrite8(u16 addr, u8 data); - -/* Write a vector */ -int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); - -/* Update single register based on the mask */ -int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); - -/* Issue commands to the SCU with or without data */ -int intel_scu_ipc_simple_command(int cmd, int sub); -int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, - u32 *out, int outlen); - -extern struct blocking_notifier_head intel_scu_notifier; - -static inline void intel_scu_notifier_add(struct notifier_block *nb) -{ - blocking_notifier_chain_register(&intel_scu_notifier, nb); -} - 
-static inline void intel_scu_notifier_remove(struct notifier_block *nb) -{ - blocking_notifier_chain_unregister(&intel_scu_notifier, nb); -} - -static inline int intel_scu_notifier_post(unsigned long v, void *p) -{ - return blocking_notifier_call_chain(&intel_scu_notifier, v, p); -} - -#define SCU_AVAILABLE 1 -#define SCU_DOWN 2 +#include #endif diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h new file mode 100644 index 000000000000..d3a02bc07edd --- /dev/null +++ b/arch/x86/include/asm/intel_scu_ipc_legacy.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INTEL_SCU_IPC_LEGACY_H_ +#define _ASM_X86_INTEL_SCU_IPC_LEGACY_H_ + +#include + +#define IPCMSG_INDIRECT_READ 0x02 +#define IPCMSG_INDIRECT_WRITE 0x05 + +#define IPCMSG_COLD_OFF 0x80 /* Only for Tangier */ + +#define IPCMSG_WARM_RESET 0xF0 +#define IPCMSG_COLD_RESET 0xF1 +#define IPCMSG_SOFT_RESET 0xF2 +#define IPCMSG_COLD_BOOT 0xF3 + +#define IPCMSG_VRTC 0xFA /* Set vRTC device */ +/* Command id associated with message IPCMSG_VRTC */ +#define IPC_CMD_VRTC_SETTIME 1 /* Set time */ +#define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ + +/* Read single register */ +int intel_scu_ipc_ioread8(u16 addr, u8 *data); + +/* Read a vector */ +int intel_scu_ipc_readv(u16 *addr, u8 *data, int len); + +/* Write single register */ +int intel_scu_ipc_iowrite8(u16 addr, u8 data); + +/* Write a vector */ +int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); + +/* Update single register based on the mask */ +int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); + +/* Issue commands to the SCU with or without data */ +int intel_scu_ipc_simple_command(int cmd, int sub); +int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, + u32 *out, int outlen); + +extern struct blocking_notifier_head intel_scu_notifier; + +static inline void intel_scu_notifier_add(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&intel_scu_notifier, nb); +} + +static inline void intel_scu_notifier_remove(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&intel_scu_notifier, nb); +} + +static inline int intel_scu_notifier_post(unsigned long v, void *p) +{ + return blocking_notifier_call_chain(&intel_scu_notifier, v, p); +} + +#define SCU_AVAILABLE 1 +#define SCU_DOWN 2 + +#endif -- cgit v1.2.3 From f57fa18583f538786b66a045446617f5638f032a Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:36 +0300 Subject: platform/x86: intel_scu_ipc: Introduce new SCU IPC API The current SCU IPC API has been operating on a single instance and there has been no way to pin the providing module in place when the SCU IPC is in use. This implements a new API that takes the SCU IPC instance as first parameter (NULL means the single instance is being used). The SCU IPC instance can be retrieved by calling new function intel_scu_ipc_dev_get() that take care of pinning the providing module in place as long as intel_scu_ipc_dev_put() is not called. The old API is updated to call the new API and is is left there in the legacy API header to support the existing users that cannot be converted easily. Subsequent patches will convert most of the users over to the new API. 
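[ Illustration, not part of the patch: a rough sketch of the intended
  consumer pattern under the new API; the register offset is made up:

	struct intel_scu_ipc_dev *scu;
	u8 val;
	int ret;

	scu = intel_scu_ipc_dev_get();	/* pins the IPC provider module */
	if (!scu)
		return -EPROBE_DEFER;	/* no SCU IPC registered (yet) */

	ret = intel_scu_ipc_dev_ioread8(scu, 0x20, &val); /* 0x20: made-up register */

	intel_scu_ipc_dev_put(scu);	/* lets the provider unload again */
	return ret;

  Drivers bound through the driver core can use devm_intel_scu_ipc_dev_get()
  instead and skip the explicit put.
]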
Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel_scu_ipc.h | 38 ++++- arch/x86/include/asm/intel_scu_ipc_legacy.h | 45 +++++- drivers/platform/x86/intel_scu_ipc.c | 230 ++++++++++++++++++++++------ 3 files changed, 252 insertions(+), 61 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 2d0e485842fd..d5f6ae514172 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -18,8 +18,42 @@ struct intel_scu_ipc_data { }; struct intel_scu_ipc_dev * -intel_scu_ipc_register(struct device *parent, - const struct intel_scu_ipc_data *scu_data); +__intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner); + +#define intel_scu_ipc_register(parent, scu_data) \ + __intel_scu_ipc_register(parent, scu_data, THIS_MODULE) + +struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void); +void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu); +struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev); + +int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr, + u8 *data); +int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr, + u8 data); +int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr, + u8 *data, size_t len); +int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr, + u8 *data, size_t len); + +int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, u16 addr, + u8 data, u8 mask); + +int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, + int sub); +int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd, + int sub, const void *in, size_t inlen, + size_t size, void *out, size_t outlen); + +static inline int intel_scu_ipc_dev_command(struct intel_scu_ipc_dev *scu, int cmd, + int sub, const void *in, size_t inlen, + void *out, size_t outlen) +{ + return intel_scu_ipc_dev_command_with_size(scu, cmd, sub, in, inlen, + inlen, out, outlen); +} #include diff --git a/arch/x86/include/asm/intel_scu_ipc_legacy.h b/arch/x86/include/asm/intel_scu_ipc_legacy.h index d3a02bc07edd..4cf13fecb673 100644 --- a/arch/x86/include/asm/intel_scu_ipc_legacy.h +++ b/arch/x86/include/asm/intel_scu_ipc_legacy.h @@ -19,25 +19,54 @@ #define IPC_CMD_VRTC_SETTIME 1 /* Set time */ #define IPC_CMD_VRTC_SETALARM 2 /* Set alarm */ +/* Don't call these in new code - they will be removed eventually */ + /* Read single register */ -int intel_scu_ipc_ioread8(u16 addr, u8 *data); +static inline int intel_scu_ipc_ioread8(u16 addr, u8 *data) +{ + return intel_scu_ipc_dev_ioread8(NULL, addr, data); +} /* Read a vector */ -int intel_scu_ipc_readv(u16 *addr, u8 *data, int len); +static inline int intel_scu_ipc_readv(u16 *addr, u8 *data, int len) +{ + return intel_scu_ipc_dev_readv(NULL, addr, data, len); +} /* Write single register */ -int intel_scu_ipc_iowrite8(u16 addr, u8 data); +static inline int intel_scu_ipc_iowrite8(u16 addr, u8 data) +{ + return intel_scu_ipc_dev_iowrite8(NULL, addr, data); +} /* Write a vector */ -int intel_scu_ipc_writev(u16 *addr, u8 *data, int len); +static inline int intel_scu_ipc_writev(u16 *addr, u8 *data, int len) +{ + return intel_scu_ipc_dev_writev(NULL, addr, data, len); +} /* Update single register based on the mask */ -int intel_scu_ipc_update_register(u16 addr, u8 data, u8 mask); +static inline int intel_scu_ipc_update_register(u16 addr, u8 
data, u8 mask) +{ + return intel_scu_ipc_dev_update(NULL, addr, data, mask); +} /* Issue commands to the SCU with or without data */ -int intel_scu_ipc_simple_command(int cmd, int sub); -int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, - u32 *out, int outlen); +static inline int intel_scu_ipc_simple_command(int cmd, int sub) +{ + return intel_scu_ipc_dev_simple_command(NULL, cmd, sub); +} + +static inline int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, + u32 *out, int outlen) +{ + /* New API takes both inlen and outlen as bytes so convert here */ + size_t inbytes = inlen * sizeof(u32); + size_t outbytes = outlen * sizeof(u32); + + return intel_scu_ipc_dev_command_with_size(NULL, cmd, sub, in, inbytes, + inlen, out, outbytes); +} extern struct blocking_notifier_head intel_scu_notifier; diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index e8ea250aeda3..608034ea7af5 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -56,6 +56,7 @@ struct intel_scu_ipc_dev { struct device dev; struct resource mem; + struct module *owner; int irq; void __iomem *ipc_base; struct completion cmd_complete; @@ -84,6 +85,102 @@ static struct class intel_scu_ipc_class = { .owner = THIS_MODULE, }; +/** + * intel_scu_ipc_dev_get() - Get SCU IPC instance + * + * The recommended new API takes SCU IPC instance as parameter and this + * function can be called by driver to get the instance. This also makes + * sure the driver providing the IPC functionality cannot be unloaded + * while the caller has the instance. + * + * Call intel_scu_ipc_dev_put() to release the instance. + * + * Returns %NULL if SCU IPC is not currently available. + */ +struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void) +{ + struct intel_scu_ipc_dev *scu = NULL; + + mutex_lock(&ipclock); + if (ipcdev) { + get_device(&ipcdev->dev); + /* + * Prevent the IPC provider from being unloaded while it + * is being used. + */ + if (!try_module_get(ipcdev->owner)) + put_device(&ipcdev->dev); + else + scu = ipcdev; + } + + mutex_unlock(&ipclock); + return scu; +} +EXPORT_SYMBOL_GPL(intel_scu_ipc_dev_get); + +/** + * intel_scu_ipc_dev_put() - Put SCU IPC instance + * @scu: SCU IPC instance + * + * This function releases the SCU IPC instance retrieved from + * intel_scu_ipc_dev_get() and allows the driver providing IPC to be + * unloaded. + */ +void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu) +{ + if (scu) { + module_put(scu->owner); + put_device(&scu->dev); + } +} +EXPORT_SYMBOL_GPL(intel_scu_ipc_dev_put); + +struct intel_scu_ipc_devres { + struct intel_scu_ipc_dev *scu; +}; + +static void devm_intel_scu_ipc_dev_release(struct device *dev, void *res) +{ + struct intel_scu_ipc_devres *dr = res; + struct intel_scu_ipc_dev *scu = dr->scu; + + intel_scu_ipc_dev_put(scu); +} + +/** + * devm_intel_scu_ipc_dev_get() - Allocate managed SCU IPC device + * @dev: Device requesting the SCU IPC device + * + * The recommended new API takes SCU IPC instance as parameter and this + * function can be called by driver to get the instance. This also makes + * sure the driver providing the IPC functionality cannot be unloaded + * while the caller has the instance. + * + * Returns %NULL if SCU IPC is not currently available. 
+ */ +struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev) +{ + struct intel_scu_ipc_devres *dr; + struct intel_scu_ipc_dev *scu; + + dr = devres_alloc(devm_intel_scu_ipc_dev_release, sizeof(*dr), GFP_KERNEL); + if (!dr) + return NULL; + + scu = intel_scu_ipc_dev_get(); + if (!scu) { + devres_free(dr); + return NULL; + } + + dr->scu = scu; + devres_add(dev, dr); + + return scu; +} +EXPORT_SYMBOL_GPL(devm_intel_scu_ipc_dev_get); + /* * Send ipc command * Command Register (Write Only): @@ -171,9 +268,9 @@ static int intel_scu_ipc_check_status(struct intel_scu_ipc_dev *scu) } /* Read/Write power control(PMIC in Langwell, MSIC in PenWell) registers */ -static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id) +static int pwr_reg_rdwr(struct intel_scu_ipc_dev *scu, u16 *addr, u8 *data, + u32 count, u32 op, u32 id) { - struct intel_scu_ipc_dev *scu; int nc; u32 offset = 0; int err; @@ -183,11 +280,12 @@ static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id) memset(cbuf, 0, sizeof(cbuf)); mutex_lock(&ipclock); - if (!ipcdev) { + if (!scu) + scu = ipcdev; + if (!scu) { mutex_unlock(&ipclock); return -ENODEV; } - scu = ipcdev; for (nc = 0; nc < count; nc++, offset += 2) { cbuf[offset] = addr[nc]; @@ -223,7 +321,8 @@ static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id) } /** - * intel_scu_ipc_ioread8 - read a word via the SCU + * intel_scu_ipc_dev_ioread8() - Read a byte via the SCU + * @scu: Optional SCU IPC instance * @addr: Register on SCU * @data: Return pointer for read byte * @@ -232,14 +331,15 @@ static int pwr_reg_rdwr(u16 *addr, u8 *data, u32 count, u32 op, u32 id) * * This function may sleep. */ -int intel_scu_ipc_ioread8(u16 addr, u8 *data) +int intel_scu_ipc_dev_ioread8(struct intel_scu_ipc_dev *scu, u16 addr, u8 *data) { - return pwr_reg_rdwr(&addr, data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R); + return pwr_reg_rdwr(scu, &addr, data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R); } -EXPORT_SYMBOL(intel_scu_ipc_ioread8); +EXPORT_SYMBOL(intel_scu_ipc_dev_ioread8); /** - * intel_scu_ipc_iowrite8 - write a byte via the SCU + * intel_scu_ipc_dev_iowrite8() - Write a byte via the SCU + * @scu: Optional SCU IPC instance * @addr: Register on SCU * @data: Byte to write * @@ -248,14 +348,15 @@ EXPORT_SYMBOL(intel_scu_ipc_ioread8); * * This function may sleep. */ -int intel_scu_ipc_iowrite8(u16 addr, u8 data) +int intel_scu_ipc_dev_iowrite8(struct intel_scu_ipc_dev *scu, u16 addr, u8 data) { - return pwr_reg_rdwr(&addr, &data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W); + return pwr_reg_rdwr(scu, &addr, &data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W); } -EXPORT_SYMBOL(intel_scu_ipc_iowrite8); +EXPORT_SYMBOL(intel_scu_ipc_dev_iowrite8); /** - * intel_scu_ipc_readvv - read a set of registers + * intel_scu_ipc_dev_readv() - Read a set of registers + * @scu: Optional SCU IPC instance * @addr: Register list * @data: Bytes to return * @len: Length of array @@ -267,14 +368,16 @@ EXPORT_SYMBOL(intel_scu_ipc_iowrite8); * * This function may sleep. 
*/ -int intel_scu_ipc_readv(u16 *addr, u8 *data, int len) +int intel_scu_ipc_dev_readv(struct intel_scu_ipc_dev *scu, u16 *addr, u8 *data, + size_t len) { - return pwr_reg_rdwr(addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R); + return pwr_reg_rdwr(scu, addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_R); } -EXPORT_SYMBOL(intel_scu_ipc_readv); +EXPORT_SYMBOL(intel_scu_ipc_dev_readv); /** - * intel_scu_ipc_writev - write a set of registers + * intel_scu_ipc_dev_writev() - Write a set of registers + * @scu: Optional SCU IPC instance * @addr: Register list * @data: Bytes to write * @len: Length of array @@ -286,16 +389,18 @@ EXPORT_SYMBOL(intel_scu_ipc_readv); * * This function may sleep. */ -int intel_scu_ipc_writev(u16 *addr, u8 *data, int len) +int intel_scu_ipc_dev_writev(struct intel_scu_ipc_dev *scu, u16 *addr, u8 *data, + size_t len) { - return pwr_reg_rdwr(addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W); + return pwr_reg_rdwr(scu, addr, data, len, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_W); } -EXPORT_SYMBOL(intel_scu_ipc_writev); +EXPORT_SYMBOL(intel_scu_ipc_dev_writev); /** - * intel_scu_ipc_update_register - r/m/w a register + * intel_scu_ipc_dev_update() - Update a register + * @scu: Optional SCU IPC instance * @addr: Register address - * @bits: Bits to update + * @data: Bits to update * @mask: Mask of bits to update * * Read-modify-write power control unit register. The first data argument @@ -306,15 +411,17 @@ EXPORT_SYMBOL(intel_scu_ipc_writev); * This function may sleep. Locking between SCU accesses is handled * for the caller. */ -int intel_scu_ipc_update_register(u16 addr, u8 bits, u8 mask) +int intel_scu_ipc_dev_update(struct intel_scu_ipc_dev *scu, u16 addr, u8 data, + u8 mask) { - u8 data[2] = { bits, mask }; - return pwr_reg_rdwr(&addr, data, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_M); + u8 tmp[2] = { data, mask }; + return pwr_reg_rdwr(scu, &addr, tmp, 1, IPCMSG_PCNTRL, IPC_CMD_PCNTRL_M); } -EXPORT_SYMBOL(intel_scu_ipc_update_register); +EXPORT_SYMBOL(intel_scu_ipc_dev_update); /** - * intel_scu_ipc_simple_command - send a simple command + * intel_scu_ipc_dev_simple_command() - Send a simple command + * @scu: Optional SCU IPC instance * @cmd: Command * @sub: Sub type * @@ -325,14 +432,16 @@ EXPORT_SYMBOL(intel_scu_ipc_update_register); * This function may sleep. Locking for SCU accesses is handled for the * caller. */ -int intel_scu_ipc_simple_command(int cmd, int sub) +int intel_scu_ipc_dev_simple_command(struct intel_scu_ipc_dev *scu, int cmd, + int sub) { - struct intel_scu_ipc_dev *scu; u32 cmdval; int err; mutex_lock(&ipclock); - if (!ipcdev) { + if (!scu) + scu = ipcdev; + if (!scu) { mutex_unlock(&ipclock); return -ENODEV; } @@ -345,44 +454,59 @@ int intel_scu_ipc_simple_command(int cmd, int sub) dev_err(&scu->dev, "IPC command %#x failed with %d\n", cmdval, err); return err; } -EXPORT_SYMBOL(intel_scu_ipc_simple_command); +EXPORT_SYMBOL(intel_scu_ipc_dev_simple_command); /** - * intel_scu_ipc_command - command with data + * intel_scu_ipc_command_with_size() - Command with data + * @scu: Optional SCU IPC instance * @cmd: Command * @sub: Sub type * @in: Input data - * @inlen: Input length in dwords + * @inlen: Input length in bytes + * @size: Input size written to the IPC command register in whatever + * units (dword, byte) the particular firmware requires. Normally + * should be the same as @inlen. * @out: Output data - * @outlen: Output length in dwords + * @outlen: Output length in bytes * * Issue a command to the SCU which involves data transfers. 
Do the * data copies under the lock but leave it for the caller to interpret. */ -int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, - u32 *out, int outlen) +int intel_scu_ipc_dev_command_with_size(struct intel_scu_ipc_dev *scu, int cmd, + int sub, const void *in, size_t inlen, + size_t size, void *out, size_t outlen) { - struct intel_scu_ipc_dev *scu; - u32 cmdval; + size_t outbuflen = DIV_ROUND_UP(outlen, sizeof(u32)); + size_t inbuflen = DIV_ROUND_UP(inlen, sizeof(u32)); + u32 cmdval, inbuf[4] = {}; int i, err; + if (inbuflen > 4 || outbuflen > 4) + return -EINVAL; + mutex_lock(&ipclock); - if (!ipcdev) { + if (!scu) + scu = ipcdev; + if (!scu) { mutex_unlock(&ipclock); return -ENODEV; } - scu = ipcdev; - for (i = 0; i < inlen; i++) - ipc_data_writel(scu, *in++, 4 * i); + memcpy(inbuf, in, inlen); + for (i = 0; i < inbuflen; i++) + ipc_data_writel(scu, inbuf[i], 4 * i); - cmdval = (inlen << 16) | (sub << 12) | cmd; + cmdval = (size << 16) | (sub << 12) | cmd; ipc_command(scu, cmdval); err = intel_scu_ipc_check_status(scu); if (!err) { - for (i = 0; i < outlen; i++) - *out++ = ipc_data_readl(scu, 4 * i); + u32 outbuf[4] = {}; + + for (i = 0; i < outbuflen; i++) + outbuf[i] = ipc_data_readl(scu, 4 * i); + + memcpy(out, outbuf, outlen); } mutex_unlock(&ipclock); @@ -390,7 +514,7 @@ int intel_scu_ipc_command(int cmd, int sub, u32 *in, int inlen, dev_err(&scu->dev, "IPC command %#x failed with %d\n", cmdval, err); return err; } -EXPORT_SYMBOL(intel_scu_ipc_command); +EXPORT_SYMBOL(intel_scu_ipc_dev_command_with_size); /* * Interrupt handler gets called when ioc bit of IPC_COMMAND_REG set to 1 @@ -423,17 +547,20 @@ static void intel_scu_ipc_release(struct device *dev) } /** - * intel_scu_ipc_register() - Register SCU IPC device + * __intel_scu_ipc_register() - Register SCU IPC device * @parent: Parent device * @scu_data: Data used to configure SCU IPC + * @owner: Module registering the SCU IPC device * * Call this function to register SCU IPC mechanism under @parent. * Returns pointer to the new SCU IPC device or ERR_PTR() in case of - * failure. + * failure. The caller may use the returned instance if it needs to do + * SCU IPC calls itself. */ struct intel_scu_ipc_dev * -intel_scu_ipc_register(struct device *parent, - const struct intel_scu_ipc_data *scu_data) +__intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner) { int err; struct intel_scu_ipc_dev *scu; @@ -452,6 +579,7 @@ intel_scu_ipc_register(struct device *parent, goto err_unlock; } + scu->owner = owner; scu->dev.parent = parent; scu->dev.class = &intel_scu_ipc_class; scu->dev.release = intel_scu_ipc_release; @@ -507,7 +635,7 @@ err_unlock: return ERR_PTR(err); } -EXPORT_SYMBOL_GPL(intel_scu_ipc_register); +EXPORT_SYMBOL_GPL(__intel_scu_ipc_register); static int __init intel_scu_ipc_init(void) { -- cgit v1.2.3 From 7e18c89d6e37bfffa2b161b08f40b53d2ce7ee94 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:40 +0300 Subject: platform/x86: intel_scu_ipc: Add managed function to register SCU IPC Drivers such as intel_pmc_ipc.c can be unloaded as well so in order to support those in this driver add a new function that can be called to unregister the SCU IPC when it is not needed anymore. We also add a managed version of the intel_scu_ipc_register() that takes care of calling intel_scu_ipc_unregister() automatically when the driver is unbound. 
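[ Illustration, not part of the patch: a hypothetical platform driver
  probe using the managed registration; the function name and resource
  handling below are assumptions, not taken from any existing driver:

	static int example_scu_probe(struct platform_device *pdev)
	{
		struct intel_scu_ipc_data scu_data = {};
		struct intel_scu_ipc_dev *scu;
		struct resource *res;

		res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
		if (!res)
			return -ENODEV;
		scu_data.mem = *res;
		scu_data.irq = platform_get_irq_optional(pdev, 0);

		scu = devm_intel_scu_ipc_register(&pdev->dev, &scu_data);
		if (IS_ERR_OR_NULL(scu))
			return scu ? PTR_ERR(scu) : -ENOMEM;

		/* No remove() cleanup needed: unbind unregisters via devres. */
		return 0;
	}
]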
Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel_scu_ipc.h | 10 ++++++ drivers/platform/x86/intel_scu_ipc.c | 62 ++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index d5f6ae514172..11d457af68c5 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -25,6 +25,16 @@ __intel_scu_ipc_register(struct device *parent, #define intel_scu_ipc_register(parent, scu_data) \ __intel_scu_ipc_register(parent, scu_data, THIS_MODULE) +void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu); + +struct intel_scu_ipc_dev * +__devm_intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner); + +#define devm_intel_scu_ipc_register(parent, scu_data) \ + __devm_intel_scu_ipc_register(parent, scu_data, THIS_MODULE) + struct intel_scu_ipc_dev *intel_scu_ipc_dev_get(void); void intel_scu_ipc_dev_put(struct intel_scu_ipc_dev *scu); struct intel_scu_ipc_dev *devm_intel_scu_ipc_dev_get(struct device *dev); diff --git a/drivers/platform/x86/intel_scu_ipc.c b/drivers/platform/x86/intel_scu_ipc.c index 608034ea7af5..d9cf7f7602b0 100644 --- a/drivers/platform/x86/intel_scu_ipc.c +++ b/drivers/platform/x86/intel_scu_ipc.c @@ -637,6 +637,68 @@ err_unlock: } EXPORT_SYMBOL_GPL(__intel_scu_ipc_register); +/** + * intel_scu_ipc_unregister() - Unregister SCU IPC + * @scu: SCU IPC handle + * + * This unregisters the SCU IPC device and releases the acquired + * resources once the refcount goes to zero. + */ +void intel_scu_ipc_unregister(struct intel_scu_ipc_dev *scu) +{ + mutex_lock(&ipclock); + if (!WARN_ON(!ipcdev)) { + ipcdev = NULL; + device_unregister(&scu->dev); + } + mutex_unlock(&ipclock); +} +EXPORT_SYMBOL_GPL(intel_scu_ipc_unregister); + +static void devm_intel_scu_ipc_unregister(struct device *dev, void *res) +{ + struct intel_scu_ipc_devres *dr = res; + struct intel_scu_ipc_dev *scu = dr->scu; + + intel_scu_ipc_unregister(scu); +} + +/** + * __devm_intel_scu_ipc_register() - Register managed SCU IPC device + * @parent: Parent device + * @scu_data: Data used to configure SCU IPC + * @owner: Module registering the SCU IPC device + * + * Call this function to register managed SCU IPC mechanism under + * @parent. Returns pointer to the new SCU IPC device or ERR_PTR() in + * case of failure. The caller may use the returned instance if it needs + * to do SCU IPC calls itself. + */ +struct intel_scu_ipc_dev * +__devm_intel_scu_ipc_register(struct device *parent, + const struct intel_scu_ipc_data *scu_data, + struct module *owner) +{ + struct intel_scu_ipc_devres *dr; + struct intel_scu_ipc_dev *scu; + + dr = devres_alloc(devm_intel_scu_ipc_unregister, sizeof(*dr), GFP_KERNEL); + if (!dr) + return NULL; + + scu = __intel_scu_ipc_register(parent, scu_data, owner); + if (IS_ERR(scu)) { + devres_free(dr); + return scu; + } + + dr->scu = scu; + devres_add(parent, dr); + + return scu; +} +EXPORT_SYMBOL_GPL(__devm_intel_scu_ipc_register); + static int __init intel_scu_ipc_init(void) { return class_register(&intel_scu_ipc_class); -- cgit v1.2.3 From 4181bc8f6fab150f197a696861599502727e0e95 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:43 +0300 Subject: mfd: intel_soc_pmic_bxtwc: Convert to use new SCU IPC API Convert the Intel Broxton Whiskey Cover PMIC driver to use the new SCU IPC API. 
This allows us to get rid of the PMC IPC implementation which is now covered in SCU IPC driver. We drop the error log if the IPC command fails because intel_scu_ipc_dev_command() does that already. Also move PMIC specific IPC message constants to the PMIC driver from the intel_pmc_ipc.h header. Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel_pmc_ipc.h | 3 --- drivers/mfd/intel_soc_pmic_bxtwc.c | 34 +++++++++++++++++----------------- 2 files changed, 17 insertions(+), 20 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index e6da1ce26256..b438a488f613 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -3,9 +3,6 @@ #define _ASM_X86_INTEL_PMC_IPC_H_ /* Commands */ -#define PMC_IPC_PMIC_ACCESS 0xFF -#define PMC_IPC_PMIC_ACCESS_READ 0x0 -#define PMC_IPC_PMIC_ACCESS_WRITE 0x1 #define PMC_IPC_USB_PWR_CTRL 0xF0 #define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF #define PMC_IPC_PHY_CONFIG 0xEE diff --git a/drivers/mfd/intel_soc_pmic_bxtwc.c b/drivers/mfd/intel_soc_pmic_bxtwc.c index 739cfb5b69fe..eba89780dbe7 100644 --- a/drivers/mfd/intel_soc_pmic_bxtwc.c +++ b/drivers/mfd/intel_soc_pmic_bxtwc.c @@ -15,7 +15,7 @@ #include #include -#include +#include /* PMIC device registers */ #define REG_ADDR_MASK 0xFF00 @@ -58,6 +58,10 @@ /* Whiskey Cove PMIC share same ACPI ID between different platforms */ #define BROXTON_PMIC_WC_HRV 4 +#define PMC_PMIC_ACCESS 0xFF +#define PMC_PMIC_READ 0x0 +#define PMC_PMIC_WRITE 0x1 + enum bxtwc_irqs { BXTWC_PWRBTN_LVL1_IRQ = 0, BXTWC_TMU_LVL1_IRQ, @@ -288,13 +292,12 @@ static int regmap_ipc_byte_reg_read(void *context, unsigned int reg, ipc_in[0] = reg; ipc_in[1] = i2c_addr; - ret = intel_pmc_ipc_command(PMC_IPC_PMIC_ACCESS, - PMC_IPC_PMIC_ACCESS_READ, - ipc_in, sizeof(ipc_in), (u32 *)ipc_out, 1); - if (ret) { - dev_err(pmic->dev, "Failed to read from PMIC\n"); + ret = intel_scu_ipc_dev_command(pmic->scu, PMC_PMIC_ACCESS, + PMC_PMIC_READ, ipc_in, sizeof(ipc_in), + ipc_out, sizeof(ipc_out)); + if (ret) return ret; - } + *val = ipc_out[0]; return 0; @@ -303,7 +306,6 @@ static int regmap_ipc_byte_reg_read(void *context, unsigned int reg, static int regmap_ipc_byte_reg_write(void *context, unsigned int reg, unsigned int val) { - int ret; int i2c_addr; u8 ipc_in[3]; struct intel_soc_pmic *pmic = context; @@ -321,15 +323,9 @@ static int regmap_ipc_byte_reg_write(void *context, unsigned int reg, ipc_in[0] = reg; ipc_in[1] = i2c_addr; ipc_in[2] = val; - ret = intel_pmc_ipc_command(PMC_IPC_PMIC_ACCESS, - PMC_IPC_PMIC_ACCESS_WRITE, - ipc_in, sizeof(ipc_in), NULL, 0); - if (ret) { - dev_err(pmic->dev, "Failed to write to PMIC\n"); - return ret; - } - - return 0; + return intel_scu_ipc_dev_command(pmic->scu, PMC_PMIC_ACCESS, + PMC_PMIC_WRITE, ipc_in, sizeof(ipc_in), + NULL, 0); } /* sysfs interfaces to r/w PMIC registers, required by initial script */ @@ -457,6 +453,10 @@ static int bxtwc_probe(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, pmic); pmic->dev = &pdev->dev; + pmic->scu = devm_intel_scu_ipc_dev_get(&pdev->dev); + if (!pmic->scu) + return -EPROBE_DEFER; + pmic->regmap = devm_regmap_init(&pdev->dev, NULL, pmic, &bxtwc_regmap_config); if (IS_ERR(pmic->regmap)) { -- cgit v1.2.3 From 68c73fb224778b8fd52b386f971a5cb3bca67878 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:45 +0300 Subject: platform/x86: intel_telemetry: Convert to use new SCU IPC API 
Convert the Intel Apollo Lake telemetry driver to use the new SCU IPC API. This allows us to get rid of the duplicate PMC IPC implementation which is now covered in SCU IPC driver. Also move telemetry specific IPC message constant to the telemetry driver where it belongs. Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel_pmc_ipc.h | 1 - arch/x86/include/asm/intel_telemetry.h | 3 + drivers/platform/x86/intel_telemetry_pltdrv.c | 95 +++++++++++++-------------- 3 files changed, 49 insertions(+), 50 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index b438a488f613..ddc964b9c78c 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -8,7 +8,6 @@ #define PMC_IPC_PHY_CONFIG 0xEE #define PMC_IPC_NORTHPEAK_CTRL 0xED #define PMC_IPC_PM_DEBUG 0xEC -#define PMC_IPC_PMC_TELEMTRY 0xEB #define PMC_IPC_PMC_FW_MSG_CTRL 0xEA /* IPC return code */ diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 2f77e31a1283..274aaf0dae48 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -10,6 +10,8 @@ #define TELEM_MAX_EVENTS_SRAM 28 #define TELEM_MAX_OS_ALLOCATED_EVENTS 20 +#include + enum telemetry_unit { TELEM_PSS = 0, TELEM_IOSS, @@ -51,6 +53,7 @@ struct telemetry_plt_config { struct telemetry_unit_config ioss_config; struct mutex telem_trace_lock; struct mutex telem_lock; + struct intel_scu_ipc_dev *scu; bool telem_in_use; }; diff --git a/drivers/platform/x86/intel_telemetry_pltdrv.c b/drivers/platform/x86/intel_telemetry_pltdrv.c index 987a24e3344e..e45303f99303 100644 --- a/drivers/platform/x86/intel_telemetry_pltdrv.c +++ b/drivers/platform/x86/intel_telemetry_pltdrv.c @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -35,6 +34,7 @@ #define TELEM_SSRAM_STARTTIME_OFFSET 8 #define TELEM_SSRAM_EVTLOG_OFFSET 16 +#define IOSS_TELEM 0xeb #define IOSS_TELEM_EVENT_READ 0x0 #define IOSS_TELEM_EVENT_WRITE 0x1 #define IOSS_TELEM_INFO_READ 0x2 @@ -42,9 +42,6 @@ #define IOSS_TELEM_TRACE_CTL_WRITE 0x6 #define IOSS_TELEM_EVENT_CTL_READ 0x7 #define IOSS_TELEM_EVENT_CTL_WRITE 0x8 -#define IOSS_TELEM_EVT_CTRL_WRITE_SIZE 0x4 -#define IOSS_TELEM_READ_WORD 0x1 -#define IOSS_TELEM_WRITE_FOURBYTES 0x4 #define IOSS_TELEM_EVT_WRITE_SIZE 0x3 #define TELEM_INFO_SRAMEVTS_MASK 0xFF00 @@ -250,17 +247,14 @@ static int telemetry_check_evtid(enum telemetry_unit telem_unit, static inline int telemetry_plt_config_ioss_event(u32 evt_id, int index) { u32 write_buf; - int ret; write_buf = evt_id | TELEM_EVENT_ENABLE; write_buf <<= BITS_PER_BYTE; write_buf |= index; - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, - IOSS_TELEM_EVENT_WRITE, (u8 *)&write_buf, - IOSS_TELEM_EVT_WRITE_SIZE, NULL, 0); - - return ret; + return intel_scu_ipc_dev_command(telm_conf->scu, IOSS_TELEM, + IOSS_TELEM_EVENT_WRITE, &write_buf, + IOSS_TELEM_EVT_WRITE_SIZE, NULL, 0); } static inline int telemetry_plt_config_pss_event(u32 evt_id, int index) @@ -278,6 +272,7 @@ static inline int telemetry_plt_config_pss_event(u32 evt_id, int index) static int telemetry_setup_iossevtconfig(struct telemetry_evtconfig evtconfig, enum telemetry_action action) { + struct intel_scu_ipc_dev *scu = telm_conf->scu; u8 num_ioss_evts, ioss_period; int ret, index, idx; u32 *ioss_evtmap; @@ -288,9 +283,9 @@ static int telemetry_setup_iossevtconfig(struct telemetry_evtconfig evtconfig, ioss_evtmap 
= evtconfig.evtmap; /* Get telemetry EVENT CTL */ - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, IOSS_TELEM_EVENT_CTL_READ, NULL, 0, - &telem_ctrl, IOSS_TELEM_READ_WORD); + &telem_ctrl, sizeof(telem_ctrl)); if (ret) { pr_err("IOSS TELEM_CTRL Read Failed\n"); return ret; @@ -299,11 +294,9 @@ static int telemetry_setup_iossevtconfig(struct telemetry_evtconfig evtconfig, /* Disable Telemetry */ TELEM_DISABLE(telem_ctrl); - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, - IOSS_TELEM_EVENT_CTL_WRITE, - (u8 *)&telem_ctrl, - IOSS_TELEM_EVT_CTRL_WRITE_SIZE, - NULL, 0); + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, + IOSS_TELEM_EVENT_CTL_WRITE, &telem_ctrl, + sizeof(telem_ctrl), NULL, 0); if (ret) { pr_err("IOSS TELEM_CTRL Event Disable Write Failed\n"); return ret; @@ -315,10 +308,9 @@ static int telemetry_setup_iossevtconfig(struct telemetry_evtconfig evtconfig, /* Clear All Events */ TELEM_CLEAR_EVENTS(telem_ctrl); - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, IOSS_TELEM_EVENT_CTL_WRITE, - (u8 *)&telem_ctrl, - IOSS_TELEM_EVT_CTRL_WRITE_SIZE, + &telem_ctrl, sizeof(telem_ctrl), NULL, 0); if (ret) { pr_err("IOSS TELEM_CTRL Event Disable Write Failed\n"); @@ -344,10 +336,9 @@ static int telemetry_setup_iossevtconfig(struct telemetry_evtconfig evtconfig, /* Clear All Events */ TELEM_CLEAR_EVENTS(telem_ctrl); - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, IOSS_TELEM_EVENT_CTL_WRITE, - (u8 *)&telem_ctrl, - IOSS_TELEM_EVT_CTRL_WRITE_SIZE, + &telem_ctrl, sizeof(telem_ctrl), NULL, 0); if (ret) { pr_err("IOSS TELEM_CTRL Event Disable Write Failed\n"); @@ -396,10 +387,9 @@ static int telemetry_setup_iossevtconfig(struct telemetry_evtconfig evtconfig, TELEM_ENABLE_PERIODIC(telem_ctrl); telem_ctrl |= ioss_period; - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, IOSS_TELEM_EVENT_CTL_WRITE, - (u8 *)&telem_ctrl, - IOSS_TELEM_EVT_CTRL_WRITE_SIZE, NULL, 0); + &telem_ctrl, sizeof(telem_ctrl), NULL, 0); if (ret) { pr_err("IOSS TELEM_CTRL Event Enable Write Failed\n"); return ret; @@ -586,8 +576,9 @@ static int telemetry_setup(struct platform_device *pdev) u32 read_buf, events, event_regs; int ret; - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, IOSS_TELEM_INFO_READ, - NULL, 0, &read_buf, IOSS_TELEM_READ_WORD); + ret = intel_scu_ipc_dev_command(telm_conf->scu, IOSS_TELEM, + IOSS_TELEM_INFO_READ, NULL, 0, + &read_buf, sizeof(read_buf)); if (ret) { dev_err(&pdev->dev, "IOSS TELEM_INFO Read Failed\n"); return ret; @@ -681,6 +672,8 @@ static int telemetry_plt_set_sampling_period(u8 pss_period, u8 ioss_period) mutex_lock(&(telm_conf->telem_lock)); if (ioss_period) { + struct intel_scu_ipc_dev *scu = telm_conf->scu; + if (TELEM_SAMPLE_PERIOD_INVALID(ioss_period)) { pr_err("IOSS Sampling Period Out of Range\n"); ret = -EINVAL; @@ -688,9 +681,9 @@ static int telemetry_plt_set_sampling_period(u8 pss_period, u8 ioss_period) } /* Get telemetry EVENT CTL */ - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, IOSS_TELEM_EVENT_CTL_READ, NULL, 0, - &telem_ctrl, IOSS_TELEM_READ_WORD); + &telem_ctrl, sizeof(telem_ctrl)); if (ret) { pr_err("IOSS TELEM_CTRL Read Failed\n"); goto out; @@ -699,11 +692,10 @@ static int telemetry_plt_set_sampling_period(u8 pss_period, u8 ioss_period) /* Disable Telemetry */ TELEM_DISABLE(telem_ctrl); - ret = 
intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, - IOSS_TELEM_EVENT_CTL_WRITE, - (u8 *)&telem_ctrl, - IOSS_TELEM_EVT_CTRL_WRITE_SIZE, - NULL, 0); + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, + IOSS_TELEM_EVENT_CTL_WRITE, + &telem_ctrl, sizeof(telem_ctrl), + NULL, 0); if (ret) { pr_err("IOSS TELEM_CTRL Event Disable Write Failed\n"); goto out; @@ -715,11 +707,10 @@ static int telemetry_plt_set_sampling_period(u8 pss_period, u8 ioss_period) TELEM_ENABLE_PERIODIC(telem_ctrl); telem_ctrl |= ioss_period; - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, - IOSS_TELEM_EVENT_CTL_WRITE, - (u8 *)&telem_ctrl, - IOSS_TELEM_EVT_CTRL_WRITE_SIZE, - NULL, 0); + ret = intel_scu_ipc_dev_command(scu, IOSS_TELEM, + IOSS_TELEM_EVENT_CTL_WRITE, + &telem_ctrl, sizeof(telem_ctrl), + NULL, 0); if (ret) { pr_err("IOSS TELEM_CTRL Event Enable Write Failed\n"); goto out; @@ -1014,9 +1005,9 @@ static int telemetry_plt_get_trace_verbosity(enum telemetry_unit telem_unit, break; case TELEM_IOSS: - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, - IOSS_TELEM_TRACE_CTL_READ, NULL, 0, &temp, - IOSS_TELEM_READ_WORD); + ret = intel_scu_ipc_dev_command(telm_conf->scu, + IOSS_TELEM, IOSS_TELEM_TRACE_CTL_READ, + NULL, 0, &temp, sizeof(temp)); if (ret) { pr_err("IOSS TRACE_CTL Read Failed\n"); goto out; @@ -1068,9 +1059,9 @@ static int telemetry_plt_set_trace_verbosity(enum telemetry_unit telem_unit, break; case TELEM_IOSS: - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, - IOSS_TELEM_TRACE_CTL_READ, NULL, 0, &temp, - IOSS_TELEM_READ_WORD); + ret = intel_scu_ipc_dev_command(telm_conf->scu, IOSS_TELEM, + IOSS_TELEM_TRACE_CTL_READ, + NULL, 0, &temp, sizeof(temp)); if (ret) { pr_err("IOSS TRACE_CTL Read Failed\n"); goto out; @@ -1079,9 +1070,9 @@ static int telemetry_plt_set_trace_verbosity(enum telemetry_unit telem_unit, TELEM_CLEAR_VERBOSITY_BITS(temp); TELEM_SET_VERBOSITY_BITS(temp, verbosity); - ret = intel_pmc_ipc_command(PMC_IPC_PMC_TELEMTRY, - IOSS_TELEM_TRACE_CTL_WRITE, (u8 *)&temp, - IOSS_TELEM_WRITE_FOURBYTES, NULL, 0); + ret = intel_scu_ipc_dev_command(telm_conf->scu, IOSS_TELEM, + IOSS_TELEM_TRACE_CTL_WRITE, + &temp, sizeof(temp), NULL, 0); if (ret) { pr_err("IOSS TRACE_CTL Verbosity Set Failed\n"); goto out; @@ -1136,6 +1127,12 @@ static int telemetry_pltdrv_probe(struct platform_device *pdev) telm_conf->ioss_config.regmap = mem; + telm_conf->scu = devm_intel_scu_ipc_dev_get(&pdev->dev); + if (!telm_conf->scu) { + ret = -EPROBE_DEFER; + goto out; + } + mutex_init(&telm_conf->telem_lock); mutex_init(&telm_conf->telem_trace_lock); -- cgit v1.2.3 From 7713f9180cb410fbec3ae5d5024a8440e2c6c809 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:47 +0300 Subject: platform/x86: intel_pmc_ipc: Drop intel_pmc_ipc_command() Now that all callers have been converted over to the SCU IPC API we can drop intel_pmc_ipc_command(). 
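For reference only (not part of the patch; the command values and buffer sizes are placeholders), a former intel_pmc_ipc_command() caller converts to the SCU IPC API roughly as sketched below. The new API takes an explicit SCU instance (typically obtained with devm_intel_scu_ipc_dev_get(), or NULL for the default one) and both lengths in bytes:

	#include <asm/intel_scu_ipc.h>

	static int example_send(struct intel_scu_ipc_dev *scu)
	{
		u8 in[4] = {};
		u32 out;

		/*
		 * Was: intel_pmc_ipc_command(cmd, sub, in, inlen, out, outlen);
		 * Command 0xeb/0x2 is only an example value here.
		 */
		return intel_scu_ipc_dev_command(scu, 0xeb, 0x2, in, sizeof(in),
						 &out, sizeof(out));
	}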
Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel_pmc_ipc.h | 8 -------- drivers/platform/x86/intel_pmc_ipc.c | 20 -------------------- 2 files changed, 28 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h index ddc964b9c78c..22848df5faaf 100644 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ b/arch/x86/include/asm/intel_pmc_ipc.h @@ -27,19 +27,11 @@ #if IS_ENABLED(CONFIG_INTEL_PMC_IPC) -int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, - u32 *out, u32 outlen); int intel_pmc_s0ix_counter_read(u64 *data); int intel_pmc_gcr_read64(u32 offset, u64 *data); #else -static inline int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, - u32 *out, u32 outlen) -{ - return -EINVAL; -} - static inline int intel_pmc_s0ix_counter_read(u64 *data) { return -EINVAL; diff --git a/drivers/platform/x86/intel_pmc_ipc.c b/drivers/platform/x86/intel_pmc_ipc.c index 0558eadc2a14..e4110df24490 100644 --- a/drivers/platform/x86/intel_pmc_ipc.c +++ b/drivers/platform/x86/intel_pmc_ipc.c @@ -194,26 +194,6 @@ static int update_no_reboot_bit(void *priv, bool set) PMC_CFG_NO_REBOOT_MASK, value); } -/** - * intel_pmc_ipc_command() - IPC command with input/output data - * @cmd: IPC command code. - * @sub: IPC command sub type. - * @in: input data of this IPC command. - * @inlen: input data length in bytes. - * @out: output data of this IPC command. - * @outlen: output data length in dwords. - * - * Send an IPC command to PMC with input/output data. - * - * Return: an IPC error code or 0 on success. - */ -int intel_pmc_ipc_command(u32 cmd, u32 sub, u8 *in, u32 inlen, - u32 *out, u32 outlen) -{ - return intel_scu_ipc_dev_command(NULL, cmd, sub, in, inlen, out, outlen); -} -EXPORT_SYMBOL_GPL(intel_pmc_ipc_command); - static int ipc_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct intel_pmc_ipc_dev *pmc = &ipcdev; -- cgit v1.2.3 From 781adff21c841196e846a5a88505f80d04b06016 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:48 +0300 Subject: x86/platform/intel-mid: Add empty stubs for intel_scu_devices_[create|destroy]() This allows to call the functions even when CONFIG_X86_INTEL_MID is not enabled. 
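Illustrative only (the calling driver below is hypothetical): with the stubs in place a caller no longer needs an #ifdef CONFIG_X86_INTEL_MID guard, because the calls compile to empty inlines on other configurations:

	#include <linux/platform_device.h>
	#include <asm/intel-mid.h>

	static int example_scu_probe(struct platform_device *pdev)
	{
		/* ... SCU IPC setup ... */

		/* No-op stub when CONFIG_X86_INTEL_MID is not enabled */
		intel_scu_devices_create();
		return 0;
	}

	static int example_scu_remove(struct platform_device *pdev)
	{
		intel_scu_devices_destroy();
		return 0;
	}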
Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel-mid.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index 8e5af119dc2d..de58391bdee0 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -88,11 +88,17 @@ static inline bool intel_mid_has_msic(void) return (intel_mid_identify_cpu() == INTEL_MID_CPU_CHIP_PENWELL); } +extern void intel_scu_devices_create(void); +extern void intel_scu_devices_destroy(void); + #else /* !CONFIG_X86_INTEL_MID */ #define intel_mid_identify_cpu() 0 #define intel_mid_has_msic() 0 +static inline void intel_scu_devices_create(void) { } +static inline void intel_scu_devices_destroy(void) { } + #endif /* !CONFIG_X86_INTEL_MID */ enum intel_mid_timer_options { @@ -115,9 +121,6 @@ extern enum intel_mid_timer_options intel_mid_timer_options; #define SFI_MTMR_MAX_NUM 8 #define SFI_MRTC_MAX 8 -extern void intel_scu_devices_create(void); -extern void intel_scu_devices_destroy(void); - /* VRTC timer */ #define MRST_VRTC_MAP_SZ 1024 /* #define MRST_VRTC_PGOFFSET 0xc00 */ -- cgit v1.2.3 From 0759a8730c7070299556af8dddeecce90955c8ae Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:50 +0300 Subject: platform/x86: intel_telemetry: Add telemetry_get_pltdata() Add new function that allows telemetry modules to get pointer to the platform specific configuration. This is needed to allow the telemetry debugfs module to fetch PMC IPC instance in the subsequent patch. This also allows us to replace telemetry_pltconfig_valid() with telemetry_get_pltdata() as well. Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- arch/x86/include/asm/intel_telemetry.h | 2 +- drivers/platform/x86/intel_telemetry_core.c | 17 ++++++----------- drivers/platform/x86/intel_telemetry_debugfs.c | 3 +-- 3 files changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 274aaf0dae48..2c0e7d7a10e9 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -95,7 +95,7 @@ int telemetry_set_pltdata(const struct telemetry_core_ops *ops, int telemetry_clear_pltdata(void); -int telemetry_pltconfig_valid(void); +struct telemetry_plt_config *telemetry_get_pltdata(void); int telemetry_get_evtname(enum telemetry_unit telem_unit, const char **name, int len); diff --git a/drivers/platform/x86/intel_telemetry_core.c b/drivers/platform/x86/intel_telemetry_core.c index d4040bb222b4..fdf55b5d6948 100644 --- a/drivers/platform/x86/intel_telemetry_core.c +++ b/drivers/platform/x86/intel_telemetry_core.c @@ -353,21 +353,16 @@ int telemetry_clear_pltdata(void) EXPORT_SYMBOL_GPL(telemetry_clear_pltdata); /** - * telemetry_pltconfig_valid() - Checkif platform config is valid + * telemetry_get_pltdata() - Return telemetry platform config * - * Usage by other than telemetry module is invalid - * - * Return: 0 success, < 0 for failure + * May be used by other telemetry modules to get platform specific + * configuration. 
*/ -int telemetry_pltconfig_valid(void) +struct telemetry_plt_config *telemetry_get_pltdata(void) { - if (telm_core_conf.plt_config) - return 0; - - else - return -EINVAL; + return telm_core_conf.plt_config; } -EXPORT_SYMBOL_GPL(telemetry_pltconfig_valid); +EXPORT_SYMBOL_GPL(telemetry_get_pltdata); static inline int telemetry_get_pssevtname(enum telemetry_unit telem_unit, const char **name, int len) diff --git a/drivers/platform/x86/intel_telemetry_debugfs.c b/drivers/platform/x86/intel_telemetry_debugfs.c index 8a53d3b485b3..6cac3e05b817 100644 --- a/drivers/platform/x86/intel_telemetry_debugfs.c +++ b/drivers/platform/x86/intel_telemetry_debugfs.c @@ -910,8 +910,7 @@ static int __init telemetry_debugfs_init(void) debugfs_conf = (struct telemetry_debugfs_conf *)id->driver_data; - err = telemetry_pltconfig_valid(); - if (err < 0) { + if (!telemetry_get_pltdata()) { pr_info("Invalid pltconfig, ensure IPC1 device is enabled in BIOS\n"); return -ENODEV; } -- cgit v1.2.3 From 25f1ca31e230598eaf3c38d387a355a64bd772a7 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 16 Apr 2020 11:15:51 +0300 Subject: platform/x86: intel_pmc_ipc: Convert to MFD This driver only creates a bunch of platform devices sharing resources belonging to the PMC device. This is pretty much what MFD subsystem is for so move the driver there, renaming it to intel_pmc_bxt.c which should be more clear what it is. MFD subsystem provides nice helper APIs for subdevice creation so convert the driver to use those. Unfortunately the ACPI device includes separate resources for most of the subdevices so we cannot simply call mfd_add_devices() to create all of them but instead we need to call it separately for each device. The new MFD driver continues to expose two sysfs attributes that allow userspace to send IPC commands to the PMC/SCU to avoid breaking any existing applications that may use these. Generally this is bad idea so document this in the ABI documentation. Signed-off-by: Mika Westerberg Reviewed-by: Andy Shevchenko Signed-off-by: Lee Jones --- .../ABI/obsolete/sysfs-driver-intel_pmc_bxt | 22 + arch/x86/include/asm/intel_pmc_ipc.h | 47 -- arch/x86/include/asm/intel_telemetry.h | 1 + drivers/mfd/Kconfig | 16 +- drivers/mfd/Makefile | 1 + drivers/mfd/intel_pmc_bxt.c | 468 +++++++++++++++ drivers/platform/x86/Kconfig | 16 +- drivers/platform/x86/Makefile | 1 - drivers/platform/x86/intel_pmc_ipc.c | 645 --------------------- drivers/platform/x86/intel_telemetry_debugfs.c | 12 +- drivers/platform/x86/intel_telemetry_pltdrv.c | 2 + drivers/usb/typec/tcpm/Kconfig | 2 +- drivers/watchdog/iTCO_wdt.c | 25 +- include/linux/mfd/intel_pmc_bxt.h | 53 ++ include/linux/platform_data/itco_wdt.h | 11 +- 15 files changed, 602 insertions(+), 720 deletions(-) create mode 100644 Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt delete mode 100644 arch/x86/include/asm/intel_pmc_ipc.h create mode 100644 drivers/mfd/intel_pmc_bxt.c delete mode 100644 drivers/platform/x86/intel_pmc_ipc.c create mode 100644 include/linux/mfd/intel_pmc_bxt.h (limited to 'arch/x86/include') diff --git a/Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt b/Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt new file mode 100644 index 000000000000..39d5659f388b --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-driver-intel_pmc_bxt @@ -0,0 +1,22 @@ +These files allow sending arbitrary IPC commands to the PMC/SCU which +may be dangerous. These will be removed eventually and should not be +used in any new applications. 
+ +What: /sys/bus/platform/devices/INT34D2:00/simplecmd +Date: Jun 2015 +KernelVersion: 4.1 +Contact: Mika Westerberg +Description: This interface allows userspace to send an arbitrary + IPC command to the PMC/SCU. + + Format: %d %d where first number is command and + second number is subcommand. + +What: /sys/bus/platform/devices/INT34D2:00/northpeak +Date: Jun 2015 +KernelVersion: 4.1 +Contact: Mika Westerberg +Description: This interface allows userspace to enable and disable + Northpeak through the PMC/SCU. + + Format: %u. diff --git a/arch/x86/include/asm/intel_pmc_ipc.h b/arch/x86/include/asm/intel_pmc_ipc.h deleted file mode 100644 index 22848df5faaf..000000000000 --- a/arch/x86/include/asm/intel_pmc_ipc.h +++ /dev/null @@ -1,47 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_INTEL_PMC_IPC_H_ -#define _ASM_X86_INTEL_PMC_IPC_H_ - -/* Commands */ -#define PMC_IPC_USB_PWR_CTRL 0xF0 -#define PMC_IPC_PMIC_BLACKLIST_SEL 0xEF -#define PMC_IPC_PHY_CONFIG 0xEE -#define PMC_IPC_NORTHPEAK_CTRL 0xED -#define PMC_IPC_PM_DEBUG 0xEC -#define PMC_IPC_PMC_FW_MSG_CTRL 0xEA - -/* IPC return code */ -#define IPC_ERR_NONE 0 -#define IPC_ERR_CMD_NOT_SUPPORTED 1 -#define IPC_ERR_CMD_NOT_SERVICED 2 -#define IPC_ERR_UNABLE_TO_SERVICE 3 -#define IPC_ERR_CMD_INVALID 4 -#define IPC_ERR_CMD_FAILED 5 -#define IPC_ERR_EMSECURITY 6 -#define IPC_ERR_UNSIGNEDKERNEL 7 - -/* GCR reg offsets from gcr base*/ -#define PMC_GCR_PMC_CFG_REG 0x08 -#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78 -#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80 - -#if IS_ENABLED(CONFIG_INTEL_PMC_IPC) - -int intel_pmc_s0ix_counter_read(u64 *data); -int intel_pmc_gcr_read64(u32 offset, u64 *data); - -#else - -static inline int intel_pmc_s0ix_counter_read(u64 *data) -{ - return -EINVAL; -} - -static inline int intel_pmc_gcr_read64(u32 offset, u64 *data) -{ - return -EINVAL; -} - -#endif /*CONFIG_INTEL_PMC_IPC*/ - -#endif diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index 2c0e7d7a10e9..8046e70dfd7c 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -53,6 +53,7 @@ struct telemetry_plt_config { struct telemetry_unit_config ioss_config; struct mutex telem_trace_lock; struct mutex telem_lock; + struct intel_pmc_dev *pmc; struct intel_scu_ipc_dev *scu; bool telem_in_use; }; diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 5e21a78b6923..54b6aa4aaab1 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -551,7 +551,7 @@ config INTEL_SOC_PMIC config INTEL_SOC_PMIC_BXTWC tristate "Support for Intel Broxton Whiskey Cove PMIC" - depends on INTEL_PMC_IPC + depends on MFD_INTEL_PMC_BXT select MFD_CORE select REGMAP_IRQ help @@ -632,6 +632,20 @@ config MFD_INTEL_MSIC Passage) chip. This chip embeds audio, battery, GPIO, etc. devices used in Intel Medfield platforms. +config MFD_INTEL_PMC_BXT + tristate "Intel PMC Driver for Broxton" + depends on X86 + depends on X86_PLATFORM_DEVICES + depends on ACPI + select INTEL_SCU_IPC + select MFD_CORE + help + This driver provides support for the PMC (Power Management + Controller) on Intel Broxton and Apollo Lake. The PMC is a + multi-function device that exposes IPC, General Control + Register and P-unit access. In addition this creates devices + for iTCO watchdog and telemetry that are part of the PMC. 
+ config MFD_IPAQ_MICRO bool "Atmel Micro ASIC (iPAQ h3100/h3600/h3700) Support" depends on SA1100_H3100 || SA1100_H3600 diff --git a/drivers/mfd/Makefile b/drivers/mfd/Makefile index f935d10cbf0f..7761f84a96eb 100644 --- a/drivers/mfd/Makefile +++ b/drivers/mfd/Makefile @@ -212,6 +212,7 @@ obj-$(CONFIG_MFD_INTEL_LPSS) += intel-lpss.o obj-$(CONFIG_MFD_INTEL_LPSS_PCI) += intel-lpss-pci.o obj-$(CONFIG_MFD_INTEL_LPSS_ACPI) += intel-lpss-acpi.o obj-$(CONFIG_MFD_INTEL_MSIC) += intel_msic.o +obj-$(CONFIG_MFD_INTEL_PMC_BXT) += intel_pmc_bxt.o obj-$(CONFIG_MFD_PALMAS) += palmas.o obj-$(CONFIG_MFD_VIPERBOARD) += viperboard.o obj-$(CONFIG_MFD_RC5T583) += rc5t583.o rc5t583-irq.o diff --git a/drivers/mfd/intel_pmc_bxt.c b/drivers/mfd/intel_pmc_bxt.c new file mode 100644 index 000000000000..9f01d38acc7f --- /dev/null +++ b/drivers/mfd/intel_pmc_bxt.c @@ -0,0 +1,468 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Driver for the Intel Broxton PMC + * + * (C) Copyright 2014 - 2020 Intel Corporation + * + * This driver is based on Intel SCU IPC driver (intel_scu_ipc.c) by + * Sreedhara DS + * + * The PMC (Power Management Controller) running on the ARC processor + * communicates with another entity running in the IA (Intel Architecture) + * core through an IPC (Intel Processor Communications) mechanism which in + * turn sends messages between the IA and the PMC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Residency with clock rate at 19.2MHz to usecs */ +#define S0IX_RESIDENCY_IN_USECS(d, s) \ +({ \ + u64 result = 10ull * ((d) + (s)); \ + do_div(result, 192); \ + result; \ +}) + +/* Resources exported from IFWI */ +#define PLAT_RESOURCE_IPC_INDEX 0 +#define PLAT_RESOURCE_IPC_SIZE 0x1000 +#define PLAT_RESOURCE_GCR_OFFSET 0x1000 +#define PLAT_RESOURCE_GCR_SIZE 0x1000 +#define PLAT_RESOURCE_BIOS_DATA_INDEX 1 +#define PLAT_RESOURCE_BIOS_IFACE_INDEX 2 +#define PLAT_RESOURCE_TELEM_SSRAM_INDEX 3 +#define PLAT_RESOURCE_ISP_DATA_INDEX 4 +#define PLAT_RESOURCE_ISP_IFACE_INDEX 5 +#define PLAT_RESOURCE_GTD_DATA_INDEX 6 +#define PLAT_RESOURCE_GTD_IFACE_INDEX 7 +#define PLAT_RESOURCE_ACPI_IO_INDEX 0 + +/* + * BIOS does not create an ACPI device for each PMC function, but + * exports multiple resources from one ACPI device (IPC) for multiple + * functions. This driver is responsible for creating a child device and + * to export resources for those functions. + */ +#define SMI_EN_OFFSET 0x0040 +#define SMI_EN_SIZE 4 +#define TCO_BASE_OFFSET 0x0060 +#define TCO_REGS_SIZE 16 +#define TELEM_SSRAM_SIZE 240 +#define TELEM_PMC_SSRAM_OFFSET 0x1b00 +#define TELEM_PUNIT_SSRAM_OFFSET 0x1a00 + +/* Commands */ +#define PMC_NORTHPEAK_CTRL 0xed + +static inline bool is_gcr_valid(u32 offset) +{ + return offset < PLAT_RESOURCE_GCR_SIZE - 8; +} + +/** + * intel_pmc_gcr_read64() - Read a 64-bit PMC GCR register + * @pmc: PMC device pointer + * @offset: offset of GCR register from GCR address base + * @data: data pointer for storing the register output + * + * Reads the 64-bit PMC GCR register at given offset. + * + * Return: Negative value on error or 0 on success. 
+ */ +int intel_pmc_gcr_read64(struct intel_pmc_dev *pmc, u32 offset, u64 *data) +{ + if (!is_gcr_valid(offset)) + return -EINVAL; + + spin_lock(&pmc->gcr_lock); + *data = readq(pmc->gcr_mem_base + offset); + spin_unlock(&pmc->gcr_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(intel_pmc_gcr_read64); + +/** + * intel_pmc_gcr_update() - Update PMC GCR register bits + * @pmc: PMC device pointer + * @offset: offset of GCR register from GCR address base + * @mask: bit mask for update operation + * @val: update value + * + * Updates the bits of given GCR register as specified by + * @mask and @val. + * + * Return: Negative value on error or 0 on success. + */ +int intel_pmc_gcr_update(struct intel_pmc_dev *pmc, u32 offset, u32 mask, u32 val) +{ + u32 new_val; + + if (!is_gcr_valid(offset)) + return -EINVAL; + + spin_lock(&pmc->gcr_lock); + new_val = readl(pmc->gcr_mem_base + offset); + + new_val = (new_val & ~mask) | (val & mask); + writel(new_val, pmc->gcr_mem_base + offset); + + new_val = readl(pmc->gcr_mem_base + offset); + spin_unlock(&pmc->gcr_lock); + + /* Check whether the bit update is successful */ + return (new_val & mask) != (val & mask) ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(intel_pmc_gcr_update); + +/** + * intel_pmc_s0ix_counter_read() - Read S0ix residency + * @pmc: PMC device pointer + * @data: Out param that contains current S0ix residency count. + * + * Writes to @data how many usecs the system has been in low-power S0ix + * state. + * + * Return: An error code or 0 on success. + */ +int intel_pmc_s0ix_counter_read(struct intel_pmc_dev *pmc, u64 *data) +{ + u64 deep, shlw; + + spin_lock(&pmc->gcr_lock); + deep = readq(pmc->gcr_mem_base + PMC_GCR_TELEM_DEEP_S0IX_REG); + shlw = readq(pmc->gcr_mem_base + PMC_GCR_TELEM_SHLW_S0IX_REG); + spin_unlock(&pmc->gcr_lock); + + *data = S0IX_RESIDENCY_IN_USECS(deep, shlw); + return 0; +} +EXPORT_SYMBOL_GPL(intel_pmc_s0ix_counter_read); + +/** + * simplecmd_store() - Send a simple IPC command + * @dev: Device under the attribute is + * @attr: Attribute in question + * @buf: Buffer holding data to be stored to the attribute + * @count: Number of bytes in @buf + * + * Expects a string with two integers separated with space. These two + * values hold command and subcommand that is send to PMC. + * + * Return: Number number of bytes written (@count) or negative errno in + * case of error. + */ +static ssize_t simplecmd_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct intel_pmc_dev *pmc = dev_get_drvdata(dev); + struct intel_scu_ipc_dev *scu = pmc->scu; + int subcmd; + int cmd; + int ret; + + ret = sscanf(buf, "%d %d", &cmd, &subcmd); + if (ret != 2) { + dev_err(dev, "Invalid values, expected: cmd subcmd\n"); + return -EINVAL; + } + + ret = intel_scu_ipc_dev_simple_command(scu, cmd, subcmd); + if (ret) + return ret; + + return count; +} +static DEVICE_ATTR_WO(simplecmd); + +/** + * northpeak_store() - Enable or disable Northpeak + * @dev: Device under the attribute is + * @attr: Attribute in question + * @buf: Buffer holding data to be stored to the attribute + * @count: Number of bytes in @buf + * + * Expects an unsigned integer. Non-zero enables Northpeak and zero + * disables it. + * + * Return: Number number of bytes written (@count) or negative errno in + * case of error. 
+ */ +static ssize_t northpeak_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct intel_pmc_dev *pmc = dev_get_drvdata(dev); + struct intel_scu_ipc_dev *scu = pmc->scu; + unsigned long val; + int subcmd; + int ret; + + ret = kstrtoul(buf, 0, &val); + if (ret) + return ret; + + /* Northpeak is enabled if subcmd == 1 and disabled if it is 0 */ + if (val) + subcmd = 1; + else + subcmd = 0; + + ret = intel_scu_ipc_dev_simple_command(scu, PMC_NORTHPEAK_CTRL, subcmd); + if (ret) + return ret; + + return count; +} +static DEVICE_ATTR_WO(northpeak); + +static struct attribute *intel_pmc_attrs[] = { + &dev_attr_northpeak.attr, + &dev_attr_simplecmd.attr, + NULL +}; + +static const struct attribute_group intel_pmc_group = { + .attrs = intel_pmc_attrs, +}; + +static const struct attribute_group *intel_pmc_groups[] = { + &intel_pmc_group, + NULL +}; + +static struct resource punit_res[6]; + +static struct mfd_cell punit = { + .name = "intel_punit_ipc", + .resources = punit_res, +}; + +static struct itco_wdt_platform_data tco_pdata = { + .name = "Apollo Lake SoC", + .version = 5, + .no_reboot_use_pmc = true, +}; + +static struct resource tco_res[2]; + +static const struct mfd_cell tco = { + .name = "iTCO_wdt", + .ignore_resource_conflicts = true, + .resources = tco_res, + .num_resources = ARRAY_SIZE(tco_res), + .platform_data = &tco_pdata, + .pdata_size = sizeof(tco_pdata), +}; + +static const struct resource telem_res[] = { + DEFINE_RES_MEM(TELEM_PUNIT_SSRAM_OFFSET, TELEM_SSRAM_SIZE), + DEFINE_RES_MEM(TELEM_PMC_SSRAM_OFFSET, TELEM_SSRAM_SIZE), +}; + +static const struct mfd_cell telem = { + .name = "intel_telemetry", + .resources = telem_res, + .num_resources = ARRAY_SIZE(telem_res), +}; + +static int intel_pmc_get_tco_resources(struct platform_device *pdev) +{ + struct resource *res; + + if (acpi_has_watchdog()) + return 0; + + res = platform_get_resource(pdev, IORESOURCE_IO, + PLAT_RESOURCE_ACPI_IO_INDEX); + if (!res) { + dev_err(&pdev->dev, "Failed to get IO resource\n"); + return -EINVAL; + } + + tco_res[0].flags = IORESOURCE_IO; + tco_res[0].start = res->start + TCO_BASE_OFFSET; + tco_res[0].end = tco_res[0].start + TCO_REGS_SIZE - 1; + tco_res[1].flags = IORESOURCE_IO; + tco_res[1].start = res->start + SMI_EN_OFFSET; + tco_res[1].end = tco_res[1].start + SMI_EN_SIZE - 1; + + return 0; +} + +static int intel_pmc_get_resources(struct platform_device *pdev, + struct intel_pmc_dev *pmc, + struct intel_scu_ipc_data *scu_data) +{ + struct resource gcr_res; + size_t npunit_res = 0; + struct resource *res; + int ret; + + scu_data->irq = platform_get_irq_optional(pdev, 0); + + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_IPC_INDEX); + if (!res) { + dev_err(&pdev->dev, "Failed to get IPC resource\n"); + return -EINVAL; + } + + /* IPC registers */ + scu_data->mem.flags = res->flags; + scu_data->mem.start = res->start; + scu_data->mem.end = res->start + PLAT_RESOURCE_IPC_SIZE - 1; + + /* GCR registers */ + gcr_res.flags = res->flags; + gcr_res.start = res->start + PLAT_RESOURCE_GCR_OFFSET; + gcr_res.end = gcr_res.start + PLAT_RESOURCE_GCR_SIZE - 1; + + pmc->gcr_mem_base = devm_ioremap_resource(&pdev->dev, &gcr_res); + if (IS_ERR(pmc->gcr_mem_base)) + return PTR_ERR(pmc->gcr_mem_base); + + /* Only register iTCO watchdog if there is no WDAT ACPI table */ + ret = intel_pmc_get_tco_resources(pdev); + if (ret) + return ret; + + /* BIOS data register */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_BIOS_DATA_INDEX); + if 
(!res) { + dev_err(&pdev->dev, "Failed to get resource of P-unit BIOS data\n"); + return -EINVAL; + } + punit_res[npunit_res++] = *res; + + /* BIOS interface register */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_BIOS_IFACE_INDEX); + if (!res) { + dev_err(&pdev->dev, "Failed to get resource of P-unit BIOS interface\n"); + return -EINVAL; + } + punit_res[npunit_res++] = *res; + + /* ISP data register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_ISP_DATA_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + /* ISP interface register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_ISP_IFACE_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + /* GTD data register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_GTD_DATA_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + /* GTD interface register, optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_GTD_IFACE_INDEX); + if (res) + punit_res[npunit_res++] = *res; + + punit.num_resources = npunit_res; + + /* Telemetry SSRAM is optional */ + res = platform_get_resource(pdev, IORESOURCE_MEM, + PLAT_RESOURCE_TELEM_SSRAM_INDEX); + if (res) + pmc->telem_base = res; + + return 0; +} + +static int intel_pmc_create_devices(struct intel_pmc_dev *pmc) +{ + int ret; + + if (!acpi_has_watchdog()) { + ret = devm_mfd_add_devices(pmc->dev, PLATFORM_DEVID_AUTO, &tco, + 1, NULL, 0, NULL); + if (ret) + return ret; + } + + ret = devm_mfd_add_devices(pmc->dev, PLATFORM_DEVID_AUTO, &punit, 1, + NULL, 0, NULL); + if (ret) + return ret; + + if (pmc->telem_base) { + ret = devm_mfd_add_devices(pmc->dev, PLATFORM_DEVID_AUTO, + &telem, 1, pmc->telem_base, 0, NULL); + } + + return ret; +} + +static const struct acpi_device_id intel_pmc_acpi_ids[] = { + { "INT34D2" }, + { } +}; +MODULE_DEVICE_TABLE(acpi, intel_pmc_acpi_ids); + +static int intel_pmc_probe(struct platform_device *pdev) +{ + struct intel_scu_ipc_data scu_data = {}; + struct intel_pmc_dev *pmc; + int ret; + + pmc = devm_kzalloc(&pdev->dev, sizeof(*pmc), GFP_KERNEL); + if (!pmc) + return -ENOMEM; + + pmc->dev = &pdev->dev; + spin_lock_init(&pmc->gcr_lock); + + ret = intel_pmc_get_resources(pdev, pmc, &scu_data); + if (ret) { + dev_err(&pdev->dev, "Failed to request resources\n"); + return ret; + } + + pmc->scu = devm_intel_scu_ipc_register(&pdev->dev, &scu_data); + if (IS_ERR(pmc->scu)) + return PTR_ERR(pmc->scu); + + platform_set_drvdata(pdev, pmc); + + ret = intel_pmc_create_devices(pmc); + if (ret) + dev_err(&pdev->dev, "Failed to create PMC devices\n"); + + return ret; +} + +static struct platform_driver intel_pmc_driver = { + .probe = intel_pmc_probe, + .driver = { + .name = "intel_pmc_bxt", + .acpi_match_table = intel_pmc_acpi_ids, + .dev_groups = intel_pmc_groups, + }, +}; +module_platform_driver(intel_pmc_driver); + +MODULE_AUTHOR("Mika Westerberg "); +MODULE_AUTHOR("Zha Qipeng "); +MODULE_DESCRIPTION("Intel Broxton PMC driver"); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 303514af2e0d..642316761443 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -1269,7 +1269,8 @@ config INTEL_UNCORE_FREQ_CONTROL config INTEL_BXTWC_PMIC_TMU tristate "Intel BXT Whiskey Cove TMU Driver" depends on REGMAP - depends on INTEL_SOC_PMIC_BXTWC && INTEL_PMC_IPC + depends on MFD_INTEL_PMC_BXT + depends on INTEL_SOC_PMIC_BXTWC ---help--- Select this driver to use Intel BXT 
Whiskey Cove PMIC TMU feature. This driver enables the alarm wakeup functionality in the TMU unit @@ -1327,15 +1328,6 @@ config INTEL_PMC_CORE - LTR Ignore - MPHY/PLL gating status (Sunrisepoint PCH only) -config INTEL_PMC_IPC - tristate "Intel PMC IPC Driver" - depends on ACPI - select INTEL_SCU_IPC - ---help--- - This driver provides support for PMC control on some Intel platforms. - The PMC is an ARC processor which defines IPC commands for communication - with other entities in the CPU. - config INTEL_PUNIT_IPC tristate "Intel P-Unit IPC Driver" ---help--- @@ -1374,7 +1366,9 @@ config INTEL_SCU_IPC_UTIL config INTEL_TELEMETRY tristate "Intel SoC Telemetry Driver" - depends on INTEL_PMC_IPC && INTEL_PUNIT_IPC && X86_64 + depends on X86_64 + depends on MFD_INTEL_PMC_BXT + depends on INTEL_PUNIT_IPC ---help--- This driver provides interfaces to configure and use telemetry for INTEL SoC from APL onwards. It is also diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index ef995a0f04f0..04db27a25946 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -138,7 +138,6 @@ obj-$(CONFIG_INTEL_MFLD_THERMAL) += intel_mid_thermal.o obj-$(CONFIG_INTEL_MID_POWER_BUTTON) += intel_mid_powerbtn.o obj-$(CONFIG_INTEL_MRFLD_PWRBTN) += intel_mrfld_pwrbtn.o obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o intel_pmc_core_pltdrv.o -obj-$(CONFIG_INTEL_PMC_IPC) += intel_pmc_ipc.o obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_SCU_IPC) += intel_scu_ipc.o obj-$(CONFIG_INTEL_SCU_PCI) += intel_scu_pcidrv.o diff --git a/drivers/platform/x86/intel_pmc_ipc.c b/drivers/platform/x86/intel_pmc_ipc.c deleted file mode 100644 index 16ca4ee9cdd5..000000000000 --- a/drivers/platform/x86/intel_pmc_ipc.c +++ /dev/null @@ -1,645 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Driver for the Intel PMC IPC mechanism - * - * (C) Copyright 2014-2015 Intel Corporation - * - * This driver is based on Intel SCU IPC driver(intel_scu_ipc.c) by - * Sreedhara DS - * - * PMC running in ARC processor communicates with other entity running in IA - * core through IPC mechanism which in turn messaging between IA core ad PMC. - */ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -/* Residency with clock rate at 19.2MHz to usecs */ -#define S0IX_RESIDENCY_IN_USECS(d, s) \ -({ \ - u64 result = 10ull * ((d) + (s)); \ - do_div(result, 192); \ - result; \ -}) - -/* exported resources from IFWI */ -#define PLAT_RESOURCE_IPC_INDEX 0 -#define PLAT_RESOURCE_IPC_SIZE 0x1000 -#define PLAT_RESOURCE_GCR_OFFSET 0x1000 -#define PLAT_RESOURCE_GCR_SIZE 0x1000 -#define PLAT_RESOURCE_BIOS_DATA_INDEX 1 -#define PLAT_RESOURCE_BIOS_IFACE_INDEX 2 -#define PLAT_RESOURCE_TELEM_SSRAM_INDEX 3 -#define PLAT_RESOURCE_ISP_DATA_INDEX 4 -#define PLAT_RESOURCE_ISP_IFACE_INDEX 5 -#define PLAT_RESOURCE_GTD_DATA_INDEX 6 -#define PLAT_RESOURCE_GTD_IFACE_INDEX 7 -#define PLAT_RESOURCE_ACPI_IO_INDEX 0 - -/* - * BIOS does not create an ACPI device for each PMC function, - * but exports multiple resources from one ACPI device(IPC) for - * multiple functions. This driver is responsible to create a - * platform device and to export resources for those functions. 
- */ -#define TCO_DEVICE_NAME "iTCO_wdt" -#define SMI_EN_OFFSET 0x40 -#define SMI_EN_SIZE 4 -#define TCO_BASE_OFFSET 0x60 -#define TCO_REGS_SIZE 16 -#define PUNIT_DEVICE_NAME "intel_punit_ipc" -#define TELEMETRY_DEVICE_NAME "intel_telemetry" -#define TELEM_SSRAM_SIZE 240 -#define TELEM_PMC_SSRAM_OFFSET 0x1B00 -#define TELEM_PUNIT_SSRAM_OFFSET 0x1A00 -#define TCO_PMC_OFFSET 0x08 -#define TCO_PMC_SIZE 0x04 - -/* PMC register bit definitions */ - -/* PMC_CFG_REG bit masks */ -#define PMC_CFG_NO_REBOOT_MASK BIT_MASK(4) -#define PMC_CFG_NO_REBOOT_EN (1 << 4) -#define PMC_CFG_NO_REBOOT_DIS (0 << 4) - -static struct intel_pmc_ipc_dev { - struct device *dev; - - /* The following PMC BARs share the same ACPI device with the IPC */ - resource_size_t acpi_io_base; - int acpi_io_size; - struct platform_device *tco_dev; - - /* gcr */ - void __iomem *gcr_mem_base; - bool has_gcr_regs; - spinlock_t gcr_lock; - - /* punit */ - struct platform_device *punit_dev; - unsigned int punit_res_count; - - /* Telemetry */ - resource_size_t telem_pmc_ssram_base; - resource_size_t telem_punit_ssram_base; - int telem_pmc_ssram_size; - int telem_punit_ssram_size; - u8 telem_res_inval; - struct platform_device *telemetry_dev; -} ipcdev; - -static inline u64 gcr_data_readq(u32 offset) -{ - return readq(ipcdev.gcr_mem_base + offset); -} - -static inline int is_gcr_valid(u32 offset) -{ - if (!ipcdev.has_gcr_regs) - return -EACCES; - - if (offset > PLAT_RESOURCE_GCR_SIZE) - return -EINVAL; - - return 0; -} - -/** - * intel_pmc_gcr_read64() - Read a 64-bit PMC GCR register - * @offset: offset of GCR register from GCR address base - * @data: data pointer for storing the register output - * - * Reads the 64-bit PMC GCR register at given offset. - * - * Return: negative value on error or 0 on success. - */ -int intel_pmc_gcr_read64(u32 offset, u64 *data) -{ - int ret; - - spin_lock(&ipcdev.gcr_lock); - - ret = is_gcr_valid(offset); - if (ret < 0) { - spin_unlock(&ipcdev.gcr_lock); - return ret; - } - - *data = readq(ipcdev.gcr_mem_base + offset); - - spin_unlock(&ipcdev.gcr_lock); - - return 0; -} -EXPORT_SYMBOL_GPL(intel_pmc_gcr_read64); - -/** - * intel_pmc_gcr_update() - Update PMC GCR register bits - * @offset: offset of GCR register from GCR address base - * @mask: bit mask for update operation - * @val: update value - * - * Updates the bits of given GCR register as specified by - * @mask and @val. - * - * Return: negative value on error or 0 on success. - */ -static int intel_pmc_gcr_update(u32 offset, u32 mask, u32 val) -{ - u32 new_val; - int ret = 0; - - spin_lock(&ipcdev.gcr_lock); - - ret = is_gcr_valid(offset); - if (ret < 0) - goto gcr_ipc_unlock; - - new_val = readl(ipcdev.gcr_mem_base + offset); - - new_val &= ~mask; - new_val |= val & mask; - - writel(new_val, ipcdev.gcr_mem_base + offset); - - new_val = readl(ipcdev.gcr_mem_base + offset); - - /* check whether the bit update is successful */ - if ((new_val & mask) != (val & mask)) { - ret = -EIO; - goto gcr_ipc_unlock; - } - -gcr_ipc_unlock: - spin_unlock(&ipcdev.gcr_lock); - return ret; -} - -static int update_no_reboot_bit(void *priv, bool set) -{ - u32 value = set ? 
PMC_CFG_NO_REBOOT_EN : PMC_CFG_NO_REBOOT_DIS; - - return intel_pmc_gcr_update(PMC_GCR_PMC_CFG_REG, - PMC_CFG_NO_REBOOT_MASK, value); -} - -static ssize_t intel_pmc_ipc_simple_cmd_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct intel_scu_ipc_dev *scu = dev_get_drvdata(dev); - int subcmd; - int cmd; - int ret; - - ret = sscanf(buf, "%d %d", &cmd, &subcmd); - if (ret != 2) { - dev_err(dev, "Error args\n"); - return -EINVAL; - } - - ret = intel_scu_ipc_dev_simple_command(scu, cmd, subcmd); - if (ret) { - dev_err(dev, "command %d error with %d\n", cmd, ret); - return ret; - } - return (ssize_t)count; -} -static DEVICE_ATTR(simplecmd, 0200, NULL, intel_pmc_ipc_simple_cmd_store); - -static ssize_t intel_pmc_ipc_northpeak_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct intel_scu_ipc_dev *scu = dev_get_drvdata(dev); - unsigned long val; - int subcmd; - int ret; - - ret = kstrtoul(buf, 0, &val); - if (ret) - return ret; - - if (val) - subcmd = 1; - else - subcmd = 0; - ret = intel_scu_ipc_dev_simple_command(scu, PMC_IPC_NORTHPEAK_CTRL, subcmd); - if (ret) { - dev_err(dev, "command north %d error with %d\n", subcmd, ret); - return ret; - } - return (ssize_t)count; -} -static DEVICE_ATTR(northpeak, 0200, NULL, intel_pmc_ipc_northpeak_store); - -static struct attribute *intel_ipc_attrs[] = { - &dev_attr_northpeak.attr, - &dev_attr_simplecmd.attr, - NULL -}; - -static const struct attribute_group intel_ipc_group = { - .attrs = intel_ipc_attrs, -}; - -static const struct attribute_group *intel_ipc_groups[] = { - &intel_ipc_group, - NULL -}; - -static struct resource punit_res_array[] = { - /* Punit BIOS */ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, - /* Punit ISP */ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, - /* Punit GTD */ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, -}; - -#define TCO_RESOURCE_ACPI_IO 0 -#define TCO_RESOURCE_SMI_EN_IO 1 -#define TCO_RESOURCE_GCR_MEM 2 -static struct resource tco_res[] = { - /* ACPI - TCO */ - { - .flags = IORESOURCE_IO, - }, - /* ACPI - SMI */ - { - .flags = IORESOURCE_IO, - }, -}; - -static struct itco_wdt_platform_data tco_info = { - .name = "Apollo Lake SoC", - .version = 5, - .no_reboot_priv = &ipcdev, - .update_no_reboot_bit = update_no_reboot_bit, -}; - -#define TELEMETRY_RESOURCE_PUNIT_SSRAM 0 -#define TELEMETRY_RESOURCE_PMC_SSRAM 1 -static struct resource telemetry_res[] = { - /*Telemetry*/ - { - .flags = IORESOURCE_MEM, - }, - { - .flags = IORESOURCE_MEM, - }, -}; - -static int ipc_create_punit_device(void) -{ - struct platform_device *pdev; - const struct platform_device_info pdevinfo = { - .parent = ipcdev.dev, - .name = PUNIT_DEVICE_NAME, - .id = -1, - .res = punit_res_array, - .num_res = ipcdev.punit_res_count, - }; - - pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - ipcdev.punit_dev = pdev; - - return 0; -} - -static int ipc_create_tco_device(void) -{ - struct platform_device *pdev; - struct resource *res; - const struct platform_device_info pdevinfo = { - .parent = ipcdev.dev, - .name = TCO_DEVICE_NAME, - .id = -1, - .res = tco_res, - .num_res = ARRAY_SIZE(tco_res), - .data = &tco_info, - .size_data = sizeof(tco_info), - }; - - res = tco_res + TCO_RESOURCE_ACPI_IO; - res->start = ipcdev.acpi_io_base + TCO_BASE_OFFSET; - res->end = res->start + TCO_REGS_SIZE - 1; - - res = tco_res + TCO_RESOURCE_SMI_EN_IO; - 
res->start = ipcdev.acpi_io_base + SMI_EN_OFFSET; - res->end = res->start + SMI_EN_SIZE - 1; - - pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - ipcdev.tco_dev = pdev; - - return 0; -} - -static int ipc_create_telemetry_device(void) -{ - struct platform_device *pdev; - struct resource *res; - const struct platform_device_info pdevinfo = { - .parent = ipcdev.dev, - .name = TELEMETRY_DEVICE_NAME, - .id = -1, - .res = telemetry_res, - .num_res = ARRAY_SIZE(telemetry_res), - }; - - res = telemetry_res + TELEMETRY_RESOURCE_PUNIT_SSRAM; - res->start = ipcdev.telem_punit_ssram_base; - res->end = res->start + ipcdev.telem_punit_ssram_size - 1; - - res = telemetry_res + TELEMETRY_RESOURCE_PMC_SSRAM; - res->start = ipcdev.telem_pmc_ssram_base; - res->end = res->start + ipcdev.telem_pmc_ssram_size - 1; - - pdev = platform_device_register_full(&pdevinfo); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - - ipcdev.telemetry_dev = pdev; - - return 0; -} - -static int ipc_create_pmc_devices(void) -{ - int ret; - - /* If we have ACPI based watchdog use that instead */ - if (!acpi_has_watchdog()) { - ret = ipc_create_tco_device(); - if (ret) { - dev_err(ipcdev.dev, "Failed to add tco platform device\n"); - return ret; - } - } - - ret = ipc_create_punit_device(); - if (ret) { - dev_err(ipcdev.dev, "Failed to add punit platform device\n"); - platform_device_unregister(ipcdev.tco_dev); - return ret; - } - - if (!ipcdev.telem_res_inval) { - ret = ipc_create_telemetry_device(); - if (ret) { - dev_warn(ipcdev.dev, - "Failed to add telemetry platform device\n"); - platform_device_unregister(ipcdev.punit_dev); - platform_device_unregister(ipcdev.tco_dev); - } - } - - return ret; -} - -static int ipc_plat_get_res(struct platform_device *pdev, - struct intel_scu_ipc_data *scu_data) -{ - struct resource *res, *punit_res = punit_res_array; - resource_size_t start; - void __iomem *addr; - int size; - - res = platform_get_resource(pdev, IORESOURCE_IO, - PLAT_RESOURCE_ACPI_IO_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get io resource\n"); - return -ENXIO; - } - size = resource_size(res); - ipcdev.acpi_io_base = res->start; - ipcdev.acpi_io_size = size; - dev_info(&pdev->dev, "io res: %pR\n", res); - - ipcdev.punit_res_count = 0; - - /* This is index 0 to cover BIOS data register */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_BIOS_DATA_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get res of punit BIOS data\n"); - return -ENXIO; - } - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit BIOS data res: %pR\n", res); - - /* This is index 1 to cover BIOS interface register */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_BIOS_IFACE_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get res of punit BIOS iface\n"); - return -ENXIO; - } - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit BIOS interface res: %pR\n", res); - - /* This is index 2 to cover ISP data register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_ISP_DATA_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit ISP data res: %pR\n", res); - } - - /* This is index 3 to cover ISP interface register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_ISP_IFACE_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit ISP interface res: %pR\n", res); - } - - /* 
This is index 4 to cover GTD data register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_GTD_DATA_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit GTD data res: %pR\n", res); - } - - /* This is index 5 to cover GTD interface register, optional */ - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_GTD_IFACE_INDEX); - if (res) { - punit_res[ipcdev.punit_res_count++] = *res; - dev_info(&pdev->dev, "punit GTD interface res: %pR\n", res); - } - - scu_data->irq = platform_get_irq(pdev, 0); - - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_IPC_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get ipc resource\n"); - return -ENXIO; - } - dev_info(&pdev->dev, "ipc res: %pR\n", res); - - scu_data->mem.flags = res->flags; - scu_data->mem.start = res->start; - scu_data->mem.end = res->start + PLAT_RESOURCE_IPC_SIZE - 1; - - start = res->start + PLAT_RESOURCE_GCR_OFFSET; - if (!devm_request_mem_region(&pdev->dev, start, PLAT_RESOURCE_GCR_SIZE, - "pmc_ipc_plat")) - return -EBUSY; - - addr = devm_ioremap(&pdev->dev, start, PLAT_RESOURCE_GCR_SIZE); - if (!addr) - return -ENOMEM; - - ipcdev.gcr_mem_base = addr; - - ipcdev.telem_res_inval = 0; - res = platform_get_resource(pdev, IORESOURCE_MEM, - PLAT_RESOURCE_TELEM_SSRAM_INDEX); - if (!res) { - dev_err(&pdev->dev, "Failed to get telemetry ssram resource\n"); - ipcdev.telem_res_inval = 1; - } else { - ipcdev.telem_punit_ssram_base = res->start + - TELEM_PUNIT_SSRAM_OFFSET; - ipcdev.telem_punit_ssram_size = TELEM_SSRAM_SIZE; - ipcdev.telem_pmc_ssram_base = res->start + - TELEM_PMC_SSRAM_OFFSET; - ipcdev.telem_pmc_ssram_size = TELEM_SSRAM_SIZE; - dev_info(&pdev->dev, "telemetry ssram res: %pR\n", res); - } - - return 0; -} - -/** - * intel_pmc_s0ix_counter_read() - Read S0ix residency. - * @data: Out param that contains current S0ix residency count. - * - * Return: an error code or 0 on success. 
- */ -int intel_pmc_s0ix_counter_read(u64 *data) -{ - u64 deep, shlw; - - if (!ipcdev.has_gcr_regs) - return -EACCES; - - deep = gcr_data_readq(PMC_GCR_TELEM_DEEP_S0IX_REG); - shlw = gcr_data_readq(PMC_GCR_TELEM_SHLW_S0IX_REG); - - *data = S0IX_RESIDENCY_IN_USECS(deep, shlw); - - return 0; -} -EXPORT_SYMBOL_GPL(intel_pmc_s0ix_counter_read); - -#ifdef CONFIG_ACPI -static const struct acpi_device_id ipc_acpi_ids[] = { - { "INT34D2", 0}, - { } -}; -MODULE_DEVICE_TABLE(acpi, ipc_acpi_ids); -#endif - -static int ipc_plat_probe(struct platform_device *pdev) -{ - struct intel_scu_ipc_data scu_data = {}; - struct intel_scu_ipc_dev *scu; - int ret; - - ipcdev.dev = &pdev->dev; - spin_lock_init(&ipcdev.gcr_lock); - - ret = ipc_plat_get_res(pdev, &scu_data); - if (ret) { - dev_err(&pdev->dev, "Failed to request resource\n"); - return ret; - } - - scu = devm_intel_scu_ipc_register(&pdev->dev, &scu_data); - if (IS_ERR(scu)) - return PTR_ERR(scu); - - platform_set_drvdata(pdev, scu); - - ret = ipc_create_pmc_devices(); - if (ret) { - dev_err(&pdev->dev, "Failed to create pmc devices\n"); - return ret; - } - - ipcdev.has_gcr_regs = true; - - return 0; -} - -static int ipc_plat_remove(struct platform_device *pdev) -{ - platform_device_unregister(ipcdev.tco_dev); - platform_device_unregister(ipcdev.punit_dev); - platform_device_unregister(ipcdev.telemetry_dev); - ipcdev.dev = NULL; - return 0; -} - -static struct platform_driver ipc_plat_driver = { - .remove = ipc_plat_remove, - .probe = ipc_plat_probe, - .driver = { - .name = "pmc-ipc-plat", - .acpi_match_table = ACPI_PTR(ipc_acpi_ids), - .dev_groups = intel_ipc_groups, - }, -}; - -static int __init intel_pmc_ipc_init(void) -{ - return platform_driver_register(&ipc_plat_driver); -} - -static void __exit intel_pmc_ipc_exit(void) -{ - platform_driver_unregister(&ipc_plat_driver); -} - -MODULE_AUTHOR("Zha Qipeng "); -MODULE_DESCRIPTION("Intel PMC IPC driver"); -MODULE_LICENSE("GPL v2"); - -/* Some modules are dependent on this, so init earlier */ -fs_initcall(intel_pmc_ipc_init); -module_exit(intel_pmc_ipc_exit); diff --git a/drivers/platform/x86/intel_telemetry_debugfs.c b/drivers/platform/x86/intel_telemetry_debugfs.c index 6cac3e05b817..1d4d0fbfd63c 100644 --- a/drivers/platform/x86/intel_telemetry_debugfs.c +++ b/drivers/platform/x86/intel_telemetry_debugfs.c @@ -15,6 +15,7 @@ */ #include #include +#include #include #include #include @@ -22,7 +23,6 @@ #include #include -#include #include #define DRIVER_NAME "telemetry_soc_debugfs" @@ -647,10 +647,11 @@ DEFINE_SHOW_ATTRIBUTE(telem_soc_states); static int telem_s0ix_res_get(void *data, u64 *val) { + struct telemetry_plt_config *plt_config = telemetry_get_pltdata(); u64 s0ix_total_res; int ret; - ret = intel_pmc_s0ix_counter_read(&s0ix_total_res); + ret = intel_pmc_s0ix_counter_read(plt_config->pmc, &s0ix_total_res); if (ret) { pr_err("Failed to read S0ix residency"); return ret; @@ -837,12 +838,15 @@ static int pm_suspend_exit_cb(void) */ if (suspend_shlw_ctr_exit == suspend_shlw_ctr_temp && suspend_deep_ctr_exit == suspend_deep_ctr_temp) { - ret = intel_pmc_gcr_read64(PMC_GCR_TELEM_SHLW_S0IX_REG, + struct telemetry_plt_config *plt_config = telemetry_get_pltdata(); + struct intel_pmc_dev *pmc = plt_config->pmc; + + ret = intel_pmc_gcr_read64(pmc, PMC_GCR_TELEM_SHLW_S0IX_REG, &suspend_shlw_res_exit); if (ret < 0) goto out; - ret = intel_pmc_gcr_read64(PMC_GCR_TELEM_DEEP_S0IX_REG, + ret = intel_pmc_gcr_read64(pmc, PMC_GCR_TELEM_DEEP_S0IX_REG, &suspend_deep_res_exit); if (ret < 0) goto out; diff --git 
a/drivers/platform/x86/intel_telemetry_pltdrv.c b/drivers/platform/x86/intel_telemetry_pltdrv.c index e45303f99303..405dea87de6b 100644 --- a/drivers/platform/x86/intel_telemetry_pltdrv.c +++ b/drivers/platform/x86/intel_telemetry_pltdrv.c @@ -1115,6 +1115,8 @@ static int telemetry_pltdrv_probe(struct platform_device *pdev) telm_conf = (struct telemetry_plt_config *)id->driver_data; + telm_conf->pmc = dev_get_drvdata(pdev->dev.parent); + mem = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(mem)) return PTR_ERR(mem); diff --git a/drivers/usb/typec/tcpm/Kconfig b/drivers/usb/typec/tcpm/Kconfig index 5b986d6c801d..fa3f39336246 100644 --- a/drivers/usb/typec/tcpm/Kconfig +++ b/drivers/usb/typec/tcpm/Kconfig @@ -41,8 +41,8 @@ config TYPEC_FUSB302 config TYPEC_WCOVE tristate "Intel WhiskeyCove PMIC USB Type-C PHY driver" depends on ACPI + depends on MFD_INTEL_PMC_BXT depends on INTEL_SOC_PMIC - depends on INTEL_PMC_IPC depends on BXT_WC_PMIC_OPREGION help This driver adds support for USB Type-C on Intel Broxton platforms diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index e707c4797f76..a370a185a41c 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -64,6 +64,7 @@ #include /* For copy_to_user/put_user/... */ #include /* For inb/outb/... */ #include +#include #include "iTCO_vendor.h" @@ -233,12 +234,24 @@ static int update_no_reboot_bit_cnt(void *priv, bool set) return val != newval ? -EIO : 0; } +static int update_no_reboot_bit_pmc(void *priv, bool set) +{ + struct intel_pmc_dev *pmc = priv; + u32 bits = PMC_CFG_NO_REBOOT_EN; + u32 value = set ? bits : 0; + + return intel_pmc_gcr_update(pmc, PMC_GCR_PMC_CFG_REG, bits, value); +} + static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, - struct itco_wdt_platform_data *pdata) + struct platform_device *pdev, + struct itco_wdt_platform_data *pdata) { - if (pdata->update_no_reboot_bit) { - p->update_no_reboot_bit = pdata->update_no_reboot_bit; - p->no_reboot_priv = pdata->no_reboot_priv; + if (pdata->no_reboot_use_pmc) { + struct intel_pmc_dev *pmc = dev_get_drvdata(pdev->dev.parent); + + p->update_no_reboot_bit = update_no_reboot_bit_pmc; + p->no_reboot_priv = pmc; return; } @@ -478,14 +491,14 @@ static int iTCO_wdt_probe(struct platform_device *pdev) return -ENODEV; } - iTCO_wdt_no_reboot_bit_setup(p, pdata); + iTCO_wdt_no_reboot_bit_setup(p, pdev, pdata); /* * Get the Memory-Mapped GCS or PMC register, we need it for the * NO_REBOOT flag (TCO v2 and v3). 
*/ if (p->iTCO_version >= 2 && p->iTCO_version < 6 && - !pdata->update_no_reboot_bit) { + !pdata->no_reboot_use_pmc) { p->gcs_pmc_res = platform_get_resource(pdev, IORESOURCE_MEM, ICH_RES_MEM_GCS_PMC); diff --git a/include/linux/mfd/intel_pmc_bxt.h b/include/linux/mfd/intel_pmc_bxt.h new file mode 100644 index 000000000000..f51a43d25ffd --- /dev/null +++ b/include/linux/mfd/intel_pmc_bxt.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MFD_INTEL_PMC_BXT_H +#define MFD_INTEL_PMC_BXT_H + +/* GCR reg offsets from GCR base */ +#define PMC_GCR_PMC_CFG_REG 0x08 +#define PMC_GCR_TELEM_DEEP_S0IX_REG 0x78 +#define PMC_GCR_TELEM_SHLW_S0IX_REG 0x80 + +/* PMC_CFG_REG bit masks */ +#define PMC_CFG_NO_REBOOT_EN BIT(4) + +/** + * struct intel_pmc_dev - Intel PMC device structure + * @dev: Pointer to the parent PMC device + * @scu: Pointer to the SCU IPC device data structure + * @gcr_mem_base: Virtual base address of GCR (Global Configuration Registers) + * @gcr_lock: Lock used to serialize access to GCR registers + * @telem_base: Pointer to telemetry SSRAM base resource or %NULL if not + * available + */ +struct intel_pmc_dev { + struct device *dev; + struct intel_scu_ipc_dev *scu; + void __iomem *gcr_mem_base; + spinlock_t gcr_lock; + struct resource *telem_base; +}; + +#if IS_ENABLED(CONFIG_MFD_INTEL_PMC_BXT) +int intel_pmc_gcr_read64(struct intel_pmc_dev *pmc, u32 offset, u64 *data); +int intel_pmc_gcr_update(struct intel_pmc_dev *pmc, u32 offset, u32 mask, u32 val); +int intel_pmc_s0ix_counter_read(struct intel_pmc_dev *pmc, u64 *data); +#else +static inline int intel_pmc_gcr_read64(struct intel_pmc_dev *pmc, u32 offset, + u64 *data) +{ + return -ENOTSUPP; +} + +static inline int intel_pmc_gcr_update(struct intel_pmc_dev *pmc, u32 offset, + u32 mask, u32 val) +{ + return -ENOTSUPP; +} + +static inline int intel_pmc_s0ix_counter_read(struct intel_pmc_dev *pmc, u64 *data) +{ + return -ENOTSUPP; +} +#endif + +#endif /* MFD_INTEL_PMC_BXT_H */ diff --git a/include/linux/platform_data/itco_wdt.h b/include/linux/platform_data/itco_wdt.h index 2ccdce6a4e27..45d860cac2b0 100644 --- a/include/linux/platform_data/itco_wdt.h +++ b/include/linux/platform_data/itco_wdt.h @@ -12,13 +12,16 @@ #define ICH_RES_MEM_OFF 2 #define ICH_RES_MEM_GCS_PMC 0 +/** + * struct itco_wdt_platform_data - iTCO_wdt platform data + * @name: Name of the platform + * @version: iTCO version + * @no_reboot_use_pmc: Use PMC BXT API to set and clear NO_REBOOT bit + */ struct itco_wdt_platform_data { char name[32]; unsigned int version; - /* private data to be passed to update_no_reboot_bit API */ - void *no_reboot_priv; - /* pointer for platform specific no reboot update function */ - int (*update_no_reboot_bit)(void *priv, bool set); + bool no_reboot_use_pmc; }; #endif /* _ITCO_WDT_H_ */ -- cgit v1.2.3 From ccc27ae77494252965aedca68588e7457829cfad Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 16 Apr 2020 18:38:06 +0200 Subject: efi/libstub: Drop __pure getter for efi_system_table The practice of using __pure getter functions to access global variables in the EFI stub dates back to the time when we had to carefully prevent GOT entries from being emitted, because we could not rely on the toolchain to do this for us. Today, we use the hidden visibility pragma for all EFI stub source files, which now all live in the same subdirectory, and we apply a sanity check on the objects, so we can get rid of these getter functions and simply refer to global data objects directly. 
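Concretely, the pattern being dropped looks roughly like this (a simplified sketch, not the exact stub sources; the real hunks follow below): each stub kept a private pointer behind a __pure accessor, whereas now a single shared global is declared once and referenced directly under hidden visibility.

	/* old: per-stub private pointer plus a __pure getter */
	static efi_system_table_t *sys_table;

	__pure efi_system_table_t *efi_system_table(void)
	{
		return sys_table;
	}

	/* new: one global, referenced directly thanks to hidden visibility */
	const efi_system_table_t *efi_system_table;
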
Start with efi_system_table(), and convert it into a global variable. While at it, make it a pointer-to-const, because we can. Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 14 ++++++++------ drivers/firmware/efi/libstub/efi-stub-helper.c | 6 +++--- drivers/firmware/efi/libstub/efi-stub.c | 13 ++++--------- drivers/firmware/efi/libstub/efistub.h | 6 +++--- drivers/firmware/efi/libstub/fdt.c | 6 +++--- drivers/firmware/efi/libstub/x86-stub.c | 23 +++++++++-------------- 6 files changed, 30 insertions(+), 38 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index f59cba117dcb..78e839925a81 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -339,15 +339,17 @@ static inline u32 efi64_convert_status(efi_status_t status) #define efi_bs_call(func, ...) \ (efi_is_native() \ - ? efi_system_table()->boottime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table(), \ - boottime), func, __VA_ARGS__)) + ? efi_system_table->boottime->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_table_attr(efi_system_table, \ + boottime), \ + func, __VA_ARGS__)) #define efi_rt_call(func, ...) \ (efi_is_native() \ - ? efi_system_table()->runtime->func(__VA_ARGS__) \ - : __efi64_thunk_map(efi_table_attr(efi_system_table(), \ - runtime), func, __VA_ARGS__)) + ? efi_system_table->runtime->func(__VA_ARGS__) \ + : __efi64_thunk_map(efi_table_attr(efi_system_table, \ + runtime), \ + func, __VA_ARGS__)) extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index 14e56a64f208..0b1688b10ddc 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -287,8 +287,8 @@ fail: void *get_efi_config_table(efi_guid_t guid) { - unsigned long tables = efi_table_attr(efi_system_table(), tables); - int nr_tables = efi_table_attr(efi_system_table(), nr_tables); + unsigned long tables = efi_table_attr(efi_system_table, tables); + int nr_tables = efi_table_attr(efi_system_table, nr_tables); int i; for (i = 0; i < nr_tables; i++) { @@ -305,7 +305,7 @@ void *get_efi_config_table(efi_guid_t guid) void efi_char16_printk(efi_char16_t *str) { - efi_call_proto(efi_table_attr(efi_system_table(), con_out), + efi_call_proto(efi_table_attr(efi_system_table, con_out), output_string, str); } diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 8455c590c7b9..8edfd4022803 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -38,12 +38,7 @@ static u64 virtmap_base = EFI_RT_VIRTUAL_BASE; static bool flat_va_mapping; -static efi_system_table_t *sys_table; - -__pure efi_system_table_t *efi_system_table(void) -{ - return sys_table; -} +const efi_system_table_t *efi_system_table; static struct screen_info *setup_graphics(void) { @@ -167,10 +162,10 @@ efi_status_t efi_entry(efi_handle_t handle, efi_system_table_t *sys_table_arg) efi_properties_table_t *prop_tbl; unsigned long max_addr; - sys_table = sys_table_arg; + efi_system_table = sys_table_arg; /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { + if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) { status = EFI_INVALID_PARAMETER; goto fail; } @@ -184,7 +179,7 @@ efi_status_t efi_entry(efi_handle_t handle, 
efi_system_table_t *sys_table_arg) * information about the running image, such as size and the command * line. */ - status = sys_table->boottime->handle_protocol(handle, + status = efi_system_table->boottime->handle_protocol(handle, &loaded_image_proto, (void *)&image); if (status != EFI_SUCCESS) { pr_efi_err("Failed to get loaded image protocol\n"); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 9a87fff1d4ba..c1481c5abea4 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -31,13 +31,13 @@ extern bool __pure noinitrd(void); extern bool __pure is_quiet(void); extern bool __pure novamap(void); -extern __pure efi_system_table_t *efi_system_table(void); +extern const efi_system_table_t *efi_system_table; #ifndef efi_bs_call -#define efi_bs_call(func, ...) efi_system_table()->boottime->func(__VA_ARGS__) +#define efi_bs_call(func, ...) efi_system_table->boottime->func(__VA_ARGS__) #endif #ifndef efi_rt_call -#define efi_rt_call(func, ...) efi_system_table()->runtime->func(__VA_ARGS__) +#define efi_rt_call(func, ...) efi_system_table->runtime->func(__VA_ARGS__) #endif #ifndef efi_is_native #define efi_is_native() (true) diff --git a/drivers/firmware/efi/libstub/fdt.c b/drivers/firmware/efi/libstub/fdt.c index 46cffac7a5f1..06d5e7fc8e34 100644 --- a/drivers/firmware/efi/libstub/fdt.c +++ b/drivers/firmware/efi/libstub/fdt.c @@ -110,7 +110,7 @@ static efi_status_t update_fdt(void *orig_fdt, unsigned long orig_fdt_size, /* Add FDT entries for EFI runtime services in chosen node. */ node = fdt_subnode_offset(fdt, 0, "chosen"); - fdt_val64 = cpu_to_fdt64((u64)(unsigned long)efi_system_table()); + fdt_val64 = cpu_to_fdt64((u64)(unsigned long)efi_system_table); status = fdt_setprop_var(fdt, node, "linux,uefi-system-table", fdt_val64); if (status) @@ -314,7 +314,7 @@ efi_status_t allocate_new_fdt_and_exit_boot(void *handle, return EFI_SUCCESS; /* Install the new virtual address map */ - svam = efi_system_table()->runtime->set_virtual_address_map; + svam = efi_system_table->runtime->set_virtual_address_map; status = svam(runtime_entry_count * desc_size, desc_size, desc_ver, runtime_map); @@ -348,7 +348,7 @@ fail_free_new_fdt: efi_free(MAX_FDT_SIZE, *new_fdt_addr); fail: - efi_system_table()->boottime->free_pool(runtime_map); + efi_system_table->boottime->free_pool(runtime_map); return EFI_LOAD_ERROR; } diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index 1c3807d0c321..3f132f51ab0f 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -20,15 +20,10 @@ /* Maximum physical address for 64-bit kernel with 4-level paging */ #define MAXMEM_X86_64_4LEVEL (1ull << 46) -static efi_system_table_t *sys_table; +const efi_system_table_t *efi_system_table; extern const bool efi_is64; extern u32 image_offset; -__pure efi_system_table_t *efi_system_table(void) -{ - return sys_table; -} - __attribute_const__ bool efi_is_64bit(void) { if (IS_ENABLED(CONFIG_EFI_MIXED)) @@ -227,7 +222,7 @@ static const efi_char16_t apple[] = L"Apple"; static void setup_quirks(struct boot_params *boot_params) { efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) - efi_table_attr(efi_system_table(), fw_vendor); + efi_table_attr(efi_system_table, fw_vendor); if (!memcmp(fw_vendor, apple, sizeof(apple))) { if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) @@ -377,10 +372,10 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, unsigned long 
ramdisk_addr; unsigned long ramdisk_size; - sys_table = sys_table_arg; + efi_system_table = sys_table_arg; /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); status = efi_bs_call(handle_protocol, handle, &proto, (void **)&image); @@ -446,7 +441,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, } } - efi_stub_entry(handle, sys_table, boot_params); + efi_stub_entry(handle, sys_table_arg, boot_params); /* not reached */ fail2: @@ -651,14 +646,14 @@ static efi_status_t exit_boot_func(struct efi_boot_memmap *map, : EFI32_LOADER_SIGNATURE; memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - p->efi->efi_systab = (unsigned long)efi_system_table(); + p->efi->efi_systab = (unsigned long)efi_system_table; p->efi->efi_memdesc_size = *map->desc_size; p->efi->efi_memdesc_version = *map->desc_ver; p->efi->efi_memmap = (unsigned long)*map->map; p->efi->efi_memmap_size = *map->map_size; #ifdef CONFIG_X86_64 - p->efi->efi_systab_hi = (unsigned long)efi_system_table() >> 32; + p->efi->efi_systab_hi = (unsigned long)efi_system_table >> 32; p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; #endif @@ -719,10 +714,10 @@ unsigned long efi_main(efi_handle_t handle, efi_status_t status; unsigned long cmdline_paddr; - sys_table = sys_table_arg; + efi_system_table = sys_table_arg; /* Check if we were booted by the EFI firmware */ - if (sys_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) + if (efi_system_table->hdr.signature != EFI_SYSTEM_TABLE_SIGNATURE) efi_exit(handle, EFI_INVALID_PARAMETER); /* -- cgit v1.2.3 From 0a75561489f534cf2e8f6883e0cf8cdf51c534c5 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 16 Apr 2020 21:15:23 +0200 Subject: efi/libstub/x86: Avoid getter function for efi_is64 We no longer need to take special care when using global variables in the EFI stub, so switch to a simple symbol reference for efi_is64. 
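In practical terms (a sketch of the resulting header code; the complete version is in the efi.h hunk below), the out-of-line __attribute_const__ getter becomes a plain extern plus a trivial inline:

	extern const bool efi_is64;	/* defined elsewhere by the boot/stub code, not shown here */

	static inline bool efi_is_64bit(void)
	{
		if (IS_ENABLED(CONFIG_EFI_MIXED))
			return efi_is64;
		return IS_ENABLED(CONFIG_X86_64);
	}
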
Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 11 ++++++++--- drivers/firmware/efi/libstub/x86-stub.c | 8 -------- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 78e839925a81..cd0c3fbf6156 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -225,14 +225,19 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, /* arch specific definitions used by the stub code */ -__attribute_const__ bool efi_is_64bit(void); +extern const bool efi_is64; + +static inline bool efi_is_64bit(void) +{ + if (IS_ENABLED(CONFIG_EFI_MIXED)) + return efi_is64; + return IS_ENABLED(CONFIG_X86_64); +} static inline bool efi_is_native(void) { if (!IS_ENABLED(CONFIG_X86_64)) return true; - if (!IS_ENABLED(CONFIG_EFI_MIXED)) - return true; return efi_is_64bit(); } diff --git a/drivers/firmware/efi/libstub/x86-stub.c b/drivers/firmware/efi/libstub/x86-stub.c index bddbc103a34b..597793fe8d22 100644 --- a/drivers/firmware/efi/libstub/x86-stub.c +++ b/drivers/firmware/efi/libstub/x86-stub.c @@ -21,16 +21,8 @@ #define MAXMEM_X86_64_4LEVEL (1ull << 46) const efi_system_table_t *efi_system_table; -extern const bool efi_is64; extern u32 image_offset; -__attribute_const__ bool efi_is_64bit(void) -{ - if (IS_ENABLED(CONFIG_EFI_MIXED)) - return efi_is64; - return IS_ENABLED(CONFIG_X86_64); -} - static efi_status_t preserve_pci_rom_image(efi_pci_io_protocol_t *pci, struct pci_setup_rom **__rom) { -- cgit v1.2.3 From 8c5cc19e94703182647dfccc164e4437a04539c8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:28 +0200 Subject: x86/tlb: Uninline __get_current_cr3_fast() cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. In preparation for unexporting cpu_tlbstate move __get_current_cr3_fast() into the x86 TLB management code. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Acked-by: Andy Lutomirski Link: https://lkml.kernel.org/r/20200421092558.848064318@linutronix.de --- arch/x86/include/asm/mmu_context.h | 19 +------------------ arch/x86/mm/tlb.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 4e55370e48e8..9608536b9c85 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -225,24 +225,7 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } -/* - * This can be used from process context to figure out what the value of - * CR3 is without needing to do a (slow) __read_cr3(). - * - * It's intended to be used for code like KVM that sneakily changes CR3 - * and needs to restore it. It needs to be used very carefully. - */ -static inline unsigned long __get_current_cr3_fast(void) -{ - unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, - this_cpu_read(cpu_tlbstate.loaded_mm_asid)); - - /* For now, be very restrictive about when this can be called. 
*/ - VM_WARN_ON(in_nmi() || preemptible()); - - VM_BUG_ON(cr3 != __read_cr3()); - return cr3; -} +unsigned long __get_current_cr3_fast(void); typedef struct { struct mm_struct *mm; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 66f96f21a7b6..ea6f98a7ec06 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -843,6 +843,26 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) } } +/* + * This can be used from process context to figure out what the value of + * CR3 is without needing to do a (slow) __read_cr3(). + * + * It's intended to be used for code like KVM that sneakily changes CR3 + * and needs to restore it. It needs to be used very carefully. + */ +unsigned long __get_current_cr3_fast(void) +{ + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* For now, be very restrictive about when this can be called. */ + VM_WARN_ON(in_nmi() || preemptible()); + + VM_BUG_ON(cr3 != __read_cr3()); + return cr3; +} +EXPORT_SYMBOL_GPL(__get_current_cr3_fast); + /* * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. * This means that the 'struct flush_tlb_info' that describes which mappings to -- cgit v1.2.3 From d8f0b35331c4423e033f81f10eb5e0c7e4e1dcec Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:29 +0200 Subject: x86/cpu: Uninline CR4 accessors cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. The various CR4 accessors require cpu_tlbstate as the CR4 shadow cache is located there. In preparation for unexporting cpu_tlbstate, create a builtin function for manipulating CR4 and rework the various helpers to use it. No functional change. [ bp: push the export of native_write_cr4() only when CONFIG_LKTDM=m to the last patch in the series. ] Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092558.939985695@linutronix.de --- arch/x86/include/asm/tlbflush.h | 36 +++++------------------------------- arch/x86/kernel/cpu/common.c | 23 ++++++++++++++++++++++- arch/x86/kernel/process.c | 11 +++++++++++ 3 files changed, 38 insertions(+), 32 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6f66d841262d..d804030079da 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -276,37 +276,25 @@ static inline bool nmi_uaccess_okay(void) #define nmi_uaccess_okay nmi_uaccess_okay +void cr4_update_irqsoff(unsigned long set, unsigned long clear); +unsigned long cr4_read_shadow(void); + /* Initialize cr4 shadow for this CPU. */ static inline void cr4_init_shadow(void) { this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } -static inline void __cr4_set(unsigned long cr4) -{ - lockdep_assert_irqs_disabled(); - this_cpu_write(cpu_tlbstate.cr4, cr4); - __write_cr4(cr4); -} - /* Set in this cpu's CR4. */ static inline void cr4_set_bits_irqsoff(unsigned long mask) { - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 | mask) != cr4) - __cr4_set(cr4 | mask); + cr4_update_irqsoff(mask, 0); } /* Clear in this cpu's CR4. 
*/ static inline void cr4_clear_bits_irqsoff(unsigned long mask) { - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - if ((cr4 & ~mask) != cr4) - __cr4_set(cr4 & ~mask); + cr4_update_irqsoff(0, mask); } /* Set in this cpu's CR4. */ @@ -329,20 +317,6 @@ static inline void cr4_clear_bits(unsigned long mask) local_irq_restore(flags); } -static inline void cr4_toggle_bits_irqsoff(unsigned long mask) -{ - unsigned long cr4; - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - __cr4_set(cr4 ^ mask); -} - -/* Read the CR4 shadow. */ -static inline unsigned long cr4_read_shadow(void) -{ - return this_cpu_read(cpu_tlbstate.cr4); -} - /* * Mark all other ASIDs as invalid, preserves the current. */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..82042f40fc45 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -387,7 +387,28 @@ set_register: bits_missing); } } -EXPORT_SYMBOL(native_write_cr4); +EXPORT_SYMBOL_GPL(native_write_cr4); + +void cr4_update_irqsoff(unsigned long set, unsigned long clear) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + lockdep_assert_irqs_disabled(); + + newval = (cr4 & ~clear) | set; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} +EXPORT_SYMBOL(cr4_update_irqsoff); + +/* Read the CR4 shadow. */ +unsigned long cr4_read_shadow(void) +{ + return this_cpu_read(cpu_tlbstate.cr4); +} +EXPORT_SYMBOL_GPL(cr4_read_shadow); void cr4_init(void) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9da70b279dad..f2eab49d044e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -612,6 +612,17 @@ void speculation_ctrl_update_current(void) preempt_enable(); } +static inline void cr4_toggle_bits_irqsoff(unsigned long mask) +{ + unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4); + + newval = cr4 ^ mask; + if (newval != cr4) { + this_cpu_write(cpu_tlbstate.cr4, newval); + __write_cr4(newval); + } +} + void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) { unsigned long tifp, tifn; -- cgit v1.2.3 From cb2a02355b042ec3ef11d0ba2a46742678e41632 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:30 +0200 Subject: x86/cr4: Sanitize CR4.PCE update load_mm_cr4_irqsoff() is really a strange name for a function which has only one purpose: Update the CR4.PCE bit depending on the perf state. Rename it to update_cr4_pce_mm(), move it into the tlb code and provide a function which can be invoked by the perf smp function calls. Another step to remove exposure of cpu_tlbstate. No functional change. 
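For context, the new helper is shaped so it can be handed straight to the SMP function-call machinery; the perf call sites in the diff below end up looking roughly like this (sketch):

	/* per-mm rdpmc enable/disable */
	on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1);

	/* global rdpmc mode change via sysfs */
	on_each_cpu(cr4_update_pce, NULL, 1);
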
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.049499158@linutronix.de --- arch/x86/events/core.c | 11 +++-------- arch/x86/include/asm/mmu_context.h | 14 +------------- arch/x86/mm/tlb.c | 22 +++++++++++++++++++++- 3 files changed, 25 insertions(+), 22 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index a619763e96e1..30d2b1d3e94c 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2162,11 +2162,6 @@ static int x86_pmu_event_init(struct perf_event *event) return err; } -static void refresh_pce(void *ignored) -{ - load_mm_cr4_irqsoff(this_cpu_read(cpu_tlbstate.loaded_mm)); -} - static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) { if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED)) @@ -2185,7 +2180,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm) lockdep_assert_held_write(&mm->mmap_sem); if (atomic_inc_return(&mm->context.perf_rdpmc_allowed) == 1) - on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); + on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm) @@ -2195,7 +2190,7 @@ static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *m return; if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed)) - on_each_cpu_mask(mm_cpumask(mm), refresh_pce, NULL, 1); + on_each_cpu_mask(mm_cpumask(mm), cr4_update_pce, NULL, 1); } static int x86_pmu_event_idx(struct perf_event *event) @@ -2253,7 +2248,7 @@ static ssize_t set_attr_rdpmc(struct device *cdev, else if (x86_pmu.attr_rdpmc == 2) static_branch_dec(&rdpmc_always_available_key); - on_each_cpu(refresh_pce, NULL, 1); + on_each_cpu(cr4_update_pce, NULL, 1); x86_pmu.attr_rdpmc = val; } diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 9608536b9c85..2985d06660aa 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -24,21 +24,9 @@ static inline void paravirt_activate_mm(struct mm_struct *prev, #endif /* !CONFIG_PARAVIRT_XXL */ #ifdef CONFIG_PERF_EVENTS - DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key); DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key); - -static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) -{ - if (static_branch_unlikely(&rdpmc_always_available_key) || - (!static_branch_unlikely(&rdpmc_never_available_key) && - atomic_read(&mm->context.perf_rdpmc_allowed))) - cr4_set_bits_irqsoff(X86_CR4_PCE); - else - cr4_clear_bits_irqsoff(X86_CR4_PCE); -} -#else -static inline void load_mm_cr4_irqsoff(struct mm_struct *mm) {} +void cr4_update_pce(void *ignored); #endif #ifdef CONFIG_MODIFY_LDT_SYSCALL diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ea6f98a7ec06..3d9d81951962 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -272,6 +272,26 @@ static void cond_ibpb(struct task_struct *next) } } +#ifdef CONFIG_PERF_EVENTS +static inline void cr4_update_pce_mm(struct mm_struct *mm) +{ + if (static_branch_unlikely(&rdpmc_always_available_key) || + (!static_branch_unlikely(&rdpmc_never_available_key) && + atomic_read(&mm->context.perf_rdpmc_allowed))) + cr4_set_bits_irqsoff(X86_CR4_PCE); + else + cr4_clear_bits_irqsoff(X86_CR4_PCE); +} + +void cr4_update_pce(void *ignored) +{ + cr4_update_pce_mm(this_cpu_read(cpu_tlbstate.loaded_mm)); +} + +#else +static inline 
void cr4_update_pce_mm(struct mm_struct *mm) { } +#endif + void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -440,7 +460,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid); if (next != real_prev) { - load_mm_cr4_irqsoff(next); + cr4_update_pce_mm(next); switch_ldt(real_prev, next); } } -- cgit v1.2.3 From 9020d3956317d052cdddd43e55acdd2970344192 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:31 +0200 Subject: x86/alternatives: Move temporary_mm helpers into C The only user of these inlines is the text poke code and this must not be exposed to the world. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.139069561@linutronix.de --- arch/x86/include/asm/mmu_context.h | 55 -------------------------------------- arch/x86/kernel/alternative.c | 55 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 55 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 2985d06660aa..47562147e70b 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -215,59 +215,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, unsigned long __get_current_cr3_fast(void); -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. - */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. 
- */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} - #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7867dfb3963e..cd617979b7fc 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -783,6 +783,61 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm allows to set temporary mappings that are not accessible + * by other CPUs. Such mappings are needed to perform sensitive memory writes + * that override the kernel memory protections (e.g., W^X), without exposing the + * temporary page-table mappings that are required for these write operations to + * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the + * mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + __ro_after_init struct mm_struct *poking_mm; __ro_after_init unsigned long poking_addr; -- cgit v1.2.3 From 81b67439d147677d844d492fcbd03712ea438f42 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Sat, 25 Apr 2020 05:06:14 -0500 Subject: x86/unwind/orc: Fix premature unwind stoppage due to IRET frames The following execution path is possible: fsnotify() [ realign the stack and store previous SP in R10 ] [ only IRET regs saved ] common_interrupt() interrupt_entry() [ full pt_regs saved ] ... [ unwind stack ] When the unwinder goes through the NMI and the IRQ on the stack, and then sees fsnotify(), it doesn't have access to the value of R10, because it only has the five IRET registers. So the unwind stops prematurely. However, because the interrupt_entry() code is careful not to clobber R10 before saving the full regs, the unwinder should be able to read R10 from the previously saved full pt_regs associated with the NMI. Handle this case properly. When encountering an IRET regs frame immediately after a full pt_regs frame, use the pt_regs as a backup which can be used to get the C register values. Also, note that a call frame resets the 'prev_regs' value, because a function is free to clobber the registers. 
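A simplified sketch of the resulting register lookup (the complete version is in the unwind_orc.c hunk below) makes the fallback order explicit:

	static bool get_reg(struct unwind_state *state, unsigned int reg_off,
			    unsigned long *val)
	{
		if (!state->regs)
			return false;

		if (state->full_regs) {		/* full pt_regs frame */
			*val = ((unsigned long *)state->regs)[reg_off / 8];
			return true;
		}

		if (state->prev_regs) {		/* IRET regs: fall back to the previous full frame */
			*val = ((unsigned long *)state->prev_regs)[reg_off / 8];
			return true;
		}

		return false;
	}
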
For this fix to work, the IRET and full regs frames must be adjacent, with no FUNC frames in between. So replace the FUNC hint in interrupt_entry() with an IRET_REGS hint. Fixes: ee9f8fce9964 ("x86/unwind: Add the ORC unwinder") Reviewed-by: Miroslav Benes Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Andy Lutomirski Cc: Dave Jones Cc: Jann Horn Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Link: https://lore.kernel.org/r/97a408167cc09f1cfa0de31a7b70dd88868d743f.1587808742.git.jpoimboe@redhat.com --- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/unwind.h | 2 +- arch/x86/kernel/unwind_orc.c | 51 +++++++++++++++++++++++++++++++++---------- 3 files changed, 43 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9fe0d5cad8e4..3063aa9090f9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -511,7 +511,7 @@ SYM_CODE_END(spurious_entries_start) * +----------------------------------------------------+ */ SYM_CODE_START(interrupt_entry) - UNWIND_HINT_FUNC + UNWIND_HINT_IRET_REGS offset=16 ASM_CLAC cld @@ -543,9 +543,9 @@ SYM_CODE_START(interrupt_entry) pushq 5*8(%rdi) /* regs->eflags */ pushq 4*8(%rdi) /* regs->cs */ pushq 3*8(%rdi) /* regs->ip */ + UNWIND_HINT_IRET_REGS pushq 2*8(%rdi) /* regs->orig_ax */ pushq 8(%rdi) /* return address */ - UNWIND_HINT_FUNC movq (%rdi), %rdi jmp 2f diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 499578f7e6d7..70fc159ebe69 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -19,7 +19,7 @@ struct unwind_state { #if defined(CONFIG_UNWINDER_ORC) bool signal, full_regs; unsigned long sp, bp, ip; - struct pt_regs *regs; + struct pt_regs *regs, *prev_regs; #elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; unsigned long *bp, *orig_sp, ip; diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index 33b80a7f998f..0ebc11a8bb45 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -384,9 +384,38 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr return true; } +/* + * If state->regs is non-NULL, and points to a full pt_regs, just get the reg + * value from state->regs. + * + * Otherwise, if state->regs just points to IRET regs, and the previous frame + * had full regs, it's safe to get the value from the previous regs. This can + * happen when early/late IRQ entry code gets interrupted by an NMI. 
+ */ +static bool get_reg(struct unwind_state *state, unsigned int reg_off, + unsigned long *val) +{ + unsigned int reg = reg_off/8; + + if (!state->regs) + return false; + + if (state->full_regs) { + *val = ((unsigned long *)state->regs)[reg]; + return true; + } + + if (state->prev_regs) { + *val = ((unsigned long *)state->prev_regs)[reg]; + return true; + } + + return false; +} + bool unwind_next_frame(struct unwind_state *state) { - unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp; + unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp; enum stack_type prev_type = state->stack_info.type; struct orc_entry *orc; bool indirect = false; @@ -448,39 +477,35 @@ bool unwind_next_frame(struct unwind_state *state) break; case ORC_REG_R10: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) { orc_warn_current("missing R10 value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->r10; break; case ORC_REG_R13: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) { orc_warn_current("missing R13 value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->r13; break; case ORC_REG_DI: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) { orc_warn_current("missing RDI value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->di; break; case ORC_REG_DX: - if (!state->regs || !state->full_regs) { + if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) { orc_warn_current("missing DX value at %pB\n", (void *)state->ip); goto err; } - sp = state->regs->dx; break; default: @@ -507,6 +532,7 @@ bool unwind_next_frame(struct unwind_state *state) state->sp = sp; state->regs = NULL; + state->prev_regs = NULL; state->signal = false; break; @@ -518,6 +544,7 @@ bool unwind_next_frame(struct unwind_state *state) } state->regs = (struct pt_regs *)sp; + state->prev_regs = NULL; state->full_regs = true; state->signal = true; break; @@ -529,6 +556,8 @@ bool unwind_next_frame(struct unwind_state *state) goto err; } + if (state->full_regs) + state->prev_regs = state->regs; state->regs = (void *)sp - IRET_FRAME_OFFSET; state->full_regs = false; state->signal = true; @@ -543,8 +572,8 @@ bool unwind_next_frame(struct unwind_state *state) /* Find BP: */ switch (orc->bp_reg) { case ORC_REG_UNDEFINED: - if (state->regs && state->full_regs) - state->bp = state->regs->bp; + if (get_reg(state, offsetof(struct pt_regs, bp), &tmp)) + state->bp = tmp; break; case ORC_REG_PREV_SP: -- cgit v1.2.3 From 2faf153bb7346b7dfc895f916edf93a86297ec0a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:32 +0200 Subject: x86/tlb: Move __flush_tlb() out of line cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. As a first step, move __flush_tlb() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled. Consolidate the namespace while at it and remove the pointless extra wrapper in the paravirt code. No functional change. 
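The out-of-line move follows a simple pattern, sketched here (the real hunks are in the tlb.c diff below): the native implementation stays global only when CONFIG_PARAVIRT needs to reference it, and a thin wrapper dispatches either directly or through the paravirt indirection.

	#ifdef CONFIG_PARAVIRT
	# define STATIC_NOPV
	#else
	# define STATIC_NOPV static
	# define __flush_tlb_local native_flush_tlb_local
	#endif

	STATIC_NOPV void native_flush_tlb_local(void)
	{
		/* invalidate the user ASID and rewrite CR3, see the tlb.c hunk below */
	}

	void flush_tlb_local(void)
	{
		__flush_tlb_local();
	}
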
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.246130908@linutronix.de --- arch/x86/include/asm/paravirt.h | 4 +++- arch/x86/include/asm/tlbflush.h | 29 +++++------------------------ arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/paravirt.c | 7 +------ arch/x86/mm/mem_encrypt.c | 2 +- arch/x86/mm/tlb.c | 33 ++++++++++++++++++++++++++++++++- arch/x86/platform/uv/tlb_uv.c | 2 +- 7 files changed, 45 insertions(+), 36 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 694d8daf4983..f412450668d8 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -47,7 +47,9 @@ static inline void slow_down_io(void) #endif } -static inline void __flush_tlb(void) +void native_flush_tlb_local(void); + +static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d804030079da..fe1fd02904ba 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -140,12 +140,13 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; } +void flush_tlb_local(void); + #ifdef CONFIG_PARAVIRT #include #else -#define __flush_tlb() __native_flush_tlb() -#define __flush_tlb_global() __native_flush_tlb_global() -#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) +#define __flush_tlb_global() __native_flush_tlb_global() +#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif struct tlb_context { @@ -370,24 +371,6 @@ static inline void invalidate_user_asid(u16 asid) (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); } -/* - * flush the entire current user mapping - */ -static inline void __native_flush_tlb(void) -{ - /* - * Preemption or interrupts must be disabled to protect the access - * to the per CPU variable and to prevent being preempted between - * read_cr3() and write_cr3(). - */ - WARN_ON_ONCE(preemptible()); - - invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); - - /* If current->mm == NULL then the read_cr3() "borrows" an mm */ - native_write_cr3(__native_read_cr3()); -} - /* * flush everything */ @@ -461,7 +444,7 @@ static inline void __flush_tlb_all(void) /* * !PGE -> !PCID (setup_pcid()), thus every flush is total. 
*/ - __flush_tlb(); + flush_tlb_local(); } } @@ -537,8 +520,6 @@ struct flush_tlb_info { bool freed_tables; }; -#define local_flush_tlb() __flush_tlb() - #define flush_tlb_mm(mm) \ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 51b9190c628b..23ad8e953dfb 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -761,7 +761,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Save MTRR state */ rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); @@ -778,7 +778,7 @@ static void post_set(void) __releases(set_atomicity_lock) { /* Flush TLBs (no need to flush caches - they are disabled) */ count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); + flush_tlb_local(); /* Intel (P6) standard MTRRs */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c131ba4e70ef..4cb3d822ea09 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -160,11 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len, return insn_len; } -static void native_flush_tlb(void) -{ - __native_flush_tlb(); -} - /* * Global pages have to be flushed a bit differently. Not a real * performance problem because this does not happen often. @@ -359,7 +354,7 @@ struct paravirt_patch_template pv_ops = { #endif /* CONFIG_PARAVIRT_XXL */ /* Mmu ops. */ - .mmu.flush_tlb_user = native_flush_tlb, + .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_others = native_flush_tlb_others, diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index a03614bd3e1a..4a781cf99e92 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -134,7 +134,7 @@ static void __init __sme_early_map_unmap_mem(void *vaddr, unsigned long size, size = (size <= PMD_SIZE) ? 0 : size - PMD_SIZE; } while (size); - __native_flush_tlb(); + flush_tlb_local(); } void __init sme_unmap_bootdata(char *real_mode_data) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 3d9d81951962..06116480c343 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -18,6 +18,13 @@ #include "mm_internal.h" +#ifdef CONFIG_PARAVIRT +# define STATIC_NOPV +#else +# define STATIC_NOPV static +# define __flush_tlb_local native_flush_tlb_local +#endif + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -645,7 +652,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, trace_tlb_flush(reason, nr_invalidate); } else { /* Full flush. */ - local_flush_tlb(); + flush_tlb_local(); if (local) count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); trace_tlb_flush(reason, TLB_FLUSH_ALL); @@ -883,6 +890,30 @@ unsigned long __get_current_cr3_fast(void) } EXPORT_SYMBOL_GPL(__get_current_cr3_fast); +/* + * Flush the entire current user mapping + */ +STATIC_NOPV void native_flush_tlb_local(void) +{ + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). 
+ */ + WARN_ON_ONCE(preemptible()); + + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); +} + +void flush_tlb_local(void) +{ + __flush_tlb_local(); +} +EXPORT_SYMBOL_GPL(flush_tlb_local); + /* * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. * This means that the 'struct flush_tlb_info' that describes which mappings to diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 1fd321f37f1b..6af766c47dd2 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -293,7 +293,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, * This must be a normal message, or retry of a normal message */ if (msg->address == TLB_FLUSH_ALL) { - local_flush_tlb(); + flush_tlb_local(); stat->d_alltlb++; } else { __flush_tlb_one_user(msg->address); -- cgit v1.2.3 From cd30d26cf307b45159cd629d60b989e582372afe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:33 +0200 Subject: x86/tlb: Move __flush_tlb_global() out of line cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. As a second step, move __flush_tlb_global() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled. Consolidate the namespace while at it and remove the pointless extra wrapper in the paravirt code. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.336916818@linutronix.de --- arch/x86/include/asm/paravirt.h | 1 + arch/x86/include/asm/tlbflush.h | 38 ++------------------------------------ arch/x86/kernel/paravirt.c | 9 --------- arch/x86/mm/tlb.c | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 44 insertions(+), 45 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index f412450668d8..712e059bc7c6 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -48,6 +48,7 @@ static inline void slow_down_io(void) } void native_flush_tlb_local(void); +void native_flush_tlb_global(void); static inline void __flush_tlb_local(void) { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index fe1fd02904ba..d66d16e3fd67 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -141,11 +141,11 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) } void flush_tlb_local(void); +void flush_tlb_global(void); #ifdef CONFIG_PARAVIRT #include #else -#define __flush_tlb_global() __native_flush_tlb_global() #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif @@ -371,40 +371,6 @@ static inline void invalidate_user_asid(u16 asid) (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); } -/* - * flush everything - */ -static inline void __native_flush_tlb_global(void) -{ - unsigned long cr4, flags; - - if (static_cpu_has(X86_FEATURE_INVPCID)) { - /* - * Using INVPCID is considerably faster than a pair of writes - * to CR4 sandwiched inside an IRQ flag save/restore. - * - * Note, this works with CR4.PCIDE=0 or 1. 
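The single-page flush is moved with the same shape as the previous two steps; a minimal sketch follows (PTI/PCID handling omitted, and the flush_tlb_one_user() wrapper shown here is assumed to mirror the established pattern, since its body falls outside this excerpt):

	STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr)
	{
		asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
		/* with PTI, the user ASID is invalidated as well, see below */
	}

	void flush_tlb_one_user(unsigned long addr)
	{
		__flush_tlb_one_user(addr);
	}
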
- */ - invpcid_flush_all(); - return; - } - - /* - * Read-modify-write to CR4 - protect it from preemption and - * from interrupts. (Use the raw variant because this code can - * be called from deep inside debugging code.) - */ - raw_local_irq_save(flags); - - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* toggle PGE */ - native_write_cr4(cr4 ^ X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); - - raw_local_irq_restore(flags); -} - /* * flush one page in the user mapping */ @@ -439,7 +405,7 @@ static inline void __flush_tlb_all(void) VM_WARN_ON_ONCE(preemptible()); if (boot_cpu_has(X86_FEATURE_PGE)) { - __flush_tlb_global(); + flush_tlb_global(); } else { /* * !PGE -> !PCID (setup_pcid()), thus every flush is total. diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4cb3d822ea09..6094b007979c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -160,15 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len, return insn_len; } -/* - * Global pages have to be flushed a bit differently. Not a real - * performance problem because this does not happen often. - */ -static void native_flush_tlb_global(void) -{ - __native_flush_tlb_global(); -} - static void native_flush_tlb_one_user(unsigned long addr) { __native_flush_tlb_one_user(addr); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 06116480c343..d548b98e5a49 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -23,6 +23,7 @@ #else # define STATIC_NOPV static # define __flush_tlb_local native_flush_tlb_local +# define __flush_tlb_global native_flush_tlb_global #endif /* @@ -890,6 +891,46 @@ unsigned long __get_current_cr3_fast(void) } EXPORT_SYMBOL_GPL(__get_current_cr3_fast); +/* + * Flush everything + */ +STATIC_NOPV void native_flush_tlb_global(void) +{ + unsigned long cr4, flags; + + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. + */ + invpcid_flush_all(); + return; + } + + /* + * Read-modify-write to CR4 - protect it from preemption and + * from interrupts. (Use the raw variant because this code can + * be called from deep inside debugging code.) + */ + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + native_write_cr4(cr4); + + raw_local_irq_restore(flags); +} + +void flush_tlb_global(void) +{ + __flush_tlb_global(); +} +EXPORT_SYMBOL_GPL(flush_tlb_global); + /* * Flush the entire current user mapping */ -- cgit v1.2.3 From 127ac915c8e1c11b8209393e700ca16be0efabe8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:34 +0200 Subject: x86/tlb: Move __flush_tlb_one_user() out of line cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. As a third step, move _flush_tlb_one_user() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled. Consolidate the name space while at it and remove the pointless extra wrapper in the paravirt code. No functional change. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.428213098@linutronix.de --- arch/x86/include/asm/paravirt.h | 1 + arch/x86/include/asm/tlbflush.h | 53 ++------------------------------------ arch/x86/kernel/paravirt.c | 5 ---- arch/x86/mm/tlb.c | 56 ++++++++++++++++++++++++++++++++++++++++- arch/x86/platform/uv/tlb_uv.c | 2 +- 5 files changed, 59 insertions(+), 58 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 712e059bc7c6..dcd6517a694a 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -49,6 +49,7 @@ static inline void slow_down_io(void) void native_flush_tlb_local(void); void native_flush_tlb_global(void); +void native_flush_tlb_one_user(unsigned long addr); static inline void __flush_tlb_local(void) { diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d66d16e3fd67..14c5b98aaa51 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -142,11 +142,10 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) void flush_tlb_local(void); void flush_tlb_global(void); +void flush_tlb_one_user(unsigned long addr); #ifdef CONFIG_PARAVIRT #include -#else -#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr) #endif struct tlb_context { @@ -345,54 +344,6 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) extern void initialize_tlbstate_and_flush(void); -/* - * Given an ASID, flush the corresponding user ASID. We can delay this - * until the next time we switch to it. - * - * See SWITCH_TO_USER_CR3. - */ -static inline void invalidate_user_asid(u16 asid) -{ - /* There is no user ASID if address space separation is off */ - if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) - return; - - /* - * We only have a single ASID if PCID is off and the CR3 - * write will have flushed it. - */ - if (!cpu_feature_enabled(X86_FEATURE_PCID)) - return; - - if (!static_cpu_has(X86_FEATURE_PTI)) - return; - - __set_bit(kern_pcid(asid), - (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); -} - -/* - * flush one page in the user mapping - */ -static inline void __native_flush_tlb_one_user(unsigned long addr) -{ - u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); - - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); - - if (!static_cpu_has(X86_FEATURE_PTI)) - return; - - /* - * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. - * Just use invalidate_user_asid() in case we are called early. - */ - if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) - invalidate_user_asid(loaded_mm_asid); - else - invpcid_flush_one(user_pcid(loaded_mm_asid), addr); -} - /* * flush everything */ @@ -432,7 +383,7 @@ static inline void __flush_tlb_one_kernel(unsigned long addr) * kernel address space and for its usermode counterpart, but it does * not flush it for other address spaces. 
*/ - __flush_tlb_one_user(addr); + flush_tlb_one_user(addr); if (!static_cpu_has(X86_FEATURE_PTI)) return; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6094b007979c..5638e4ae2ea6 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -160,11 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len, return insn_len; } -static void native_flush_tlb_one_user(unsigned long addr) -{ - __native_flush_tlb_one_user(addr); -} - struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index d548b98e5a49..2822602ce60a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -24,6 +24,7 @@ # define STATIC_NOPV static # define __flush_tlb_local native_flush_tlb_local # define __flush_tlb_global native_flush_tlb_global +# define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) #endif /* @@ -118,6 +119,32 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, *need_flush = true; } +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. + */ +static inline void invalidate_user_asid(u16 asid) +{ + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + return; + + /* + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it. + */ + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); +} + static void load_new_mm_cr3(pgd_t *pgdir, u16 new_asid, bool need_flush) { unsigned long new_mm_cr3; @@ -645,7 +672,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, unsigned long addr = f->start; while (addr < f->end) { - __flush_tlb_one_user(addr); + flush_tlb_one_user(addr); addr += 1UL << f->stride_shift; } if (local) @@ -891,6 +918,33 @@ unsigned long __get_current_cr3_fast(void) } EXPORT_SYMBOL_GPL(__get_current_cr3_fast); +/* + * Flush one page in the user mapping + */ +STATIC_NOPV void native_flush_tlb_one_user(unsigned long addr) +{ + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. 
+ */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); +} + +void flush_tlb_one_user(unsigned long addr) +{ + __flush_tlb_one_user(addr); +} + /* * Flush everything */ diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 6af766c47dd2..4ea69690c3e4 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -296,7 +296,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp, flush_tlb_local(); stat->d_alltlb++; } else { - __flush_tlb_one_user(msg->address); + flush_tlb_one_user(msg->address); stat->d_onetlb++; } stat->d_requestee++; -- cgit v1.2.3 From 58430c5dba7bfe1d132b3c07f0d7a596852ef55c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:35 +0200 Subject: x86/tlb: Move __flush_tlb_one_kernel() out of line cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. As a fourth step, move __flush_tlb_one_kernel() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled. Consolidate the name space while at it and remove the pointless extra wrapper in the paravirt code. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.535159540@linutronix.de --- arch/x86/include/asm/pgtable_32.h | 2 +- arch/x86/include/asm/tlbflush.h | 41 +-------------------------------------- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/ioremap.c | 2 +- arch/x86/mm/kmmio.c | 2 +- arch/x86/mm/pat/set_memory.c | 2 +- arch/x86/mm/pgtable_32.c | 2 +- arch/x86/mm/tlb.c | 34 +++++++++++++++++++++++++++++++- 8 files changed, 40 insertions(+), 47 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0dca7f7aeff2..bd2ed47cb067 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -60,7 +60,7 @@ void sync_initial_page_table(void); #define kpte_clear_flush(ptep, vaddr) \ do { \ pte_clear(&init_mm, (vaddr), (ptep)); \ - __flush_tlb_one_kernel((vaddr)); \ + flush_tlb_one_kernel((vaddr)); \ } while (0) #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 14c5b98aaa51..bbb94f05e1f3 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -143,6 +143,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) void flush_tlb_local(void); void flush_tlb_global(void); void flush_tlb_one_user(unsigned long addr); +void flush_tlb_one_kernel(unsigned long addr); #ifdef CONFIG_PARAVIRT #include @@ -317,14 +318,6 @@ static inline void cr4_clear_bits(unsigned long mask) local_irq_restore(flags); } -/* - * Mark all other ASIDs as invalid, preserves the current. - */ -static inline void invalidate_other_asid(void) -{ - this_cpu_write(cpu_tlbstate.invalidate_other, true); -} - /* * Save some of cr4 feature set we're using (e.g. 
Pentium 4MB * enable and PPro Global page enable), so that any CPU's that boot @@ -365,38 +358,6 @@ static inline void __flush_tlb_all(void) } } -/* - * flush one page in the kernel mapping - */ -static inline void __flush_tlb_one_kernel(unsigned long addr) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); - - /* - * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its - * paravirt equivalent. Even with PCID, this is sufficient: we only - * use PCID if we also use global PTEs for the kernel mapping, and - * INVLPG flushes global translations across all address spaces. - * - * If PTI is on, then the kernel is mapped with non-global PTEs, and - * __flush_tlb_one_user() will flush the given address for the current - * kernel address space and for its usermode counterpart, but it does - * not flush it for other address spaces. - */ - flush_tlb_one_user(addr); - - if (!static_cpu_has(X86_FEATURE_PTI)) - return; - - /* - * See above. We need to propagate the flush to all other address - * spaces. In principle, we only need to propagate it to kernelmode - * address spaces, but the extra bookkeeping we would need is not - * worth it. - */ - invalidate_other_asid(); -} - #define TLB_FLUSH_ALL -1UL /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9a497ba02440..c7a1c6c23431 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -298,7 +298,7 @@ static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) * It's enough to flush this one mapping. * (PGE mappings get flushed as well) */ - __flush_tlb_one_kernel(vaddr); + flush_tlb_one_kernel(vaddr); } void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 41536f523a5f..986d57534fd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -885,5 +885,5 @@ void __init __early_set_fixmap(enum fixed_addresses idx, set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); else pte_clear(&init_mm, addr, pte); - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 9994353fb75d..dd625898425a 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -173,7 +173,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear) return -1; } - __flush_tlb_one_kernel(f->addr); + flush_tlb_one_kernel(f->addr); return 0; } diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index a28f0c345303..b7fb1f05f257 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -345,7 +345,7 @@ static void __cpa_flush_tlb(void *data) unsigned int i; for (i = 0; i < cpa->numpages; i++) - __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); + flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static void cpa_flush(struct cpa_data *data, int cache) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 0e6700eaa4f9..e1ce59dc558f 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -64,7 +64,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) * It's enough to flush this one mapping. 
* (PGE mappings get flushed as well) */ - __flush_tlb_one_kernel(vaddr); + flush_tlb_one_kernel(vaddr); } unsigned long __FIXADDR_TOP = 0xfffff000; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 2822602ce60a..ad217ed2a74f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -876,7 +876,7 @@ static void do_kernel_range_flush(void *info) /* flush range by one by one 'invlpg' */ for (addr = f->start; addr < f->end; addr += PAGE_SIZE) - __flush_tlb_one_kernel(addr); + flush_tlb_one_kernel(addr); } void flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -918,6 +918,38 @@ unsigned long __get_current_cr3_fast(void) } EXPORT_SYMBOL_GPL(__get_current_cr3_fast); +/* + * Flush one page in the kernel mapping + */ +void flush_tlb_one_kernel(unsigned long addr) +{ + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + + /* + * If PTI is off, then __flush_tlb_one_user() is just INVLPG or its + * paravirt equivalent. Even with PCID, this is sufficient: we only + * use PCID if we also use global PTEs for the kernel mapping, and + * INVLPG flushes global translations across all address spaces. + * + * If PTI is on, then the kernel is mapped with non-global PTEs, and + * __flush_tlb_one_user() will flush the given address for the current + * kernel address space and for its usermode counterpart, but it does + * not flush it for other address spaces. + */ + flush_tlb_one_user(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * See above. We need to propagate the flush to all other address + * spaces. In principle, we only need to propagate it to kernelmode + * address spaces, but the extra bookkeeping we would need is not + * worth it. + */ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + /* * Flush one page in the user mapping */ -- cgit v1.2.3 From 29def599b38bb8a10f48f83821dd990615300b04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:36 +0200 Subject: x86/tlb: Move flush_tlb_others() out of line cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. As a last step, move __flush_tlb_others() out of line and hide the native function. The latter can be static when CONFIG_PARAVIRT is disabled. No functional change. 
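The reason the native function cannot simply become static when CONFIG_PARAVIRT
is enabled is that the default pv_ops entries reference it directly. An
abridged sketch of the relevant slice of the initializer in
arch/x86/kernel/paravirt.c, with the field names as they look after this
series:

/* Abridged sketch: the default mmu ops point at the global native_* copies. */
struct paravirt_patch_template pv_ops = {
	/* ... */
	.mmu.flush_tlb_user	= native_flush_tlb_local,
	.mmu.flush_tlb_kernel	= native_flush_tlb_global,
	.mmu.flush_tlb_one_user	= native_flush_tlb_one_user,
	.mmu.flush_tlb_others	= native_flush_tlb_others,
	/* ... */
};

Everything else in the tree only ever sees the flush_tlb_*() wrappers, which is
what allows the cpu_tlbstate export to be dropped later in the series.
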
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.641957686@linutronix.de --- arch/x86/include/asm/paravirt.h | 6 ++++-- arch/x86/include/asm/tlbflush.h | 10 ++++------ arch/x86/mm/tlb.c | 11 +++++++++-- 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index dcd6517a694a..5ca5d297df75 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -50,6 +50,8 @@ static inline void slow_down_io(void) void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); +void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { @@ -66,8 +68,8 @@ static inline void __flush_tlb_one_user(unsigned long addr) PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } -static inline void flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +static inline void __flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info); } diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index bbb94f05e1f3..d064ae8a0f2a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -140,10 +140,14 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; } +struct flush_tlb_info; + void flush_tlb_local(void); void flush_tlb_global(void); void flush_tlb_one_user(unsigned long addr); void flush_tlb_one_kernel(unsigned long addr); +void flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); #ifdef CONFIG_PARAVIRT #include @@ -418,9 +422,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, PAGE_SHIFT, false); } -void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info); - static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) { /* @@ -442,9 +443,6 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); #ifndef CONFIG_PARAVIRT -#define flush_tlb_others(mask, info) \ - native_flush_tlb_others(mask, info) - #define paravirt_tlb_remove_table(tlb, page) \ tlb_remove_page(tlb, (void *)(page)) #endif diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ad217ed2a74f..209799dabc70 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -25,6 +25,7 @@ # define __flush_tlb_local native_flush_tlb_local # define __flush_tlb_global native_flush_tlb_global # define __flush_tlb_one_user(addr) native_flush_tlb_one_user(addr) +# define __flush_tlb_others(msk, info) native_flush_tlb_others(msk, info) #endif /* @@ -715,8 +716,8 @@ static bool tlb_is_not_lazy(int cpu, void *data) return !per_cpu(cpu_tlbstate.is_lazy, cpu); } -void native_flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info) +STATIC_NOPV void native_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (info->end == TLB_FLUSH_ALL) @@ -766,6 +767,12 @@ void native_flush_tlb_others(const struct cpumask *cpumask, (void 
*)info, 1, cpumask); } +void flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + __flush_tlb_others(cpumask, info); +} + /* * See Documentation/x86/tlb.rst for details. We choose 33 * because it is large enough to cover the vast majority (at -- cgit v1.2.3 From 4b04e6c236744635eb4852bd9690172734fa0a1c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:37 +0200 Subject: x86/tlb: Move __flush_tlb_all() out of line Reduce the number of required exports to one and make flush_tlb_global() static to the TLB code. flush_tlb_local() cannot be confined to the TLB code as the MTRR handling requires a PGE-less flush. Suggested-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200421092559.740388137@linutronix.de --- arch/x86/include/asm/tlbflush.h | 23 +---------------------- arch/x86/mm/tlb.c | 29 ++++++++++++++++++++++------- 2 files changed, 23 insertions(+), 29 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d064ae8a0f2a..7401c6cd1ffc 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -142,8 +142,8 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid) struct flush_tlb_info; +void __flush_tlb_all(void); void flush_tlb_local(void); -void flush_tlb_global(void); void flush_tlb_one_user(unsigned long addr); void flush_tlb_one_kernel(unsigned long addr); void flush_tlb_others(const struct cpumask *cpumask, @@ -341,27 +341,6 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) extern void initialize_tlbstate_and_flush(void); -/* - * flush everything - */ -static inline void __flush_tlb_all(void) -{ - /* - * This is to catch users with enabled preemption and the PGE feature - * and don't trigger the warning in __native_flush_tlb(). - */ - VM_WARN_ON_ONCE(preemptible()); - - if (boot_cpu_has(X86_FEATURE_PGE)) { - flush_tlb_global(); - } else { - /* - * !PGE -> !PCID (setup_pcid()), thus every flush is total. - */ - flush_tlb_local(); - } -} - #define TLB_FLUSH_ALL -1UL /* diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 209799dabc70..aabf8c7377e3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1018,12 +1018,6 @@ STATIC_NOPV void native_flush_tlb_global(void) raw_local_irq_restore(flags); } -void flush_tlb_global(void) -{ - __flush_tlb_global(); -} -EXPORT_SYMBOL_GPL(flush_tlb_global); - /* * Flush the entire current user mapping */ @@ -1046,7 +1040,28 @@ void flush_tlb_local(void) { __flush_tlb_local(); } -EXPORT_SYMBOL_GPL(flush_tlb_local); + +/* + * Flush everything + */ +void __flush_tlb_all(void) +{ + /* + * This is to catch users with enabled preemption and the PGE feature + * and don't trigger the warning in __native_flush_tlb(). + */ + VM_WARN_ON_ONCE(preemptible()); + + if (boot_cpu_has(X86_FEATURE_PGE)) { + __flush_tlb_global(); + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. + */ + flush_tlb_local(); + } +} +EXPORT_SYMBOL_GPL(__flush_tlb_all); /* * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. -- cgit v1.2.3 From 69de6c1a7fc730260d39f09432d69abc99f5f344 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:38 +0200 Subject: x86/tlb: Move paravirt_tlb_remove_table() to the usage site Move it where the only user is. 
Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.849801011@linutronix.de --- arch/x86/include/asm/tlbflush.h | 5 ----- arch/x86/mm/pgtable.c | 8 ++++++++ 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 7401c6cd1ffc..c22fc72c126d 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -421,9 +421,4 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); -#ifndef CONFIG_PARAVIRT -#define paravirt_tlb_remove_table(tlb, page) \ - tlb_remove_page(tlb, (void *)(page)) -#endif - #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c54d1d0a8e3b..d88e9064c28e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -19,6 +19,14 @@ EXPORT_SYMBOL(physical_mask); #define PGTABLE_HIGHMEM 0 #endif +#ifndef CONFIG_PARAVIRT +static inline +void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +{ + tlb_remove_page(tlb, table); +} +#endif + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; pgtable_t pte_alloc_one(struct mm_struct *mm) -- cgit v1.2.3 From 96f59fe291d2cdc0fcb6f5f2f4b7c9cea9533fc3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:39 +0200 Subject: x86/tlb: Move cr4_set_bits_and_update_boot() to the usage site No point in having this exposed. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092559.940978251@linutronix.de --- arch/x86/include/asm/tlbflush.h | 14 -------------- arch/x86/mm/init.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index c22fc72c126d..917deea058d5 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -322,23 +322,9 @@ static inline void cr4_clear_bits(unsigned long mask) local_irq_restore(flags); } -/* - * Save some of cr4 feature set we're using (e.g. Pentium 4MB - * enable and PPro Global page enable), so that any CPU's that boot - * up after us can get the correct flags. This should only be used - * during boot on the boot cpu. - */ extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; -static inline void cr4_set_bits_and_update_boot(unsigned long mask) -{ - mmu_cr4_features |= mask; - if (trampoline_cr4_features) - *trampoline_cr4_features = mmu_cr4_features; - cr4_set_bits(mask); -} - extern void initialize_tlbstate_and_flush(void); #define TLB_FLUSH_ALL -1UL diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 71720dd8f28a..d37e8164022e 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -194,6 +194,19 @@ struct map_range { static int page_size_mask; +/* + * Save some of cr4 feature set we're using (e.g. Pentium 4MB + * enable and PPro Global page enable), so that any CPU's that boot + * up after us can get the correct flags. Invoked on the boot CPU. 
+ */ +static inline void cr4_set_bits_and_update_boot(unsigned long mask) +{ + mmu_cr4_features |= mask; + if (trampoline_cr4_features) + *trampoline_cr4_features = mmu_cr4_features; + cr4_set_bits(mask); +} + static void __init probe_page_size_mask(void) { /* -- cgit v1.2.3 From af5c40c6ee057c5354930abdc4d34be013d0e9e0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:40 +0200 Subject: x86/tlb: Uninline nmi_uaccess_okay() cpu_tlbstate is exported because various TLB-related functions need access to it, but cpu_tlbstate is sensitive information which should only be accessed by well-contained kernel functions and not be directly exposed to modules. nmi_access_ok() is the last inline function which requires access to cpu_tlbstate. Move it into the TLB code. No functional change. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092600.052543007@linutronix.de --- arch/x86/include/asm/tlbflush.h | 33 +-------------------------------- arch/x86/mm/tlb.c | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 32 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 917deea058d5..1c17f5a6cb53 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -247,38 +247,7 @@ struct tlb_state { }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); -/* - * Blindly accessing user memory from NMI context can be dangerous - * if we're in the middle of switching the current user task or - * switching the loaded mm. It can also be dangerous if we - * interrupted some kernel code that was temporarily using a - * different mm. - */ -static inline bool nmi_uaccess_okay(void) -{ - struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); - struct mm_struct *current_mm = current->mm; - - VM_WARN_ON_ONCE(!loaded_mm); - - /* - * The condition we want to check is - * current_mm->pgd == __va(read_cr3_pa()). This may be slow, though, - * if we're running in a VM with shadow paging, and nmi_uaccess_okay() - * is supposed to be reasonably fast. - * - * Instead, we check the almost equivalent but somewhat conservative - * condition below, and we rely on the fact that switch_mm_irqs_off() - * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. - */ - if (loaded_mm != current_mm) - return false; - - VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); - - return true; -} - +bool nmi_uaccess_okay(void); #define nmi_uaccess_okay nmi_uaccess_okay void cr4_update_irqsoff(unsigned long set, unsigned long clear); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index aabf8c7377e3..45426ae8e7d7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -1094,6 +1094,38 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) put_cpu(); } +/* + * Blindly accessing user memory from NMI context can be dangerous + * if we're in the middle of switching the current user task or + * switching the loaded mm. It can also be dangerous if we + * interrupted some kernel code that was temporarily using a + * different mm. + */ +bool nmi_uaccess_okay(void) +{ + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + struct mm_struct *current_mm = current->mm; + + VM_WARN_ON_ONCE(!loaded_mm); + + /* + * The condition we want to check is + * current_mm->pgd == __va(read_cr3_pa()). 
This may be slow, though, + * if we're running in a VM with shadow paging, and nmi_uaccess_okay() + * is supposed to be reasonably fast. + * + * Instead, we check the almost equivalent but somewhat conservative + * condition below, and we rely on the fact that switch_mm_irqs_off() + * sets loaded_mm to LOADED_MM_SWITCHING before writing to CR3. + */ + if (loaded_mm != current_mm) + return false; + + VM_WARN_ON_ONCE(current_mm->pgd != __va(read_cr3_pa())); + + return true; +} + static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { -- cgit v1.2.3 From 6c9b7d79a801074837c683fc996e231266ca47ae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:41 +0200 Subject: x86/tlb: Move PCID helpers where they are used Aside of the fact that they are used only in the TLB code, especially having the comment close to the actual implementation makes a lot of sense. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092600.145772183@linutronix.de --- arch/x86/include/asm/tlbflush.h | 133 ++-------------------------------------- arch/x86/mm/tlb.c | 120 ++++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 127 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 1c17f5a6cb53..f9731219a28d 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -13,133 +13,6 @@ #include #include -/* - * The x86 feature is called PCID (Process Context IDentifier). It is similar - * to what is traditionally called ASID on the RISC processors. - * - * We don't use the traditional ASID implementation, where each process/mm gets - * its own ASID and flush/restart when we run out of ASID space. - * - * Instead we have a small per-cpu array of ASIDs and cache the last few mm's - * that came by on this CPU, allowing cheaper switch_mm between processes on - * this CPU. - * - * We end up with different spaces for different things. To avoid confusion we - * use different names for each of them: - * - * ASID - [0, TLB_NR_DYN_ASIDS-1] - * the canonical identifier for an mm - * - * kPCID - [1, TLB_NR_DYN_ASIDS] - * the value we write into the PCID part of CR3; corresponds to the - * ASID+1, because PCID 0 is special. - * - * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] - * for KPTI each mm has two address spaces and thus needs two - * PCID values, but we can still do with a single ASID denomination - * for each mm. Corresponds to kPCID + 2048. - * - */ - -/* There are 12 bits of space for ASIDS in CR3 */ -#define CR3_HW_ASID_BITS 12 - -/* - * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for - * user/kernel switches - */ -#ifdef CONFIG_PAGE_TABLE_ISOLATION -# define PTI_CONSUMED_PCID_BITS 1 -#else -# define PTI_CONSUMED_PCID_BITS 0 -#endif - -#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) - -/* - * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account - * for them being zero-based. Another -1 is because PCID 0 is reserved for - * use by non-PCID-aware users. - */ -#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) - -/* - * 6 because 6 should be plenty and struct tlb_state will fit in two cache - * lines. 
- */ -#define TLB_NR_DYN_ASIDS 6 - -/* - * Given @asid, compute kPCID - */ -static inline u16 kern_pcid(u16 asid) -{ - VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); - -#ifdef CONFIG_PAGE_TABLE_ISOLATION - /* - * Make sure that the dynamic ASID space does not confict with the - * bit we are using to switch between user and kernel ASIDs. - */ - BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); - - /* - * The ASID being passed in here should have respected the - * MAX_ASID_AVAILABLE and thus never have the switch bit set. - */ - VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); -#endif - /* - * The dynamically-assigned ASIDs that get passed in are small - * ( MAX_ASID_AVAILABLE); - /* - * Use boot_cpu_has() instead of this_cpu_has() as this function - * might be called during early boot. This should work even after - * boot because all CPU's the have same capabilities: - */ - VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); - return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; -} - struct flush_tlb_info; void __flush_tlb_all(void); @@ -153,6 +26,12 @@ void flush_tlb_others(const struct cpumask *cpumask, #include #endif +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. + */ +#define TLB_NR_DYN_ASIDS 6 + struct tlb_context { u64 ctx_id; u64 tlb_gen; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 45426ae8e7d7..cf81902e6992 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -48,6 +48,126 @@ */ #define LAST_USER_MM_IBPB 0x1UL +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. + * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ + +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS 12 + +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid) +{ + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); + +#ifdef CONFIG_PAGE_TABLE_ISOLATION + /* + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. 
+ */ + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); + + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * ( MAX_ASID_AVAILABLE); + /* + * Use boot_cpu_has() instead of this_cpu_has() as this function + * might be called during early boot. This should work even after + * boot because all CPU's the have same capabilities: + */ + VM_WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_PCID)); + return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; +} + /* * We get here when we do something requiring a TLB invalidation * but could not go invalidate all of the contexts. We do the -- cgit v1.2.3 From bfe3d8f6313d1e10806062ba22c5f660dddecbcc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Apr 2020 11:20:43 +0200 Subject: x86/tlb: Restrict access to tlbstate Hide tlbstate, flush_tlb_info and related helpers when tlbflush.h is included from a module. Modules have absolutely no business with these internals. Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200421092600.328438734@linutronix.de --- arch/x86/include/asm/tlbflush.h | 96 +++++++++++++++++++++-------------------- arch/x86/mm/init.c | 1 - 2 files changed, 49 insertions(+), 48 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index f9731219a28d..8c87a2e0b660 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -13,19 +13,46 @@ #include #include -struct flush_tlb_info; - void __flush_tlb_all(void); -void flush_tlb_local(void); -void flush_tlb_one_user(unsigned long addr); -void flush_tlb_one_kernel(unsigned long addr); -void flush_tlb_others(const struct cpumask *cpumask, - const struct flush_tlb_info *info); -#ifdef CONFIG_PARAVIRT -#include -#endif +#define TLB_FLUSH_ALL -1UL + +void cr4_update_irqsoff(unsigned long set, unsigned long clear); +unsigned long cr4_read_shadow(void); + +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits_irqsoff(unsigned long mask) +{ + cr4_update_irqsoff(mask, 0); +} +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits_irqsoff(unsigned long mask) +{ + cr4_update_irqsoff(0, mask); +} + +/* Set in this cpu's CR4. */ +static inline void cr4_set_bits(unsigned long mask) +{ + unsigned long flags; + + local_irq_save(flags); + cr4_set_bits_irqsoff(mask); + local_irq_restore(flags); +} + +/* Clear in this cpu's CR4. */ +static inline void cr4_clear_bits(unsigned long mask) +{ + unsigned long flags; + + local_irq_save(flags); + cr4_clear_bits_irqsoff(mask); + local_irq_restore(flags); +} + +#ifndef MODULE /* * 6 because 6 should be plenty and struct tlb_state will fit in two cache * lines. @@ -129,54 +156,17 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); bool nmi_uaccess_okay(void); #define nmi_uaccess_okay nmi_uaccess_okay -void cr4_update_irqsoff(unsigned long set, unsigned long clear); -unsigned long cr4_read_shadow(void); - /* Initialize cr4 shadow for this CPU. */ static inline void cr4_init_shadow(void) { this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } -/* Set in this cpu's CR4. */ -static inline void cr4_set_bits_irqsoff(unsigned long mask) -{ - cr4_update_irqsoff(mask, 0); -} - -/* Clear in this cpu's CR4. 
*/ -static inline void cr4_clear_bits_irqsoff(unsigned long mask) -{ - cr4_update_irqsoff(0, mask); -} - -/* Set in this cpu's CR4. */ -static inline void cr4_set_bits(unsigned long mask) -{ - unsigned long flags; - - local_irq_save(flags); - cr4_set_bits_irqsoff(mask); - local_irq_restore(flags); -} - -/* Clear in this cpu's CR4. */ -static inline void cr4_clear_bits(unsigned long mask) -{ - unsigned long flags; - - local_irq_save(flags); - cr4_clear_bits_irqsoff(mask); - local_irq_restore(flags); -} - extern unsigned long mmu_cr4_features; extern u32 *trampoline_cr4_features; extern void initialize_tlbstate_and_flush(void); -#define TLB_FLUSH_ALL -1UL - /* * TLB flushing: * @@ -215,6 +205,16 @@ struct flush_tlb_info { bool freed_tables; }; +void flush_tlb_local(void); +void flush_tlb_one_user(unsigned long addr); +void flush_tlb_one_kernel(unsigned long addr); +void flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + +#ifdef CONFIG_PARAVIRT +#include +#endif + #define flush_tlb_mm(mm) \ flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL, true) @@ -255,4 +255,6 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch); +#endif /* !MODULE */ + #endif /* _ASM_X86_TLBFLUSH_H */ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d37e8164022e..248dc8fe43c5 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -992,7 +992,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .next_asid = 1, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; -EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { -- cgit v1.2.3 From 1ff865e343c2b59469d7e41d370a980a3f972c71 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Apr 2020 19:57:59 +0200 Subject: x86,smap: Fix smap_{save,restore}() alternatives As reported by objtool: lib/ubsan.o: warning: objtool: .altinstr_replacement+0x0: alternative modifies stack lib/ubsan.o: warning: objtool: .altinstr_replacement+0x7: alternative modifies stack the smap_{save,restore}() alternatives violate (the newly enforced) rule on stack invariance. That is, due to there only being a single ORC table it must be valid to any alternative. These alternatives violate this with the direct result that unwinds will not be correct when it hits between the PUSH and POP instructions. Rewrite the functions to only have a conditional jump. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Miroslav Benes Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200429101802.GI13592@hirez.programming.kicks-ass.net --- arch/x86/include/asm/smap.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 27c47d183f4b..8b58d6975d5d 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -57,8 +57,10 @@ static __always_inline unsigned long smap_save(void) { unsigned long flags; - asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, - X86_FEATURE_SMAP) + asm volatile ("# smap_save\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "pushf; pop %0; " __ASM_CLAC "\n\t" + "1:" : "=rm" (flags) : : "memory", "cc"); return flags; @@ -66,7 +68,10 @@ static __always_inline unsigned long smap_save(void) static __always_inline void smap_restore(unsigned long flags) { - asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + asm volatile ("# smap_restore\n\t" + ALTERNATIVE("jmp 1f", "", X86_FEATURE_SMAP) + "push %0; popf\n\t" + "1:" : : "g" (flags) : "memory", "cc"); } -- cgit v1.2.3 From 089dd8e53126ebaf506e2dc0bf89d652c36bfc12 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 14 Apr 2020 12:36:16 +0200 Subject: x86/speculation: Change FILL_RETURN_BUFFER to work with objtool Change FILL_RETURN_BUFFER so that objtool groks it and can generate correct ORC unwind information. - Since ORC is alternative invariant; that is, all alternatives should have the same ORC entries, the __FILL_RETURN_BUFFER body can not be part of an alternative. Therefore, move it out of the alternative and keep the alternative as a sort of jump_label around it. - Use the ANNOTATE_INTRA_FUNCTION_CALL annotation to white-list these 'funny' call instructions to nowhere. - Use UNWIND_HINT_EMPTY to 'fill' the speculation traps, otherwise objtool will consider them unreachable. - Move the RSP adjustment into the loop, such that the loop has a deterministic stack layout. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Alexandre Chartre Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200428191700.032079304@infradead.org --- arch/x86/include/asm/nospec-branch.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7e9a281e2660..b8890e18f624 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -4,11 +4,13 @@ #define _ASM_X86_NOSPEC_BRANCH_H_ #include +#include #include #include #include #include +#include /* * This should be used immediately before a retpoline alternative. 
It tells @@ -46,21 +48,25 @@ #define __FILL_RETURN_BUFFER(reg, nr, sp) \ mov $(nr/2), reg; \ 771: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 772f; \ 773: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 773b; \ 772: \ + ANNOTATE_INTRA_FUNCTION_CALL; \ call 774f; \ 775: /* speculation trap */ \ + UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 775b; \ 774: \ + add $(BITS_PER_LONG/8) * 2, sp; \ dec reg; \ - jnz 771b; \ - add $(BITS_PER_LONG/8) * nr, sp; + jnz 771b; #ifdef __ASSEMBLY__ @@ -137,10 +143,8 @@ */ .macro FILL_RETURN_BUFFER reg:req nr:req ftr:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE "jmp .Lskip_rsb_\@", \ - __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ - \ftr + ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr + __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) .Lskip_rsb_\@: #endif .endm -- cgit v1.2.3 From ca3f0d80dd57c8828bfb5bc0bc79750ea7a1ba26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Apr 2020 17:03:22 +0200 Subject: x86: Simplify retpoline declaration Because of how KSYM works, we need one declaration per line. Seeing how we're going to be doubling the amount of retpoline symbols, simplify the machinery in order to avoid having to copy/paste even more. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200428191700.091696925@infradead.org --- arch/x86/include/asm/GEN-for-each-reg.h | 25 ++++++++++++++++++++++ arch/x86/include/asm/asm-prototypes.h | 28 +++++++------------------ arch/x86/lib/retpoline.S | 37 ++++++++++++++------------------- 3 files changed, 49 insertions(+), 41 deletions(-) create mode 100644 arch/x86/include/asm/GEN-for-each-reg.h (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/GEN-for-each-reg.h b/arch/x86/include/asm/GEN-for-each-reg.h new file mode 100644 index 000000000000..1b07fb102c4e --- /dev/null +++ b/arch/x86/include/asm/GEN-for-each-reg.h @@ -0,0 +1,25 @@ +#ifdef CONFIG_64BIT +GEN(rax) +GEN(rbx) +GEN(rcx) +GEN(rdx) +GEN(rsi) +GEN(rdi) +GEN(rbp) +GEN(r8) +GEN(r9) +GEN(r10) +GEN(r11) +GEN(r12) +GEN(r13) +GEN(r14) +GEN(r15) +#else +GEN(eax) +GEN(ebx) +GEN(ecx) +GEN(edx) +GEN(esi) +GEN(edi) +GEN(ebp) +#endif diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index ce92c4acc913..aa7585e1fe67 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -17,24 +17,12 @@ extern void cmpxchg8b_emu(void); #endif #ifdef CONFIG_RETPOLINE -#ifdef CONFIG_X86_32 -#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); -#else -#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); -INDIRECT_THUNK(8) -INDIRECT_THUNK(9) -INDIRECT_THUNK(10) -INDIRECT_THUNK(11) -INDIRECT_THUNK(12) -INDIRECT_THUNK(13) -INDIRECT_THUNK(14) -INDIRECT_THUNK(15) -#endif -INDIRECT_THUNK(ax) -INDIRECT_THUNK(bx) -INDIRECT_THUNK(cx) -INDIRECT_THUNK(dx) -INDIRECT_THUNK(si) -INDIRECT_THUNK(di) -INDIRECT_THUNK(bp) + +#define DECL_INDIRECT_THUNK(reg) \ + extern asmlinkage void __x86_indirect_thunk_ ## reg (void); + +#undef GEN +#define GEN(reg) DECL_INDIRECT_THUNK(reg) +#include + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 363ec132df7e..9cc5480300c9 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -24,25 +24,20 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) * only see one instance of "__x86_indirect_thunk_\reg" rather * than one per register with the correct names. 
So we do it * the simple and nasty way... + * + * Worse, you can only have a single EXPORT_SYMBOL per line, + * and CPP can't insert newlines, so we have to repeat everything + * at least twice. */ -#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) -#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) -#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) - -GENERATE_THUNK(_ASM_AX) -GENERATE_THUNK(_ASM_BX) -GENERATE_THUNK(_ASM_CX) -GENERATE_THUNK(_ASM_DX) -GENERATE_THUNK(_ASM_SI) -GENERATE_THUNK(_ASM_DI) -GENERATE_THUNK(_ASM_BP) -#ifdef CONFIG_64BIT -GENERATE_THUNK(r8) -GENERATE_THUNK(r9) -GENERATE_THUNK(r10) -GENERATE_THUNK(r11) -GENERATE_THUNK(r12) -GENERATE_THUNK(r13) -GENERATE_THUNK(r14) -GENERATE_THUNK(r15) -#endif + +#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) +#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) + +#undef GEN +#define GEN(reg) THUNK reg +#include + +#undef GEN +#define GEN(reg) EXPORT_THUNK(reg) +#include + -- cgit v1.2.3 From 34fdce6981b96920ced4e0ee56e9db3fb03a33f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Apr 2020 17:16:40 +0200 Subject: x86: Change {JMP,CALL}_NOSPEC argument In order to change the {JMP,CALL}_NOSPEC macros to call out-of-line versions of the retpoline magic, we need to remove the '%' from the argument, such that we can paste it onto symbol names. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200428191700.151623523@infradead.org --- arch/x86/crypto/aesni-intel_asm.S | 4 ++-- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 2 +- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 2 +- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 26 +++++++++++++------------- arch/x86/entry/entry_32.S | 6 +++--- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/nospec-branch.h | 16 ++++++++-------- arch/x86/kernel/ftrace_32.S | 2 +- arch/x86/kernel/ftrace_64.S | 4 ++-- arch/x86/lib/checksum_32.S | 4 ++-- arch/x86/platform/efi/efi_stub_64.S | 2 +- 11 files changed, 35 insertions(+), 35 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index cad6e1bfa7d5..54e7d15dbd0d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -2758,7 +2758,7 @@ SYM_FUNC_START(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - CALL_NOSPEC %r11 + CALL_NOSPEC r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2803,7 +2803,7 @@ SYM_FUNC_START(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - CALL_NOSPEC %r11 + CALL_NOSPEC r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index d01ddd73de65..ecc0a9a905c4 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -1228,7 +1228,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_16way) vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - CALL_NOSPEC %r9; + CALL_NOSPEC r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 563ef6e83cdd..0907243c501c 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -1339,7 +1339,7 @@ SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way) vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - CALL_NOSPEC %r9; + CALL_NOSPEC r9; 
addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 0e6690e3618c..8501ec4532f4 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -75,7 +75,7 @@ .text SYM_FUNC_START(crc_pcl) -#define bufp %rdi +#define bufp rdi #define bufp_dw %edi #define bufp_w %di #define bufp_b %dil @@ -105,9 +105,9 @@ SYM_FUNC_START(crc_pcl) ## 1) ALIGN: ################################################################ - mov bufp, bufptmp # rdi = *buf - neg bufp - and $7, bufp # calculate the unalignment amount of + mov %bufp, bufptmp # rdi = *buf + neg %bufp + and $7, %bufp # calculate the unalignment amount of # the address je proc_block # Skip if aligned @@ -123,13 +123,13 @@ SYM_FUNC_START(crc_pcl) do_align: #### Calculate CRC of unaligned bytes of the buffer (if any) movq (bufptmp), tmp # load a quadward from the buffer - add bufp, bufptmp # align buffer pointer for quadword + add %bufp, bufptmp # align buffer pointer for quadword # processing - sub bufp, len # update buffer length + sub %bufp, len # update buffer length align_loop: crc32b %bl, crc_init_dw # compute crc32 of 1-byte shr $8, tmp # get next byte - dec bufp + dec %bufp jne align_loop proc_block: @@ -169,10 +169,10 @@ continue_block: xor crc2, crc2 ## branch into array - lea jump_table(%rip), bufp - movzxw (bufp, %rax, 2), len - lea crc_array(%rip), bufp - lea (bufp, len, 1), bufp + lea jump_table(%rip), %bufp + movzxw (%bufp, %rax, 2), len + lea crc_array(%rip), %bufp + lea (%bufp, len, 1), %bufp JMP_NOSPEC bufp ################################################################ @@ -218,9 +218,9 @@ LABEL crc_ %i ## 4) Combine three results: ################################################################ - lea (K_table-8)(%rip), bufp # first entry is for idx 1 + lea (K_table-8)(%rip), %bufp # first entry is for idx 1 shlq $3, %rax # rax *= 8 - pmovzxdq (bufp,%rax), %xmm0 # 2 consts: K1:K2 + pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 leal (%eax,%eax,2), %eax # rax *= 3 (total *24) subq %rax, tmp # tmp -= rax*24 diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b67bae7091d7..7e7ffb7a5147 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -816,7 +816,7 @@ SYM_CODE_START(ret_from_fork) /* kernel thread */ 1: movl %edi, %eax - CALL_NOSPEC %ebx + CALL_NOSPEC ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -1501,7 +1501,7 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi + CALL_NOSPEC edi jmp ret_from_exception SYM_CODE_END(common_exception_read_cr2) @@ -1522,7 +1522,7 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception) TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - CALL_NOSPEC %edi + CALL_NOSPEC edi jmp ret_from_exception SYM_CODE_END(common_exception) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0e9504fabe52..168b798913bc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -349,7 +349,7 @@ SYM_CODE_START(ret_from_fork) /* kernel thread */ UNWIND_HINT_EMPTY movq %r12, %rdi - CALL_NOSPEC %rbx + CALL_NOSPEC rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). 
Exit to userspace to complete the execve() diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index b8890e18f624..d3269b69728a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -118,22 +118,22 @@ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ - __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ + __stringify(RETPOLINE_JMP %\reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD #else - jmp *\reg + jmp *%\reg #endif .endm .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ - __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg),\ + __stringify(RETPOLINE_CALL %\reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD #else - call *\reg + call *%\reg #endif .endm diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index e8a9f8370112..e405fe1a8bf4 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -189,5 +189,5 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - JMP_NOSPEC %ecx + JMP_NOSPEC ecx #endif diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 9738ed23964e..aa5d28aeb31e 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -301,7 +301,7 @@ trace: * function tracing is enabled. */ movq ftrace_trace_function, %r8 - CALL_NOSPEC %r8 + CALL_NOSPEC r8 restore_mcount_regs jmp fgraph_trace @@ -338,6 +338,6 @@ SYM_CODE_START(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - JMP_NOSPEC %rdi + JMP_NOSPEC rdi SYM_CODE_END(return_to_handler) #endif diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index 4742e8fa7ee7..d1d768912368 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -153,7 +153,7 @@ SYM_FUNC_START(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -436,7 +436,7 @@ SYM_FUNC_START(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - JMP_NOSPEC %ebx + JMP_NOSPEC ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) diff --git a/arch/x86/platform/efi/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S index 15da118f04f0..90380a17ab23 100644 --- a/arch/x86/platform/efi/efi_stub_64.S +++ b/arch/x86/platform/efi/efi_stub_64.S @@ -21,7 +21,7 @@ SYM_FUNC_START(__efi_call) mov %r8, %r9 mov %rcx, %r8 mov %rsi, %rcx - CALL_NOSPEC %rdi + CALL_NOSPEC rdi leave ret SYM_FUNC_END(__efi_call) -- cgit v1.2.3 From cc1ac9c792810b93783a7de344f428922af8d98c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Apr 2020 14:34:26 +0200 Subject: x86/retpoline: Fix retpoline unwind Currently objtool cannot understand retpolines, and thus cannot generate ORC unwind information for them. This means that we cannot unwind from the middle of a retpoline. 
The recent ANNOTATE_INTRA_FUNCTION_CALL and UNWIND_HINT_RET_OFFSET support in objtool enables it to understand the basic retpoline construct. A further problem is that the ORC unwind information is alternative invariant; IOW. every alternative should have the same ORC, retpolines obviously violate this. This means we need to out-of-line them. Since all GCC generated code already uses out-of-line retpolines, this should not affect performance much, if anything. This will enable objtool to generate valid ORC data for the out-of-line copies, which means we can correctly and reliably unwind through a retpoline. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lkml.kernel.org/r/20200428191700.210835357@infradead.org --- arch/x86/include/asm/asm-prototypes.h | 7 +++++ arch/x86/include/asm/nospec-branch.h | 56 +++++------------------------------ arch/x86/lib/retpoline.S | 26 ++++++++++++++-- 3 files changed, 38 insertions(+), 51 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index aa7585e1fe67..9bf2620ce817 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -21,8 +21,15 @@ extern void cmpxchg8b_emu(void); #define DECL_INDIRECT_THUNK(reg) \ extern asmlinkage void __x86_indirect_thunk_ ## reg (void); +#define DECL_RETPOLINE(reg) \ + extern asmlinkage void __x86_retpoline_ ## reg (void); + #undef GEN #define GEN(reg) DECL_INDIRECT_THUNK(reg) #include +#undef GEN +#define GEN(reg) DECL_RETPOLINE(reg) +#include + #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index d3269b69728a..d52d1aacdd97 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -12,15 +12,6 @@ #include #include -/* - * This should be used immediately before a retpoline alternative. It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - ANNOTATE_IGNORE_ALTERNATIVE - /* * Fill the CPU return stack buffer. * @@ -82,34 +73,6 @@ .popsection .endm -/* - * These are the bare retpoline primitives for indirect jmp and call. - * Do not use these directly; they only exist to make the ALTERNATIVE - * invocation below less ugly. - */ -.macro RETPOLINE_JMP reg:req - call .Ldo_rop_\@ -.Lspec_trap_\@: - pause - lfence - jmp .Lspec_trap_\@ -.Ldo_rop_\@: - mov \reg, (%_ASM_SP) - ret -.endm - -/* - * This is a wrapper around RETPOLINE_JMP so the called function in reg - * returns to the instruction after the macro. 
- */ -.macro RETPOLINE_CALL reg:req - jmp .Ldo_call_\@ -.Ldo_retpoline_jmp_\@: - RETPOLINE_JMP \reg -.Ldo_call_\@: - call .Ldo_retpoline_jmp_\@ -.endm - /* * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 @@ -117,10 +80,9 @@ */ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ - __stringify(RETPOLINE_JMP %\reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \ + __stringify(jmp __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), X86_FEATURE_RETPOLINE_AMD #else jmp *%\reg #endif @@ -128,10 +90,9 @@ .macro CALL_NOSPEC reg:req #ifdef CONFIG_RETPOLINE - ANNOTATE_NOSPEC_ALTERNATIVE - ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg),\ - __stringify(RETPOLINE_CALL %\reg), X86_FEATURE_RETPOLINE,\ - __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *%\reg), \ + __stringify(call __x86_retpoline_\reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *%\reg), X86_FEATURE_RETPOLINE_AMD #else call *%\reg #endif @@ -165,16 +126,16 @@ * which is ensured when CONFIG_RETPOLINE is defined. */ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ + "call __x86_retpoline_%V[thunk_target]\n", \ X86_FEATURE_RETPOLINE, \ "lfence;\n" \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ X86_FEATURE_RETPOLINE_AMD) + # define THUNK_TARGET(addr) [thunk_target] "r" (addr) #else /* CONFIG_X86_32 */ @@ -184,7 +145,6 @@ * here, anyway. 
*/ # define CALL_NOSPEC \ - ANNOTATE_NOSPEC_ALTERNATIVE \ ALTERNATIVE_2( \ ANNOTATE_RETPOLINE_SAFE \ "call *%[thunk_target]\n", \ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 9cc5480300c9..b4c43a9b1483 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -7,15 +7,31 @@ #include #include #include +#include +#include .macro THUNK reg .section .text.__x86.indirect_thunk + .align 32 SYM_FUNC_START(__x86_indirect_thunk_\reg) - CFI_STARTPROC - JMP_NOSPEC %\reg - CFI_ENDPROC + JMP_NOSPEC \reg SYM_FUNC_END(__x86_indirect_thunk_\reg) + +SYM_FUNC_START_NOALIGN(__x86_retpoline_\reg) + ANNOTATE_INTRA_FUNCTION_CALL + call .Ldo_rop_\@ +.Lspec_trap_\@: + UNWIND_HINT_EMPTY + pause + lfence + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov %\reg, (%_ASM_SP) + UNWIND_HINT_RET_OFFSET + ret +SYM_FUNC_END(__x86_retpoline_\reg) + .endm /* @@ -32,6 +48,7 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) #define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) #define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) +#define EXPORT_RETPOLINE(reg) __EXPORT_THUNK(__x86_retpoline_ ## reg) #undef GEN #define GEN(reg) THUNK reg @@ -41,3 +58,6 @@ SYM_FUNC_END(__x86_indirect_thunk_\reg) #define GEN(reg) EXPORT_THUNK(reg) #include +#undef GEN +#define GEN(reg) EXPORT_RETPOLINE(reg) +#include -- cgit v1.2.3 From fdc63ff0e49c588884992b4b2656345a5e878b32 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 8 Apr 2020 21:13:10 +0300 Subject: ftrace/x86: Fix trace event registration for syscalls without arguments The refactoring of SYSCALL_DEFINE0() macros removed the ABI stubs and simply defines __abi_sys_$NAME as alias of __do_sys_$NAME. As a result kallsyms_lookup() returns "__do_sys_$NAME" which does not match with the declared trace event name. See also commit 1c758a2202a6 ("tracing/x86: Update syscall trace events to handle new prefixed syscall func names"). Add __do_sys_ to the valid prefixes which are checked in arch_syscall_match_sym_name(). Fixes: d2b5de495ee9 ("x86/entry: Refactor SYSCALL_DEFINE0 macros") Signed-off-by: Konstantin Khlebnikov Signed-off-by: Thomas Gleixner Acked-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/158636958997.7900.16485049455470033557.stgit@buzz --- arch/x86/include/asm/ftrace.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 85be2f506272..70b96cae5b42 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -61,11 +61,12 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name { /* * Compare the symbol name with the system call name. Skip the - * "__x64_sys", "__ia32_sys" or simple "sys" prefix. + * "__x64_sys", "__ia32_sys", "__do_sys" or simple "sys" prefix. */ return !strcmp(sym + 3, name + 3) || (!strncmp(sym, "__x64_", 6) && !strcmp(sym + 9, name + 3)) || - (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)); + (!strncmp(sym, "__ia32_", 7) && !strcmp(sym + 10, name + 3)) || + (!strncmp(sym, "__do_sys", 8) && !strcmp(sym + 8, name + 3)); } #ifndef COMPILE_OFFSETS -- cgit v1.2.3 From bd1de2a7aace4d1d312fb1be264b8fafdb706208 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 3 May 2020 12:27:29 +0200 Subject: x86/tlb/uv: Add a forward declaration for struct flush_tlb_info MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... 
to fix these build warnings: In file included from ./arch/x86/include/asm/uv/uv_hub.h:22, from drivers/misc/sgi-gru/grukdump.c:16: ./arch/x86/include/asm/uv/uv.h:39:21: warning: ‘struct flush_tlb_info’ declared \ inside parameter list will not be visible outside of this definition or declaration 39 | const struct flush_tlb_info *info); | ^~~~~~~~~~~~~~ In file included from ./arch/x86/include/asm/uv/uv_hub.h:22, from drivers/misc/sgi-gru/grutlbpurge.c:28: ./arch/x86/include/asm/uv/uv.h:39:21: warning: ‘struct flush_tlb_info’ declared \ inside parameter list will not be visible outside of this definition or declaration 39 | const struct flush_tlb_info *info); | ^~~~~~~~~~~~~~ ... after bfe3d8f6313d ("x86/tlb: Restrict access to tlbstate") restricted access to tlbstate. Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200503103107.3419-1-bp@alien8.de --- arch/x86/include/asm/uv/uv.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 45ea95ce79b4..91e088ac6904 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -8,6 +8,7 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; struct cpumask; struct mm_struct; +struct flush_tlb_info; #ifdef CONFIG_X86_UV #include -- cgit v1.2.3 From 637543a8d61c6afe4e9be64bfb43c78701a83375 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Tue, 7 Apr 2020 01:13:09 -0500 Subject: KVM: x86: Fixes posted interrupt check for IRQs delivery modes Current logic incorrectly uses the enum ioapic_irq_destination_types to check the posted interrupt destination types. However, the value was set using APIC_DM_XXX macros, which are left-shifted by 8 bits. Fixes by using the APIC_DM_FIXED and APIC_DM_LOWEST instead. Fixes: (fdcf75621375 'KVM: x86: Disable posted interrupts for non-standard IRQs delivery modes') Cc: Alexander Graf Signed-off-by: Suravee Suthikulpanit Message-Id: <1586239989-58305-1-git-send-email-suravee.suthikulpanit@amd.com> Reviewed-by: Maxim Levitsky Tested-by: Maxim Levitsky Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 42a2d0d3984a..0dea9f122bb9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1663,8 +1663,8 @@ void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ - return (irq->delivery_mode == dest_Fixed || - irq->delivery_mode == dest_LowestPrio); + return (irq->delivery_mode == APIC_DM_FIXED || + irq->delivery_mode == APIC_DM_LOWEST); } static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From de8c55208c3865d0532466097b0244fbea1d9089 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 4 May 2020 11:02:48 -0400 Subject: efi/libstub: Fix mixed mode boot issue after macro refactor Commit 22090f84bc3f ("efi/libstub: unify EFI call wrappers for non-x86") refactored the macros that are used to provide wrappers for mixed-mode calls on x86, allowing us to boot a 64-bit kernel on 32-bit firmware. Unfortunately, this broke mixed mode boot due to the fact that efi_is_native() is not a macro on x86. 
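Roughly what went wrong (an illustrative sketch, not the exact code): the generic stub header guarded each default definition with #ifndef, which only works when the architecture override is itself a macro:

	/* arch/x86 (a static inline, so the #ifndef below does not see it): */
	static inline bool efi_is_native(void) { ... }

	/* generic efistub.h: */
	#ifndef efi_is_native
	#define efi_is_native()	(true)	/* silently overrides the x86 version */
	#endif
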
All of these macros should go together, so rather than testing each one to see if it is defined, condition the generic macro definitions on a new ARCH_HAS_EFISTUB_WRAPPERS, and remove the wrapper definitions on x86 as well if CONFIG_EFI_MIXED is not enabled. Fixes: 22090f84bc3f ("efi/libstub: unify EFI call wrappers for non-x86") Reported-by: Guenter Roeck Signed-off-by: Arvind Sankar Link: https://lore.kernel.org/r/20200504150248.62482-1-nivedita@alum.mit.edu Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 19 +++++++++++++++---- drivers/firmware/efi/libstub/efistub.h | 14 ++++---------- 2 files changed, 19 insertions(+), 14 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index cd0c3fbf6156..6b9ab0d8b2a7 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -225,13 +225,15 @@ efi_status_t efi_set_virtual_address_map(unsigned long memory_map_size, /* arch specific definitions used by the stub code */ -extern const bool efi_is64; +#ifdef CONFIG_EFI_MIXED + +#define ARCH_HAS_EFISTUB_WRAPPERS static inline bool efi_is_64bit(void) { - if (IS_ENABLED(CONFIG_EFI_MIXED)) - return efi_is64; - return IS_ENABLED(CONFIG_X86_64); + extern const bool efi_is64; + + return efi_is64; } static inline bool efi_is_native(void) @@ -356,6 +358,15 @@ static inline u32 efi64_convert_status(efi_status_t status) runtime), \ func, __VA_ARGS__)) +#else /* CONFIG_EFI_MIXED */ + +static inline bool efi_is_64bit(void) +{ + return IS_ENABLED(CONFIG_X86_64); +} + +#endif /* CONFIG_EFI_MIXED */ + extern bool efi_reboot_required(void); extern bool efi_is_table_address(unsigned long phys_addr); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 874233cf8820..4f10a09563f3 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -33,20 +33,14 @@ extern bool efi_novamap; extern const efi_system_table_t *efi_system_table; -#ifndef efi_bs_call +#ifndef ARCH_HAS_EFISTUB_WRAPPERS + +#define efi_is_native() (true) #define efi_bs_call(func, ...) efi_system_table->boottime->func(__VA_ARGS__) -#endif -#ifndef efi_rt_call #define efi_rt_call(func, ...) efi_system_table->runtime->func(__VA_ARGS__) -#endif -#ifndef efi_is_native -#define efi_is_native() (true) -#endif -#ifndef efi_table_attr #define efi_table_attr(inst, attr) (inst->attr) -#endif -#ifndef efi_call_proto #define efi_call_proto(inst, func, ...) inst->func(inst, ##__VA_ARGS__) + #endif #define efi_info(msg) do { \ -- cgit v1.2.3 From c3b3f52476412a3899f2c65b220075aceb18dd2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 5 May 2020 12:12:53 +0200 Subject: signal: refactor copy_siginfo_to_user32 Factor out a copy_siginfo_to_external32 helper from copy_siginfo_to_user32 that fills out the compat_siginfo, but does so on a kernel space data structure. With that we can let architectures override copy_siginfo_to_user32 with their own implementations using copy_siginfo_to_external32. That allows moving the x32 SIGCHLD purely to x86 architecture code. As a nice side effect copy_siginfo_to_external32 also comes in handy for avoiding a set_fs() call in the coredump code later on. Contains improvements from Eric W. Biederman and Arnd Bergmann . 
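As a rough illustration of that coredump use case (hypothetical caller, not part of this patch), the new helper does the conversion entirely on kernel memory, so no user-space access - and therefore no set_fs() - is required:

	struct compat_siginfo csi;

	copy_siginfo_to_external32(&csi, siginfo);
	/*
	 * csi is a plain kernel buffer now; it can be written out through
	 * the normal dump path (e.g. dump_emit()) later on.
	 */
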
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/x86/ia32/ia32_signal.c | 2 +- arch/x86/include/asm/compat.h | 8 +++- arch/x86/kernel/signal.c | 28 ++++++++++- include/linux/compat.h | 11 ++++- kernel/signal.c | 106 +++++++++++++++++++++--------------------- 5 files changed, 96 insertions(+), 59 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index f9d8804144d0..81cf22398cd1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -350,7 +350,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, unsafe_put_user(*(__u64 *)set, (__u64 *)&frame->uc.uc_sigmask, Efault); user_access_end(); - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, false)) + if (__copy_siginfo_to_user32(&frame->info, &ksig->info)) return -EFAULT; /* Set up registers for signal handler */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 52e9f3480f69..d4edf281fff4 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -214,7 +214,11 @@ static inline bool in_compat_syscall(void) #endif struct compat_siginfo; -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const kernel_siginfo_t *from, bool x32_ABI); + +#ifdef CONFIG_X86_X32_ABI +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const kernel_siginfo_t *from); +#define copy_siginfo_to_user32 copy_siginfo_to_user32 +#endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 83b74fb38c8f..f3df262e370b 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -37,6 +37,7 @@ #include #ifdef CONFIG_X86_64 +#include #include #include #endif /* CONFIG_X86_64 */ @@ -511,6 +512,31 @@ Efault: } #endif /* CONFIG_X86_32 */ +#ifdef CONFIG_X86_X32_ABI +static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); + if (from->si_signo == SIGCHLD) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + return 0; +} + +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + if (in_x32_syscall()) + return x32_copy_siginfo_to_user(to, from); + return __copy_siginfo_to_user32(to, from); +} +#endif /* CONFIG_X86_X32_ABI */ + static int x32_setup_rt_frame(struct ksignal *ksig, compat_sigset_t *set, struct pt_regs *regs) @@ -543,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, user_access_end(); if (ksig->ka.sa.sa_flags & SA_SIGINFO) { - if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true)) + if (x32_copy_siginfo_to_user(&frame->info, &ksig->info)) return -EFAULT; } diff --git a/include/linux/compat.h b/include/linux/compat.h index 0480ba4db592..e90100c0de72 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -402,8 +402,15 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size); -int copy_siginfo_from_user32(kernel_siginfo_t *to, const struct compat_siginfo __user *from); -int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); +void copy_siginfo_to_external32(struct 
compat_siginfo *to, + const struct kernel_siginfo *from); +int copy_siginfo_from_user32(kernel_siginfo_t *to, + const struct compat_siginfo __user *from); +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const kernel_siginfo_t *from); +#ifndef copy_siginfo_to_user32 +#define copy_siginfo_to_user32 __copy_siginfo_to_user32 +#endif int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); diff --git a/kernel/signal.c b/kernel/signal.c index e58a6c619824..2bc9458d40bd 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3235,94 +3235,94 @@ int copy_siginfo_from_user(kernel_siginfo_t *to, const siginfo_t __user *from) } #ifdef CONFIG_COMPAT -int copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} -int __copy_siginfo_to_user32(struct compat_siginfo __user *to, - const struct kernel_siginfo *from, bool x32_ABI) -#endif +/** + * copy_siginfo_to_external32 - copy a kernel siginfo into a compat user siginfo + * @to: compat siginfo destination + * @from: kernel siginfo source + * + * Note: This function does not work properly for the SIGCHLD on x32, but + * fortunately it doesn't have to. The only valid callers for this function are + * copy_siginfo_to_user32, which is overriden for x32 and the coredump code. + * The latter does not care because SIGCHLD will never cause a coredump. + */ +void copy_siginfo_to_external32(struct compat_siginfo *to, + const struct kernel_siginfo *from) { - struct compat_siginfo new; - memset(&new, 0, sizeof(new)); + memset(to, 0, sizeof(*to)); - new.si_signo = from->si_signo; - new.si_errno = from->si_errno; - new.si_code = from->si_code; + to->si_signo = from->si_signo; + to->si_errno = from->si_errno; + to->si_code = from->si_code; switch(siginfo_layout(from->si_signo, from->si_code)) { case SIL_KILL: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; break; case SIL_TIMER: - new.si_tid = from->si_tid; - new.si_overrun = from->si_overrun; - new.si_int = from->si_int; + to->si_tid = from->si_tid; + to->si_overrun = from->si_overrun; + to->si_int = from->si_int; break; case SIL_POLL: - new.si_band = from->si_band; - new.si_fd = from->si_fd; + to->si_band = from->si_band; + to->si_fd = from->si_fd; break; case SIL_FAULT: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif break; case SIL_FAULT_MCEERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_addr_lsb = from->si_addr_lsb; + to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_lower = ptr_to_compat(from->si_lower); - new.si_upper = ptr_to_compat(from->si_upper); + to->si_lower = ptr_to_compat(from->si_lower); + to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: - new.si_addr = ptr_to_compat(from->si_addr); + to->si_addr = ptr_to_compat(from->si_addr); #ifdef __ARCH_SI_TRAPNO - new.si_trapno = 
from->si_trapno; + to->si_trapno = from->si_trapno; #endif - new.si_pkey = from->si_pkey; + to->si_pkey = from->si_pkey; break; case SIL_CHLD: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_status = from->si_status; -#ifdef CONFIG_X86_X32_ABI - if (x32_ABI) { - new._sifields._sigchld_x32._utime = from->si_utime; - new._sifields._sigchld_x32._stime = from->si_stime; - } else -#endif - { - new.si_utime = from->si_utime; - new.si_stime = from->si_stime; - } + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_status = from->si_status; + to->si_utime = from->si_utime; + to->si_stime = from->si_stime; break; case SIL_RT: - new.si_pid = from->si_pid; - new.si_uid = from->si_uid; - new.si_int = from->si_int; + to->si_pid = from->si_pid; + to->si_uid = from->si_uid; + to->si_int = from->si_int; break; case SIL_SYS: - new.si_call_addr = ptr_to_compat(from->si_call_addr); - new.si_syscall = from->si_syscall; - new.si_arch = from->si_arch; + to->si_call_addr = ptr_to_compat(from->si_call_addr); + to->si_syscall = from->si_syscall; + to->si_arch = from->si_arch; break; } +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct kernel_siginfo *from) +{ + struct compat_siginfo new; + + copy_siginfo_to_external32(&new, from); if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) return -EFAULT; - return 0; } -- cgit v1.2.3 From 8dd97c65185c5a63c668e5bd8a861c04f47a35ed Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:12 -0700 Subject: x86/resctrl: Rename asm/resctrl_sched.h to asm/resctrl.h asm/resctrl_sched.h is dedicated to the code used for configuration of the CPU resource control state when a task is scheduled. Rename resctrl_sched.h to resctrl.h in preparation of additions that will no longer make this file dedicated to work done during scheduling. No functional change. 
Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/6914e0ef880b539a82a6d889f9423496d471ad1d.1588715690.git.reinette.chatre@intel.com --- MAINTAINERS | 2 +- arch/x86/include/asm/resctrl.h | 93 +++++++++++++++++++++++++++++++ arch/x86/include/asm/resctrl_sched.h | 93 ------------------------------- arch/x86/kernel/cpu/resctrl/core.c | 2 +- arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2 +- arch/x86/kernel/cpu/resctrl/rdtgroup.c | 2 +- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 8 files changed, 99 insertions(+), 99 deletions(-) create mode 100644 arch/x86/include/asm/resctrl.h delete mode 100644 arch/x86/include/asm/resctrl_sched.h (limited to 'arch/x86/include') diff --git a/MAINTAINERS b/MAINTAINERS index 2926327e4976..ecfb07ab49b0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14227,7 +14227,7 @@ M: Reinette Chatre L: linux-kernel@vger.kernel.org S: Supported F: Documentation/x86/resctrl* -F: arch/x86/include/asm/resctrl_sched.h +F: arch/x86/include/asm/resctrl.h F: arch/x86/kernel/cpu/resctrl/ F: tools/testing/selftests/resctrl/ diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h new file mode 100644 index 000000000000..99b5728e441d --- /dev/null +++ b/arch/x86/include/asm/resctrl.h @@ -0,0 +1,93 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_RESCTRL_H +#define _ASM_X86_RESCTRL_H + +#ifdef CONFIG_X86_CPU_RESCTRL + +#include +#include + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct resctrl_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct resctrl_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __resctrl_sched_in(void) +{ + struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. 
+ */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void resctrl_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __resctrl_sched_in(); +} + +#else + +static inline void resctrl_sched_in(void) {} + +#endif /* CONFIG_X86_CPU_RESCTRL */ + +#endif /* _ASM_X86_RESCTRL_H */ diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl_sched.h deleted file mode 100644 index f6b7fe2833cc..000000000000 --- a/arch/x86/include/asm/resctrl_sched.h +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_RESCTRL_SCHED_H -#define _ASM_X86_RESCTRL_SCHED_H - -#ifdef CONFIG_X86_CPU_RESCTRL - -#include -#include - -#define IA32_PQR_ASSOC 0x0c8f - -/** - * struct resctrl_pqr_state - State cache for the PQR MSR - * @cur_rmid: The cached Resource Monitoring ID - * @cur_closid: The cached Class Of Service ID - * @default_rmid: The user assigned Resource Monitoring ID - * @default_closid: The user assigned cached Class Of Service ID - * - * The upper 32 bits of IA32_PQR_ASSOC contain closid and the - * lower 10 bits rmid. The update to IA32_PQR_ASSOC always - * contains both parts, so we need to cache them. This also - * stores the user configured per cpu CLOSID and RMID. - * - * The cache also helps to avoid pointless updates if the value does - * not change. - */ -struct resctrl_pqr_state { - u32 cur_rmid; - u32 cur_closid; - u32 default_rmid; - u32 default_closid; -}; - -DECLARE_PER_CPU(struct resctrl_pqr_state, pqr_state); - -DECLARE_STATIC_KEY_FALSE(rdt_enable_key); -DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); -DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); - -/* - * __resctrl_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR - * - * Following considerations are made so that this has minimal impact - * on scheduler hot path: - * - This will stay as no-op unless we are running on an Intel SKU - * which supports resource control or monitoring and we enable by - * mounting the resctrl file system. - * - Caches the per cpu CLOSid/RMID values and does the MSR write only - * when a task with a different CLOSid/RMID is scheduled in. - * - We allocate RMIDs/CLOSids globally in order to keep this as - * simple as possible. - * Must be called with preemption disabled. - */ -static void __resctrl_sched_in(void) -{ - struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state); - u32 closid = state->default_closid; - u32 rmid = state->default_rmid; - - /* - * If this task has a closid/rmid assigned, use it. - * Else use the closid/rmid assigned to this cpu. 
- */ - if (static_branch_likely(&rdt_alloc_enable_key)) { - if (current->closid) - closid = current->closid; - } - - if (static_branch_likely(&rdt_mon_enable_key)) { - if (current->rmid) - rmid = current->rmid; - } - - if (closid != state->cur_closid || rmid != state->cur_rmid) { - state->cur_closid = closid; - state->cur_rmid = rmid; - wrmsr(IA32_PQR_ASSOC, rmid, closid); - } -} - -static inline void resctrl_sched_in(void) -{ - if (static_branch_likely(&rdt_enable_key)) - __resctrl_sched_in(); -} - -#else - -static inline void resctrl_sched_in(void) {} - -#endif /* CONFIG_X86_CPU_RESCTRL */ - -#endif /* _ASM_X86_RESCTRL_SCHED_H */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d8cc5223b7ce..6f38c88226af 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include "internal.h" /* Mutex to protect rdtgroup access. */ diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index d7623e1b927d..4bd28b388a1a 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include "../../events/perf_event.h" /* For X86_CONFIG() */ diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 5a359d9fcc05..6276ae015945 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -29,7 +29,7 @@ #include -#include +#include #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 954b013cc585..538d4e8d6589 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include "process.h" diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5ef9d8f25b0e..0c169a5687e1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #include #ifdef CONFIG_IA32_EMULATION -- cgit v1.2.3 From 0118ad82c2a64ebcf15d7565ed35361407efadfa Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:13 -0700 Subject: x86/cpu: Move resctrl CPUID code to resctrl/ The function determining a platform's support and properties of cache occupancy and memory bandwidth monitoring (properties of X86_FEATURE_CQM_LLC) can be found among the common CPU code. After the feature's properties is populated in the per-CPU data the resctrl subsystem is the only consumer (via boot_cpu_data). Move the function that obtains the CPU information used by resctrl to the resctrl subsystem and rename it from init_cqm() to resctrl_cpu_detect(). The function continues to be called from the common CPU code. This move is done in preparation of the addition of some vendor specific code. No functional change. 
Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/38433b99f9d16c8f4ee796f8cc42b871531fa203.1588715690.git.reinette.chatre@intel.com --- arch/x86/include/asm/resctrl.h | 3 +++ arch/x86/kernel/cpu/common.c | 27 ++------------------------- arch/x86/kernel/cpu/resctrl/core.c | 24 ++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 25 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h index 99b5728e441d..07603064df8f 100644 --- a/arch/x86/include/asm/resctrl.h +++ b/arch/x86/include/asm/resctrl.h @@ -84,9 +84,12 @@ static inline void resctrl_sched_in(void) __resctrl_sched_in(); } +void resctrl_cpu_detect(struct cpuinfo_x86 *c); + #else static inline void resctrl_sched_in(void) {} +static inline void resctrl_cpu_detect(struct cpuinfo_x86 *c) {} #endif /* CONFIG_X86_CPU_RESCTRL */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index bed0cb83fe24..a8f0f22ee5c1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "cpu.h" @@ -854,30 +855,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c) } } -static void init_cqm(struct cpuinfo_x86 *c) -{ - if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { - c->x86_cache_max_rmid = -1; - c->x86_cache_occ_scale = -1; - return; - } - - /* will be overridden if occupancy monitoring exists */ - c->x86_cache_max_rmid = cpuid_ebx(0xf); - - if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || - cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || - cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { - u32 eax, ebx, ecx, edx; - - /* QoS sub-leaf, EAX=0Fh, ECX=1 */ - cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); - - c->x86_cache_max_rmid = ecx; - c->x86_cache_occ_scale = ebx; - } -} - void get_cpu_cap(struct cpuinfo_x86 *c) { u32 eax, ebx, ecx, edx; @@ -945,7 +922,7 @@ void get_cpu_cap(struct cpuinfo_x86 *c) init_scattered_cpuid_features(c); init_speculation_control(c); - init_cqm(c); + resctrl_cpu_detect(c); /* * Clear/Set all flags overridden by options, after probe. diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index 6f38c88226af..861c6d1ba9ab 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -958,6 +958,30 @@ static __init void rdt_init_res_defs(void) static enum cpuhp_state rdt_online; +void resctrl_cpu_detect(struct cpuinfo_x86 *c) +{ + if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { + c->x86_cache_max_rmid = -1; + c->x86_cache_occ_scale = -1; + return; + } + + /* will be overridden if occupancy monitoring exists */ + c->x86_cache_max_rmid = cpuid_ebx(0xf); + + if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) || + cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) || + cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) { + u32 eax, ebx, ecx, edx; + + /* QoS sub-leaf, EAX=0Fh, ECX=1 */ + cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx); + + c->x86_cache_max_rmid = ecx; + c->x86_cache_occ_scale = ebx; + } +} + static int __init resctrl_late_init(void) { struct rdt_resource *r; -- cgit v1.2.3 From f3d44f18b0662327c42128b9d3604489bdb6e36f Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Tue, 5 May 2020 15:36:17 -0700 Subject: x86/resctrl: Support CPUID enumeration of MBM counter width The original Memory Bandwidth Monitoring (MBM) architectural definition defines counters of up to 62 bits in the IA32_QM_CTR MSR while the first-generation MBM implementation uses statically defined 24 bit counters. 
Expand the MBM CPUID enumeration properties to include the MBM counter width. The previously undefined EAX output register contains, in bits [7:0], the MBM counter width encoded as an offset from 24 bits. Enumerating this property is only specified for Intel CPUs. Suggested-by: Borislav Petkov Signed-off-by: Reinette Chatre Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/afa3af2f753f6bc301fb743bc8944e749cb24afa.1588715690.git.reinette.chatre@intel.com --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/resctrl/core.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3bcf27caf6c9..c4e8fd709cf6 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -113,9 +113,10 @@ struct cpuinfo_x86 { /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ - /* Cache QoS architectural values: */ + /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ + int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index d5979073301e..12f967c6b603 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -964,6 +964,7 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_CQM_LLC)) { c->x86_cache_max_rmid = -1; c->x86_cache_occ_scale = -1; + c->x86_cache_mbm_width_offset = -1; return; } @@ -980,6 +981,10 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) c->x86_cache_max_rmid = ecx; c->x86_cache_occ_scale = ebx; + if (c->x86_vendor == X86_VENDOR_INTEL) + c->x86_cache_mbm_width_offset = eax & 0xff; + else + c->x86_cache_mbm_width_offset = -1; } } -- cgit v1.2.3 From 4d5523cfd5d298c58743eb31c003886cfc856709 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 5 May 2020 07:33:20 -0400 Subject: KVM: x86: fix DR6 delivery for various cases of #DB injection Go through kvm_queue_exception_p so that the payload is correctly delivered through the exit qualification, and add a kvm_update_dr6 call to kvm_deliver_exception_payload that is needed on AMD. 
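In rough terms (an illustrative flow, not the literal code), the DR6 bits now travel as the exception payload and are only folded into the guest state at delivery time:

	kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);	/* dr6 recorded as the #DB payload */
	...
	kvm_deliver_exception_payload(vcpu);		/* at delivery: payload merged into
							 * vcpu->arch.dr6, then kvm_update_dr6() */
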
Reported-by: Peter Xu Reviewed-by: Peter Xu Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 8 ++------ arch/x86/kvm/x86.c | 11 ++++++----- 3 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0dea9f122bb9..8c247bcb037e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1449,6 +1449,7 @@ bool kvm_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); +void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c2c6335a998c..bb5a527e49d9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4677,12 +4677,10 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= dr6 | DR6_RTM; if (is_icebp(intr_info)) WARN_ON(!skip_emulated_instruction(vcpu)); - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); return 1; } kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; @@ -4936,9 +4934,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_DEBUG; return 0; } else { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= DR6_BD | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD); return 1; } } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d786c7d27ce5..109115c96897 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -104,6 +104,7 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); +static void kvm_update_dr6(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -473,6 +474,7 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * breakpoint), it is reserved and must be zero in DR6. 
*/ vcpu->arch.dr6 &= ~BIT(12); + kvm_update_dr6(vcpu); break; case PF_VECTOR: vcpu->arch.cr2 = payload; @@ -572,11 +574,12 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, - unsigned long payload) +void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, + unsigned long payload) { kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false); } +EXPORT_SYMBOL_GPL(kvm_queue_exception_p); static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code, unsigned long payload) @@ -6719,9 +6722,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.db); if (dr6 != 0) { - vcpu->arch.dr6 &= ~DR_TRAP_BITS; - vcpu->arch.dr6 |= dr6 | DR6_RTM; - kvm_queue_exception(vcpu, DB_VECTOR); + kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); *r = 1; return true; } -- cgit v1.2.3 From d8422f6bb052b44db923a28afd8ceaef63d38d26 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 5 May 2020 19:25:08 +0200 Subject: x86/cpu: Add a X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS() macro ... to match Intel family 6 CPUs with steppings. Signed-off-by: Borislav Petkov Reviewed-by: Mark Gross Link: https://lkml.kernel.org/r/20200506071516.25445-3-bp@alien8.de --- arch/x86/include/asm/cpu_device_id.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/cpu_device_id.h b/arch/x86/include/asm/cpu_device_id.h index 10426cd56dca..eb8fcede9e3b 100644 --- a/arch/x86/include/asm/cpu_device_id.h +++ b/arch/x86/include/asm/cpu_device_id.h @@ -160,6 +160,10 @@ #define X86_MATCH_INTEL_FAM6_MODEL(model, data) \ X86_MATCH_VENDOR_FAM_MODEL(INTEL, 6, INTEL_FAM6_##model, data) +#define X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(model, steppings, data) \ + X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, INTEL_FAM6_##model, \ + steppings, X86_FEATURE_ANY, data) + /* * Match specific microcode revisions. * -- cgit v1.2.3 From 30ad8db3a2c2e0121202342c6c2a48fc28937056 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:17 +0200 Subject: x86/platform/uv: Mark uv_bios_call() and uv_bios_call_irqsave() static Both functions are only used inside of bios_uv.c. 
Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-2-hch@lst.de --- arch/x86/include/asm/uv/bios.h | 6 ------ arch/x86/platform/uv/bios_uv.c | 9 ++++----- 2 files changed, 4 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index 389174eaec79..fc85dafa5e49 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -123,12 +123,6 @@ enum uv_memprotect { UV_MEMPROT_ALLOW_RW }; -/* - * bios calls have 6 parameters - */ -extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64); -extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); - extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *); extern s64 uv_bios_freq_base(u64, u64 *); extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int, diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index c60255da5a6c..ca287554e234 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -45,7 +45,8 @@ static s64 __uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } -s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) +static s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, + u64 a5) { s64 ret; @@ -57,10 +58,9 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) return ret; } -EXPORT_SYMBOL_GPL(uv_bios_call); -s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, - u64 a4, u64 a5) +static s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) { unsigned long bios_flags; s64 ret; @@ -77,7 +77,6 @@ s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } - long sn_partition_id; EXPORT_SYMBOL_GPL(sn_partition_id); long sn_coherency_id; -- cgit v1.2.3 From 32988cfd579f7912aeb9a66bf44ca4ce0fa860f1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:18 +0200 Subject: x86/platform/uv: Remove the uv_partition_coherence_id() macro uv_partition_coherence_id() is only used once. Just open code it in the only user. 
Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-3-hch@lst.de --- arch/x86/include/asm/uv/bios.h | 1 - arch/x86/platform/uv/uv_sysfs.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h index fc85dafa5e49..2fcc3ac12e76 100644 --- a/arch/x86/include/asm/uv/bios.h +++ b/arch/x86/include/asm/uv/bios.h @@ -140,7 +140,6 @@ extern long sn_partition_id; extern long sn_coherency_id; extern long sn_region_size; extern long system_serial_number; -#define uv_partition_coherence_id() (sn_coherency_id) extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ diff --git a/arch/x86/platform/uv/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c index 62214731fea5..266773e2fb37 100644 --- a/arch/x86/platform/uv/uv_sysfs.c +++ b/arch/x86/platform/uv/uv_sysfs.c @@ -21,7 +21,7 @@ static ssize_t partition_id_show(struct kobject *kobj, static ssize_t coherence_id_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%ld\n", uv_partition_coherence_id()); + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_coherency_id); } static struct kobj_attribute partition_id_attr = -- cgit v1.2.3 From cc1991058705a9cc4fe51ab6c2116df17922ef82 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:21 +0200 Subject: x86/platform/uv: Remove the UV*_HUB_IS_SUPPORTED macros All of the macros are always defined to one. Remove them and the dead code keyed off them. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-6-hch@lst.de --- arch/x86/include/asm/uv/uv_hub.h | 19 ------------------- arch/x86/include/asm/uv/uv_mmrs.h | 7 ------- 2 files changed, 26 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 950cd1395d5d..a998e65e6a58 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -244,51 +244,32 @@ static inline int uv_hub_info_check(int version) #define UV4_HUB_REVISION_BASE 7 #define UV4A_HUB_REVISION_BASE 8 /* UV4 (fixed) rev 2 */ -/* WARNING: UVx_HUB_IS_SUPPORTED defines are deprecated and will be removed */ static inline int is_uv1_hub(void) { -#ifdef UV1_HUB_IS_SUPPORTED return is_uv_hubbed(uv(1)); -#else - return 0; -#endif } static inline int is_uv2_hub(void) { -#ifdef UV2_HUB_IS_SUPPORTED return is_uv_hubbed(uv(2)); -#else - return 0; -#endif } static inline int is_uv3_hub(void) { -#ifdef UV3_HUB_IS_SUPPORTED return is_uv_hubbed(uv(3)); -#else - return 0; -#endif } /* First test "is UV4A", then "is UV4" */ static inline int is_uv4a_hub(void) { -#ifdef UV4A_HUB_IS_SUPPORTED if (is_uv_hubbed(uv(4))) return (uv_hub_info->hub_revision == UV4A_HUB_REVISION_BASE); -#endif return 0; } static inline int is_uv4_hub(void) { -#ifdef UV4_HUB_IS_SUPPORTED return is_uv_hubbed(uv(4)); -#else - return 0; -#endif } static inline int is_uvx_hub(void) diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h index 62c79e26a59a..9ee5ed6e8b34 100644 --- a/arch/x86/include/asm/uv/uv_mmrs.h +++ b/arch/x86/include/asm/uv/uv_mmrs.h @@ -99,13 +99,6 @@ #define UV3_HUB_PART_NUMBER_X 0x4321 #define UV4_HUB_PART_NUMBER 0x99a1 -/* Compat: Indicate which UV Hubs are supported. 
*/ -#define UV1_HUB_IS_SUPPORTED 1 -#define UV2_HUB_IS_SUPPORTED 1 -#define UV3_HUB_IS_SUPPORTED 1 -#define UV4_HUB_IS_SUPPORTED 1 -#define UV4A_HUB_IS_SUPPORTED 1 - /* Error function to catch undefined references */ extern unsigned long uv_undefined(char *str); -- cgit v1.2.3 From e4dd8b8351264fb5bef30c0a77e4d171e0603d63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:22 +0200 Subject: x86/platform/uv: Mark is_uv_hubless() static is_uv_hubless() is only used in x2apic_uv_x.c. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-7-hch@lst.de --- arch/x86/include/asm/uv/uv.h | 2 -- arch/x86/kernel/apic/x2apic_uv_x.c | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 45ea95ce79b4..ae587ce544f4 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -31,7 +31,6 @@ static inline bool is_early_uv_system(void) } extern int is_uv_system(void); extern int is_uv_hubbed(int uvtype); -extern int is_uv_hubless(int uvtype); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -44,7 +43,6 @@ static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline bool is_early_uv_system(void) { return 0; } static inline int is_uv_system(void) { return 0; } static inline int is_uv_hubbed(int uv) { return 0; } -static inline int is_uv_hubless(int uv) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index ad53b2abc859..cb07a98771f9 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -385,11 +385,10 @@ int is_uv_hubbed(int uvtype) } EXPORT_SYMBOL_GPL(is_uv_hubbed); -int is_uv_hubless(int uvtype) +static int is_uv_hubless(int uvtype) { return (uv_hubless_system & uvtype); } -EXPORT_SYMBOL_GPL(is_uv_hubless); void **__uv_hub_info_list; EXPORT_SYMBOL_GPL(__uv_hub_info_list); -- cgit v1.2.3 From 8e77554580250f1185cffd8d3ac6a9b01de05d60 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:24 +0200 Subject: x86/platform/uv: Simplify uv_send_IPI_one() Merge two helpers only used by uv_send_IPI_one() into the main function. 
Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-9-hch@lst.de --- arch/x86/include/asm/uv/uv_hub.h | 20 -------------------- arch/x86/kernel/apic/x2apic_uv_x.c | 19 ++++++++++++++----- 2 files changed, 14 insertions(+), 25 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index a998e65e6a58..8a25d95cdf20 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -837,26 +837,6 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) } extern unsigned int uv_apicid_hibits; -static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) -{ - apicid |= uv_apicid_hibits; - return (1UL << UVH_IPI_INT_SEND_SHFT) | - ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | - (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | - (vector << UVH_IPI_INT_VECTOR_SHFT); -} - -static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) -{ - unsigned long val; - unsigned long dmode = dest_Fixed; - - if (vector == NMI_VECTOR) - dmode = dest_NMI; - - val = uv_hub_ipi_value(apicid, vector, dmode); - uv_write_global_mmr64(pnode, UVH_IPI_INT, val); -} /* * Get the minimum revision number of the hub chips within the partition. diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f1a0142e2731..3830538095e6 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -588,12 +588,21 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip) static void uv_send_IPI_one(int cpu, int vector) { - unsigned long apicid; - int pnode; + unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu); + int pnode = uv_apicid_to_pnode(apicid); + unsigned long dmode, val; + + if (vector == NMI_VECTOR) + dmode = dest_NMI; + else + dmode = dest_Fixed; - apicid = per_cpu(x86_cpu_to_apicid, cpu); - pnode = uv_apicid_to_pnode(apicid); - uv_hub_send_ipi(pnode, apicid, vector); + val = (1UL << UVH_IPI_INT_SEND_SHFT) | + ((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) | + (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | + (vector << UVH_IPI_INT_VECTOR_SHFT); + + uv_write_global_mmr64(pnode, UVH_IPI_INT, val); } static void uv_send_IPI_mask(const struct cpumask *mask, int vector) -- cgit v1.2.3 From fbe1d37866d249b6936cf526592957725a941f95 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:25 +0200 Subject: x86/platform/uv: Remove _uv_hub_info_check() Neither this functions nor the helpers used to implement it are used anywhere in the kernel tree. 
Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-10-hch@lst.de --- arch/x86/include/asm/uv/uv_hub.h | 14 -------------- arch/x86/kernel/apic/x2apic_uv_x.c | 6 ------ 2 files changed, 20 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 8a25d95cdf20..9eabd7e52fd1 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -219,20 +219,6 @@ static inline struct uv_hub_info_s *uv_cpu_hub_info(int cpu) return (struct uv_hub_info_s *)uv_cpu_info_per(cpu)->p_uv_hub_info; } -#define UV_HUB_INFO_VERSION 0x7150 -extern int uv_hub_info_version(void); -static inline int uv_hub_info_check(int version) -{ - if (uv_hub_info_version() == version) - return 0; - - pr_crit("UV: uv_hub_info version(%x) mismatch, expecting(%x)\n", - uv_hub_info_version(), version); - - BUG(); /* Catastrophic - cannot continue on unknown UV system */ -} -#define _uv_hub_info_check() uv_hub_info_check(UV_HUB_INFO_VERSION) - /* * HUB revision ranges for each UV HUB architecture. * This is a software convention - NOT the hardware revision numbers in diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 3830538095e6..8cf0e24cf883 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -415,12 +415,6 @@ static __initdata struct uv_gam_range_s *_gr_table; #define SOCK_EMPTY ((unsigned short)~0) -extern int uv_hub_info_version(void) -{ - return UV_HUB_INFO_VERSION; -} -EXPORT_SYMBOL(uv_hub_info_version); - /* Default UV memory block size is 2GB */ static unsigned long mem_block_size __initdata = (2UL << 30); -- cgit v1.2.3 From 2981cf836127c2d2d68399d456d2c22688d520b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 4 May 2020 19:15:27 +0200 Subject: x86/platform/uv: Remove the unused _uv_cpu_blade_processor_id() macro No users anywhere in the kernel tree. Signed-off-by: Christoph Hellwig Signed-off-by: Thomas Gleixner Not-acked-by: Dimitri Sivanich Cc: Russ Anderson Link: https://lkml.kernel.org/r/20200504171527.2845224-12-hch@lst.de --- arch/x86/include/asm/uv/uv_hub.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 9eabd7e52fd1..60ca0afdeaf9 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -659,7 +659,6 @@ static inline int uv_cpu_blade_processor_id(int cpu) { return uv_cpu_info_per(cpu)->blade_cpu_id; } -#define _uv_cpu_blade_processor_id 1 /* indicate function available */ /* Blade number to Node number (UV1..UV4 is 1:1) */ static inline int uv_blade_to_node(int blade) -- cgit v1.2.3 From e8824890249355656968d8846908a313fe231f11 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 24 Apr 2020 12:37:54 -0700 Subject: x86/delay: Preparatory code cleanup The naming conventions in the delay code are confusing at best. All delay variants use a loops argument and or variable which originates from the original delay_loop() implementation. But all variants except delay_loop() are based on TSC cycles. Rename the argument to cycles and make it type u64 to avoid these weird expansions to u64 in the functions. Rename MWAITX_MAX_LOOPS to MWAITX_MAX_WAIT_CYCLES for the same reason and fixup the comment of delay_mwaitx() as well. 
Mark the delay_fn function pointer __ro_after_init and fixup the comment for it. No functional change and preparation for the upcoming TPAUSE based delay variant. [ Kyung Min Park: Added __init to use_tsc_delay() ] Signed-off-by: Thomas Gleixner Signed-off-by: Kyung Min Park Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/1587757076-30337-2-git-send-email-kyung.min.park@intel.com --- arch/x86/include/asm/delay.h | 3 ++- arch/x86/include/asm/mwait.h | 2 +- arch/x86/lib/delay.c | 45 +++++++++++++++++++++++--------------------- 3 files changed, 27 insertions(+), 23 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index de9e7841f953..9aa38de7bd72 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -3,8 +3,9 @@ #define _ASM_X86_DELAY_H #include +#include -void use_tsc_delay(void); +void __init use_tsc_delay(void); void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index b809f117f3f4..a43b35b35049 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -20,7 +20,7 @@ #define MWAIT_ECX_INTERRUPT_BREAK 0x1 #define MWAITX_ECX_TIMER_ENABLE BIT(1) -#define MWAITX_MAX_LOOPS ((u32)-1) +#define MWAITX_MAX_WAIT_CYCLES UINT_MAX #define MWAITX_DISABLE_CSTATES 0xf0 u32 get_umwait_control_msr(void); diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index c126571e5e2e..887d52d5a7cc 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -27,9 +27,19 @@ # include #endif +static void delay_loop(u64 __loops); + +/* + * Calibration and selection of the delay mechanism happens only once + * during boot. + */ +static void (*delay_fn)(u64) __ro_after_init = delay_loop; + /* simple loop based delay: */ -static void delay_loop(unsigned long loops) +static void delay_loop(u64 __loops) { + unsigned long loops = (unsigned long)__loops; + asm volatile( " test %0,%0 \n" " jz 3f \n" @@ -49,9 +59,9 @@ static void delay_loop(unsigned long loops) } /* TSC based delay: */ -static void delay_tsc(unsigned long __loops) +static void delay_tsc(u64 cycles) { - u64 bclock, now, loops = __loops; + u64 bclock, now; int cpu; preempt_disable(); @@ -59,7 +69,7 @@ static void delay_tsc(unsigned long __loops) bclock = rdtsc_ordered(); for (;;) { now = rdtsc_ordered(); - if ((now - bclock) >= loops) + if ((now - bclock) >= cycles) break; /* Allow RT tasks to run */ @@ -77,7 +87,7 @@ static void delay_tsc(unsigned long __loops) * counter for this CPU. */ if (unlikely(cpu != smp_processor_id())) { - loops -= (now - bclock); + cycles -= (now - bclock); cpu = smp_processor_id(); bclock = rdtsc_ordered(); } @@ -87,24 +97,24 @@ static void delay_tsc(unsigned long __loops) /* * On some AMD platforms, MWAITX has a configurable 32-bit timer, that - * counts with TSC frequency. The input value is the loop of the - * counter, it will exit when the timer expires. + * counts with TSC frequency. The input value is the number of TSC cycles + * to wait. MWAITX will also exit when the timer expires. */ -static void delay_mwaitx(unsigned long __loops) +static void delay_mwaitx(u64 cycles) { - u64 start, end, delay, loops = __loops; + u64 start, end, delay; /* * Timer value of 0 causes MWAITX to wait indefinitely, unless there * is a store on the memory monitored by MONITORX. 
*/ - if (loops == 0) + if (!cycles) return; start = rdtsc_ordered(); for (;;) { - delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles); /* * Use cpu_tss_rw as a cacheline-aligned, seldomly @@ -121,22 +131,15 @@ static void delay_mwaitx(unsigned long __loops) end = rdtsc_ordered(); - if (loops <= end - start) + if (cycles <= end - start) break; - loops -= end - start; - + cycles -= end - start; start = end; } } -/* - * Since we calibrate only once at boot, this - * function should be set once at boot and not changed - */ -static void (*delay_fn)(unsigned long) = delay_loop; - -void use_tsc_delay(void) +void __init use_tsc_delay(void) { if (delay_fn == delay_loop) delay_fn = delay_tsc; -- cgit v1.2.3 From cec5f268cd02d25d2d74807843d8ae0292fe0fb7 Mon Sep 17 00:00:00 2001 From: Kyung Min Park Date: Fri, 24 Apr 2020 12:37:56 -0700 Subject: x86/delay: Introduce TPAUSE delay TPAUSE instructs the processor to enter an implementation-dependent optimized state. The instruction execution wakes up when the time-stamp counter reaches or exceeds the implicit EDX:EAX 64-bit input value. The instruction execution also wakes up due to the expiration of the operating system time-limit or by an external interrupt or exceptions such as a debug exception or a machine check exception. TPAUSE offers a choice of two lower power states: 1. Light-weight power/performance optimized state C0.1 2. Improved power/performance optimized state C0.2 This way, it can save power with low wake-up latency in comparison to spinloop based delay. The selection between the two is governed by the input register. TPAUSE is available on processors with X86_FEATURE_WAITPKG. Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Kyung Min Park Signed-off-by: Thomas Gleixner Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/1587757076-30337-4-git-send-email-kyung.min.park@intel.com --- arch/x86/Kconfig.assembler | 4 ++++ arch/x86/include/asm/delay.h | 1 + arch/x86/include/asm/mwait.h | 22 ++++++++++++++++++++++ arch/x86/kernel/time.c | 3 +++ arch/x86/lib/delay.c | 27 +++++++++++++++++++++++++++ 5 files changed, 57 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler index 13de0db38d4e..26b8c08e2fc4 100644 --- a/arch/x86/Kconfig.assembler +++ b/arch/x86/Kconfig.assembler @@ -15,3 +15,7 @@ config AS_SHA256_NI def_bool $(as-instr,sha256msg1 %xmm0$(comma)%xmm1) help Supported by binutils >= 2.24 and LLVM integrated assembler +config AS_TPAUSE + def_bool $(as-instr,tpause %ecx) + help + Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7 diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 9aa38de7bd72..630891d25819 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -6,6 +6,7 @@ #include void __init use_tsc_delay(void); +void __init use_tpause_delay(void); void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index a43b35b35049..73d997aa2966 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -22,6 +22,8 @@ #define MWAITX_ECX_TIMER_ENABLE BIT(1) #define MWAITX_MAX_WAIT_CYCLES UINT_MAX #define MWAITX_DISABLE_CSTATES 0xf0 +#define TPAUSE_C01_STATE 1 +#define TPAUSE_C02_STATE 0 u32 get_umwait_control_msr(void); @@ -122,4 +124,24 @@ static inline void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) current_clr_polling(); } +/* + * 
Caller can specify whether to enter C0.1 (low latency, less + * power saving) or C0.2 state (saves more power, but longer wakeup + * latency). This may be overridden by the IA32_UMWAIT_CONTROL MSR + * which can force requests for C0.2 to be downgraded to C0.1. + */ +static inline void __tpause(u32 ecx, u32 edx, u32 eax) +{ + /* "tpause %ecx, %edx, %eax;" */ + #ifdef CONFIG_AS_TPAUSE + asm volatile("tpause %%ecx\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #else + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf1\t\n" + : + : "c"(ecx), "d"(edx), "a"(eax)); + #endif +} + #endif /* _ASM_X86_MWAIT_H */ diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 106e7f87f534..371a6b348e44 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -103,6 +103,9 @@ static __init void x86_late_time_init(void) */ x86_init.irqs.intr_mode_init(); tsc_init(); + + if (static_cpu_has(X86_FEATURE_WAITPKG)) + use_tpause_delay(); } /* diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index fe91dc171cf8..65d15df6212d 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -96,6 +96,27 @@ static void delay_tsc(u64 cycles) preempt_enable(); } +/* + * On Intel the TPAUSE instruction waits until any of: + * 1) the TSC counter exceeds the value provided in EDX:EAX + * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded + * 3) an external interrupt occurs + */ +static void delay_halt_tpause(u64 start, u64 cycles) +{ + u64 until = start + cycles; + u32 eax, edx; + + eax = lower_32_bits(until); + edx = upper_32_bits(until); + + /* + * Hard code the deeper (C0.2) sleep state because exit latency is + * small compared to the "microseconds" that usleep() will delay. + */ + __tpause(TPAUSE_C02_STATE, edx, eax); +} + /* * On some AMD platforms, MWAITX has a configurable 32-bit timer, that * counts with TSC frequency. The input value is the number of TSC cycles @@ -156,6 +177,12 @@ void __init use_tsc_delay(void) delay_fn = delay_tsc; } +void __init use_tpause_delay(void) +{ + delay_halt_fn = delay_halt_tpause; + delay_fn = delay_halt; +} + void use_mwaitx_delay(void) { delay_halt_fn = delay_halt_mwaitx; -- cgit v1.2.3 From 5679b803e44ed8947e8c2a7f44cdef1d93ea24d5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 4 May 2020 11:28:25 -0400 Subject: KVM: SVM: keep DR6 synchronized with vcpu->arch.dr6 kvm_x86_ops.set_dr6 is only ever called with vcpu->arch.dr6 as the second argument. Ensure that the VMCB value is synchronized to vcpu->arch.dr6 on #DB (both "normal" and nested) and nested vmentry, so that the current value of DR6 is always available in vcpu->arch.dr6. The get_dr6 callback can just access vcpu->arch.dr6 and becomes redundant. 
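As a reader's summary (illustrative only, not part of the patch; each line corresponds to a hunk below), the places that now keep vcpu->arch.dr6 up to date on SVM are:

    db_interception():           vcpu->arch.dr6 = svm->vmcb->save.dr6;
    nested_svm_intercept_db():   svm->vcpu.arch.dr6 = dr6;  /* dr6 read from vmcb->save.dr6 */
    enter_svm_guest_mode():      svm->vcpu.arch.dr6 = nested_vmcb->save.dr6;
                                 kvm_update_dr6(&svm->vcpu);
    svm_sync_dirty_debug_regs(): vcpu->arch.dr6 = svm->vmcb->save.dr6;

With those in place, kvm_get_dr() can return vcpu->arch.dr6 unconditionally and the get_dr6 callback is dropped from both SVM and VMX.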
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/nested.c | 24 ++++++++++++++++-------- arch/x86/kvm/svm/svm.c | 9 ++------- arch/x86/kvm/vmx/vmx.c | 6 ------ arch/x86/kvm/x86.c | 8 ++------ 5 files changed, 21 insertions(+), 28 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 8c247bcb037e..de0c28814348 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1093,7 +1093,6 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); - u64 (*get_dr6)(struct kvm_vcpu *vcpu); void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); @@ -1624,6 +1623,7 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +void kvm_update_dr6(struct kvm_vcpu *vcpu); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index adab5b1c5fe1..9cfa8098995e 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -19,6 +19,7 @@ #include #include +#include #include "kvm_emulate.h" #include "trace.h" @@ -267,7 +268,8 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->save.rsp = nested_vmcb->save.rsp; svm->vmcb->save.rip = nested_vmcb->save.rip; svm->vmcb->save.dr7 = nested_vmcb->save.dr7; - svm->vmcb->save.dr6 = nested_vmcb->save.dr6; + svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; + kvm_update_dr6(&svm->vcpu); svm->vmcb->save.cpl = nested_vmcb->save.cpl; svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; @@ -482,7 +484,7 @@ int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.rsp = vmcb->save.rsp; nested_vmcb->save.rax = vmcb->save.rax; nested_vmcb->save.dr7 = vmcb->save.dr7; - nested_vmcb->save.dr6 = vmcb->save.dr6; + nested_vmcb->save.dr6 = svm->vcpu.arch.dr6; nested_vmcb->save.cpl = vmcb->save.cpl; nested_vmcb->control.int_ctl = vmcb->control.int_ctl; @@ -606,7 +608,7 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) /* DB exceptions for our internal use must not cause vmexit */ static int nested_svm_intercept_db(struct vcpu_svm *svm) { - unsigned long dr6; + unsigned long dr6 = svm->vmcb->save.dr6; /* Always catch it and pass it to userspace if debugging. 
*/ if (svm->vcpu.guest_debug & @@ -615,22 +617,28 @@ static int nested_svm_intercept_db(struct vcpu_svm *svm) /* if we're not singlestepping, it's not ours */ if (!svm->nmi_singlestep) - return NESTED_EXIT_DONE; + goto reflected_db; /* if it's not a singlestep exception, it's not ours */ - if (kvm_get_dr(&svm->vcpu, 6, &dr6)) - return NESTED_EXIT_DONE; if (!(dr6 & DR6_BS)) - return NESTED_EXIT_DONE; + goto reflected_db; /* if the guest is singlestepping, it should get the vmexit */ if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { disable_nmi_singlestep(svm); - return NESTED_EXIT_DONE; + goto reflected_db; } /* it's ours, the nested hypervisor must not see this one */ return NESTED_EXIT_HOST; + +reflected_db: + /* + * Synchronize guest DR6 here just like in db_interception; it will + * be moved into the nested VMCB by nested_svm_vmexit. + */ + svm->vcpu.arch.dr6 = dr6; + return NESTED_EXIT_DONE; } static int nested_svm_intercept_ioio(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 38f6aeefeb55..f03bffafd9e6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1672,11 +1672,6 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) mark_dirty(svm->vmcb, VMCB_ASID); } -static u64 svm_get_dr6(struct kvm_vcpu *vcpu) -{ - return to_svm(vcpu)->vmcb->save.dr6; -} - static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1693,7 +1688,7 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); - vcpu->arch.dr6 = svm_get_dr6(vcpu); + vcpu->arch.dr6 = svm->vmcb->save.dr6; vcpu->arch.dr7 = svm->vmcb->save.dr7; vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; @@ -1739,6 +1734,7 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { + vcpu->arch.dr6 = svm->vmcb->save.dr6; kvm_queue_exception(&svm->vcpu, DB_VECTOR); return 1; } @@ -3931,7 +3927,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .get_dr6 = svm_get_dr6, .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 2384a2dbec44..6153a47109d3 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4965,11 +4965,6 @@ static int handle_dr(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) -{ - return vcpu->arch.dr6; -} - static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) { } @@ -7736,7 +7731,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .get_dr6 = vmx_get_dr6, .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f7628555f036..4ea644827b8a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -104,7 +104,6 @@ static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS; KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) static void update_cr8_intercept(struct kvm_vcpu *vcpu); -static void kvm_update_dr6(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void enter_smm(struct kvm_vcpu *vcpu); static void 
__kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); @@ -1048,7 +1047,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) } } -static void kvm_update_dr6(struct kvm_vcpu *vcpu) +void kvm_update_dr6(struct kvm_vcpu *vcpu) { if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6); @@ -1129,10 +1128,7 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val) case 4: /* fall through */ case 6: - if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) - *val = vcpu->arch.dr6; - else - *val = kvm_x86_ops.get_dr6(vcpu); + *val = vcpu->arch.dr6; break; case 5: /* fall through */ -- cgit v1.2.3 From d67668e9dd76d98136048935723947156737932b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 6 May 2020 06:40:04 -0400 Subject: KVM: x86, SVM: isolate vcpu->arch.dr6 from vmcb->save.dr6 There are two issues with KVM_EXIT_DEBUG on AMD, whose root cause is the different handling of DR6 on intercepted #DB exceptions on Intel and AMD. On Intel, #DB exceptions transmit the DR6 value via the exit qualification field of the VMCS, and the exit qualification only contains the description of the precise event that caused a vmexit. On AMD, instead the DR6 field of the VMCB is filled in as if the #DB exception was to be injected into the guest. This has two effects when guest debugging is in use: * the guest DR6 is clobbered * the kvm_run->debug.arch.dr6 field can accumulate more debug events, rather than just the last one that happened (the testcase in the next patch covers this issue). This patch fixes both issues by emulating, so to speak, the Intel behavior on AMD processors. The important observation is that (after the previous patches) the VMCB value of DR6 is only ever observable from the guest is KVM_DEBUGREG_WONT_EXIT is set. Therefore we can actually set vmcb->save.dr6 to any value we want as long as KVM_DEBUGREG_WONT_EXIT is clear, which it will be if guest debugging is enabled. Therefore it is possible to enter the guest with an all-zero DR6, reconstruct the #DB payload from the DR6 we get at exit time, and let kvm_deliver_exception_payload move the newly set bits into vcpu->arch.dr6. Some extra bits may be included in the payload if KVM_DEBUGREG_WONT_EXIT is set, but this is harmless. This may not be the most optimized way to deal with this, but it is simple and, being confined within SVM code, it gets rid of the set_dr6 callback and kvm_update_dr6. 
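As an explanatory sketch (the lines are taken from the hunks below, not additional code): unless the guest has direct access to the debug registers (KVM_DEBUGREG_WONT_EXIT), the guest is entered with dr6 == DR6_FIXED_1 | DR6_RTM, so after an intercepted #DB every extra bit in vmcb->save.dr6 describes exactly that event; when the exception is destined for the guest it is requeued with a payload:

    u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1;

    kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload);

The DR6_FIXED_1 bits are always set and carry no information, so they are masked out. DR6_RTM is XORed rather than masked because that bit has inverted polarity in hardware (it reads as 1 unless the #DB was taken inside an RTM region), whereas the payload uses positive polarity until kvm_deliver_exception_payload() folds it back into vcpu->arch.dr6.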
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 -- arch/x86/kvm/svm/nested.c | 15 +++++++++++---- arch/x86/kvm/svm/svm.c | 29 +++++++++++++++++++++-------- arch/x86/kvm/vmx/vmx.c | 5 ----- arch/x86/kvm/x86.c | 11 ----------- 5 files changed, 32 insertions(+), 30 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index de0c28814348..9e8263b1e6fe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1093,7 +1093,6 @@ struct kvm_x86_ops { void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); - void (*set_dr6)(struct kvm_vcpu *vcpu, unsigned long value); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); @@ -1623,7 +1622,6 @@ int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); -void kvm_update_dr6(struct kvm_vcpu *vcpu); u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9cfa8098995e..9a2a62e5afeb 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -269,7 +269,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->vmcb->save.rip = nested_vmcb->save.rip; svm->vmcb->save.dr7 = nested_vmcb->save.dr7; svm->vcpu.arch.dr6 = nested_vmcb->save.dr6; - kvm_update_dr6(&svm->vcpu); svm->vmcb->save.cpl = nested_vmcb->save.cpl; svm->nested.vmcb_msrpm = nested_vmcb->control.msrpm_base_pa & ~0x0fffULL; @@ -634,10 +633,18 @@ static int nested_svm_intercept_db(struct vcpu_svm *svm) reflected_db: /* - * Synchronize guest DR6 here just like in db_interception; it will - * be moved into the nested VMCB by nested_svm_vmexit. + * Synchronize guest DR6 here just like in kvm_deliver_exception_payload; + * it will be moved into the nested VMCB by nested_svm_vmexit. Once + * exceptions will be moved to svm_check_nested_events, all this stuff + * will just go away and we could just return NESTED_EXIT_HOST + * unconditionally. db_interception will queue the exception, which + * will be processed by svm_check_nested_events if a nested vmexit is + * required, and we will just use kvm_deliver_exception_payload to copy + * the payload to DR6 before vmexit. 
*/ - svm->vcpu.arch.dr6 = dr6; + WARN_ON(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT); + svm->vcpu.arch.dr6 &= ~(DR_TRAP_BITS | DR6_RTM); + svm->vcpu.arch.dr6 |= dr6 & ~DR6_FIXED_1; return NESTED_EXIT_DONE; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f03bffafd9e6..a862c768fd54 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1672,12 +1672,14 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) mark_dirty(svm->vmcb, VMCB_ASID); } -static void svm_set_dr6(struct kvm_vcpu *vcpu, unsigned long value) +static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) { - struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; - svm->vmcb->save.dr6 = value; - mark_dirty(svm->vmcb, VMCB_DR); + if (unlikely(value != vmcb->save.dr6)) { + vmcb->save.dr6 = value; + mark_dirty(vmcb, VMCB_DR); + } } static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) @@ -1688,9 +1690,12 @@ static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) get_debugreg(vcpu->arch.db[1], 1); get_debugreg(vcpu->arch.db[2], 2); get_debugreg(vcpu->arch.db[3], 3); + /* + * We cannot reset svm->vmcb->save.dr6 to DR6_FIXED_1|DR6_RTM here, + * because db_interception might need it. We can do it before vmentry. + */ vcpu->arch.dr6 = svm->vmcb->save.dr6; vcpu->arch.dr7 = svm->vmcb->save.dr7; - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; set_dr_intercepts(svm); } @@ -1734,8 +1739,8 @@ static int db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && !svm->nmi_singlestep) { - vcpu->arch.dr6 = svm->vmcb->save.dr6; - kvm_queue_exception(&svm->vcpu, DB_VECTOR); + u32 payload = (svm->vmcb->save.dr6 ^ DR6_RTM) & ~DR6_FIXED_1; + kvm_queue_exception_p(&svm->vcpu, DB_VECTOR, payload); return 1; } @@ -3313,6 +3318,15 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; + /* + * Run with all-zero DR6 unless needed, so that we can get the exact cause + * of a #DB. + */ + if (unlikely(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + svm_set_dr6(svm, vcpu->arch.dr6); + else + svm_set_dr6(svm, DR6_FIXED_1 | DR6_RTM); + clgi(); kvm_load_guest_xsave_state(vcpu); @@ -3927,7 +3941,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_idt = svm_set_idt, .get_gdt = svm_get_gdt, .set_gdt = svm_set_gdt, - .set_dr6 = svm_set_dr6, .set_dr7 = svm_set_dr7, .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, .cache_reg = svm_cache_reg, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6153a47109d3..e2b71b0cdfce 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4965,10 +4965,6 @@ static int handle_dr(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } -static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val) -{ -} - static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) { get_debugreg(vcpu->arch.db[0], 0); @@ -7731,7 +7727,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .set_dr6 = vmx_set_dr6, .set_dr7 = vmx_set_dr7, .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs, .cache_reg = vmx_cache_reg, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4ea644827b8a..f780af601c5f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -473,7 +473,6 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu) * breakpoint), it is reserved and must be zero in DR6. 
*/ vcpu->arch.dr6 &= ~BIT(12); - kvm_update_dr6(vcpu); break; case PF_VECTOR: vcpu->arch.cr2 = payload; @@ -1047,12 +1046,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) } } -void kvm_update_dr6(struct kvm_vcpu *vcpu) -{ - if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - kvm_x86_ops.set_dr6(vcpu, vcpu->arch.dr6); -} - static void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -1092,7 +1085,6 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) if (val & 0xffffffff00000000ULL) return -1; /* #GP */ vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); - kvm_update_dr6(vcpu); break; case 5: /* fall through */ @@ -4008,7 +4000,6 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = dbgregs->dr6; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = dbgregs->dr7; kvm_update_dr7(vcpu); @@ -8417,7 +8408,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); kvm_x86_ops.sync_dirty_debug_regs(vcpu); kvm_update_dr0123(vcpu); - kvm_update_dr6(vcpu); kvm_update_dr7(vcpu); vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } @@ -9478,7 +9468,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); kvm_update_dr0123(vcpu); vcpu->arch.dr6 = DR6_INIT; - kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); -- cgit v1.2.3 From 7e32a9dac9926241d56851e1517c9391d39fb48e Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 8 May 2020 11:22:47 +0200 Subject: x86/cpu: Use INVPCID mnemonic in invpcid.h The current minimum required version of binutils is 2.23, which supports the INVPCID instruction mnemonic. Replace the byte-wise specification of INVPCID with the proper mnemonic. [ bp: Add symbolic operand names for increased readability and flip their order like the insn expects them for the AT&T syntax. ] Signed-off-by: Uros Bizjak Signed-off-by: Borislav Petkov Reviewed-by: H. Peter Anvin (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200508092247.132147-1-ubizjak@gmail.com Signed-off-by: Borislav Petkov --- arch/x86/include/asm/invpcid.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h index 989cfa86de85..734482afbf81 100644 --- a/arch/x86/include/asm/invpcid.h +++ b/arch/x86/include/asm/invpcid.h @@ -12,12 +12,9 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * stale TLB entries and, especially if we're flushing global * mappings, we don't want the compiler to reorder any subsequent * memory accesses before the TLB flush. - * - * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and - * invpcid (%rcx), %rax in long mode. */ - asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (&desc) : "memory"); + asm volatile("invpcid %[desc], %[type]" + :: [desc] "m" (desc), [type] "r" (type) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 -- cgit v1.2.3 From e72e8bf1c9847a12de74f2fd3ea1f5511866526b Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 31 Mar 2020 11:40:32 +0200 Subject: floppy: split the base port from the register in I/O accesses Currently we have architecture-specific fd_inb() and fd_outb() functions or macros, taking just a port which is in fact made of a base address and a register. 
The base address is FDC-specific and derived from the local or global "fdc" variable through the FD_IOPORT macro used in the base address calculation. This change splits this by explicitly passing the FDC's base address and the register separately to fd_outb() and fd_inb(). It affects the following archs: - x86, alpha, mips, powerpc, parisc, arm, m68k: simple remap of port -> base+reg - sparc32: use of reg only, since the base address was already masked out and the FDC controller is known from a static struct. - sparc64: like x86 for PCI, like sparc32 for 82077 Some archs use inline functions and others macros. This was not unified in order to minimize the number of changes to review. For the same reason checkpatch still spews a few warnings about things that were already there before. The parisc still uses hard-coded register values and could be cleaned up by taking the register definitions. The sparc per-controller inb/outb functions could further be refined to explicitly take an FDC register instead of a port in argument but it was not needed yet and may be cleaned later. Link: https://lore.kernel.org/r/20200331094054.24441-2-w@1wt.eu Cc: Ivan Kokshaysky Cc: Richard Henderson Cc: Matt Turner Cc: Ian Molton Cc: Russell King Cc: Geert Uytterhoeven Cc: Thomas Bogendoerfer Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: "David S. Miller" Cc: x86@kernel.org Signed-off-by: Willy Tarreau Signed-off-by: Denis Efremov --- arch/alpha/include/asm/floppy.h | 4 ++-- arch/arm/include/asm/floppy.h | 8 ++++---- arch/m68k/include/asm/floppy.h | 12 ++++++------ arch/mips/include/asm/mach-generic/floppy.h | 8 ++++---- arch/mips/include/asm/mach-jazz/floppy.h | 8 ++++---- arch/parisc/include/asm/floppy.h | 12 ++++++------ arch/powerpc/include/asm/floppy.h | 4 ++-- arch/sparc/include/asm/floppy_32.h | 4 ++-- arch/sparc/include/asm/floppy_64.h | 4 ++-- arch/x86/include/asm/floppy.h | 4 ++-- drivers/block/floppy.c | 4 ++-- 11 files changed, 36 insertions(+), 36 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/alpha/include/asm/floppy.h b/arch/alpha/include/asm/floppy.h index 942924756cf2..8dfdb3aa1d96 100644 --- a/arch/alpha/include/asm/floppy.h +++ b/arch/alpha/include/asm/floppy.h @@ -11,8 +11,8 @@ #define __ASM_ALPHA_FLOPPY_H -#define fd_inb(port) inb_p(port) -#define fd_outb(value,port) outb_p(value,port) +#define fd_inb(base, reg) inb_p((base) + (reg)) +#define fd_outb(value, base, reg) outb_p(value, (base) + (reg)) #define fd_enable_dma() enable_dma(FLOPPY_DMA) #define fd_disable_dma() disable_dma(FLOPPY_DMA) diff --git a/arch/arm/include/asm/floppy.h b/arch/arm/include/asm/floppy.h index 79fa327238e8..e1cb04ed5008 100644 --- a/arch/arm/include/asm/floppy.h +++ b/arch/arm/include/asm/floppy.h @@ -9,20 +9,20 @@ #ifndef __ASM_ARM_FLOPPY_H #define __ASM_ARM_FLOPPY_H -#define fd_outb(val,port) \ +#define fd_outb(val, base, reg) \ do { \ int new_val = (val); \ - if (((port) & 7) == FD_DOR) { \ + if ((reg) == FD_DOR) { \ if (new_val & 0xf0) \ new_val = (new_val & 0x0c) | \ floppy_selects[new_val & 3]; \ else \ new_val &= 0x0c; \ } \ - outb(new_val, (port)); \ + outb(new_val, (base) + (reg)); \ } while(0) -#define fd_inb(port) inb((port)) +#define fd_inb(base, reg) inb((base) + (reg)) #define fd_request_irq() request_irq(IRQ_FLOPPYDISK,floppy_interrupt,\ 0,"floppy",NULL) #define fd_free_irq() free_irq(IRQ_FLOPPYDISK,NULL) diff --git a/arch/m68k/include/asm/floppy.h b/arch/m68k/include/asm/floppy.h index c3b9ad6732fc..2a6ce29b92aa 100644 --- a/arch/m68k/include/asm/floppy.h +++ 
b/arch/m68k/include/asm/floppy.h @@ -63,21 +63,21 @@ static __inline__ void release_dma_lock(unsigned long flags) } -static __inline__ unsigned char fd_inb(int port) +static __inline__ unsigned char fd_inb(int base, int reg) { if(MACH_IS_Q40) - return inb_p(port); + return inb_p(base + reg); else if(MACH_IS_SUN3X) - return sun3x_82072_fd_inb(port); + return sun3x_82072_fd_inb(base + reg); return 0; } -static __inline__ void fd_outb(unsigned char value, int port) +static __inline__ void fd_outb(unsigned char value, int base, int reg) { if(MACH_IS_Q40) - outb_p(value, port); + outb_p(value, base + reg); else if(MACH_IS_SUN3X) - sun3x_82072_fd_outb(value, port); + sun3x_82072_fd_outb(value, base + reg); } diff --git a/arch/mips/include/asm/mach-generic/floppy.h b/arch/mips/include/asm/mach-generic/floppy.h index 9ec2f6a5200b..e3f446d54827 100644 --- a/arch/mips/include/asm/mach-generic/floppy.h +++ b/arch/mips/include/asm/mach-generic/floppy.h @@ -26,14 +26,14 @@ /* * How to access the FDC's registers. */ -static inline unsigned char fd_inb(unsigned int port) +static inline unsigned char fd_inb(unsigned int base, unsigned int reg) { - return inb_p(port); + return inb_p(base + reg); } -static inline void fd_outb(unsigned char value, unsigned int port) +static inline void fd_outb(unsigned char value, unsigned int base, unsigned int reg) { - outb_p(value, port); + outb_p(value, base + reg); } /* diff --git a/arch/mips/include/asm/mach-jazz/floppy.h b/arch/mips/include/asm/mach-jazz/floppy.h index 4b86c88a03b7..095000c290e5 100644 --- a/arch/mips/include/asm/mach-jazz/floppy.h +++ b/arch/mips/include/asm/mach-jazz/floppy.h @@ -17,19 +17,19 @@ #include #include -static inline unsigned char fd_inb(unsigned int port) +static inline unsigned char fd_inb(unsigned int base, unsigned int reg) { unsigned char c; - c = *(volatile unsigned char *) port; + c = *(volatile unsigned char *) (base + reg); udelay(1); return c; } -static inline void fd_outb(unsigned char value, unsigned int port) +static inline void fd_outb(unsigned char value, unsigned int base, unsigned int reg) { - *(volatile unsigned char *) port = value; + *(volatile unsigned char *) (base + reg) = value; } /* diff --git a/arch/parisc/include/asm/floppy.h b/arch/parisc/include/asm/floppy.h index 09b6f4c1687e..1aebc23b7744 100644 --- a/arch/parisc/include/asm/floppy.h +++ b/arch/parisc/include/asm/floppy.h @@ -29,8 +29,8 @@ #define CSW fd_routine[can_use_virtual_dma & 1] -#define fd_inb(port) readb(port) -#define fd_outb(value, port) writeb(value, port) +#define fd_inb(base, reg) readb((base) + (reg)) +#define fd_outb(value, base, reg) writeb(value, (base) + (reg)) #define fd_request_dma() CSW._request_dma(FLOPPY_DMA,"floppy") #define fd_free_dma() CSW._free_dma(FLOPPY_DMA) @@ -75,19 +75,19 @@ static void floppy_hardint(int irq, void *dev_id, struct pt_regs * regs) register char *lptr = virtual_dma_addr; for (lcount = virtual_dma_count; lcount; lcount--) { - st = fd_inb(virtual_dma_port+4) & 0xa0 ; + st = fd_inb(virtual_dma_port, 4) & 0xa0; if (st != 0xa0) break; if (virtual_dma_mode) { - fd_outb(*lptr, virtual_dma_port+5); + fd_outb(*lptr, virtual_dma_port, 5); } else { - *lptr = fd_inb(virtual_dma_port+5); + *lptr = fd_inb(virtual_dma_port, 5); } lptr++; } virtual_dma_count = lcount; virtual_dma_addr = lptr; - st = fd_inb(virtual_dma_port+4); + st = fd_inb(virtual_dma_port, 4); } #ifdef TRACE_FLPY_INT diff --git a/arch/powerpc/include/asm/floppy.h b/arch/powerpc/include/asm/floppy.h index 167c44b58848..ed467eb0c4c8 100644 --- 
a/arch/powerpc/include/asm/floppy.h +++ b/arch/powerpc/include/asm/floppy.h @@ -13,8 +13,8 @@ #include -#define fd_inb(port) inb_p(port) -#define fd_outb(value,port) outb_p(value,port) +#define fd_inb(base, reg) inb_p((base) + (reg)) +#define fd_outb(value, base, reg) outb_p(value, (base) + (reg)) #define fd_enable_dma() enable_dma(FLOPPY_DMA) #define fd_disable_dma() fd_ops->_disable_dma(FLOPPY_DMA) diff --git a/arch/sparc/include/asm/floppy_32.h b/arch/sparc/include/asm/floppy_32.h index b519acf4383d..4d08df4f4deb 100644 --- a/arch/sparc/include/asm/floppy_32.h +++ b/arch/sparc/include/asm/floppy_32.h @@ -59,8 +59,8 @@ struct sun_floppy_ops { static struct sun_floppy_ops sun_fdops; -#define fd_inb(port) sun_fdops.fd_inb(port) -#define fd_outb(value,port) sun_fdops.fd_outb(value,port) +#define fd_inb(base, reg) sun_fdops.fd_inb(reg) +#define fd_outb(value, base, reg) sun_fdops.fd_outb(value, reg) #define fd_enable_dma() sun_fd_enable_dma() #define fd_disable_dma() sun_fd_disable_dma() #define fd_request_dma() (0) /* nothing... */ diff --git a/arch/sparc/include/asm/floppy_64.h b/arch/sparc/include/asm/floppy_64.h index 3729fc35ba83..c0cf157e5b15 100644 --- a/arch/sparc/include/asm/floppy_64.h +++ b/arch/sparc/include/asm/floppy_64.h @@ -62,8 +62,8 @@ struct sun_floppy_ops { static struct sun_floppy_ops sun_fdops; -#define fd_inb(port) sun_fdops.fd_inb(port) -#define fd_outb(value,port) sun_fdops.fd_outb(value,port) +#define fd_inb(base, reg) sun_fdops.fd_inb((base) + (reg)) +#define fd_outb(value, base, reg) sun_fdops.fd_outb(value, (base) + (reg)) #define fd_enable_dma() sun_fdops.fd_enable_dma() #define fd_disable_dma() sun_fdops.fd_disable_dma() #define fd_request_dma() (0) /* nothing... */ diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 7ec59edde154..20088cb08f5e 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -31,8 +31,8 @@ #define CSW fd_routine[can_use_virtual_dma & 1] -#define fd_inb(port) inb_p(port) -#define fd_outb(value, port) outb_p(value, port) +#define fd_inb(base, reg) inb_p((base) + (reg)) +#define fd_outb(value, base, reg) outb_p(value, (base) + (reg)) #define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy") #define fd_free_dma() CSW._free_dma(FLOPPY_DMA) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index c3daa64cb52c..1cda39098b07 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -595,12 +595,12 @@ static unsigned char in_sector_offset; /* offset within physical sector, static inline unsigned char fdc_inb(int fdc, int reg) { - return fd_inb(fdc_state[fdc].address + reg); + return fd_inb(fdc_state[fdc].address, reg); } static inline void fdc_outb(unsigned char value, int fdc, int reg) { - fd_outb(value, fdc_state[fdc].address + reg); + fd_outb(value, fdc_state[fdc].address, reg); } static inline bool drive_no_geom(int drive) -- cgit v1.2.3 From 38ede90831c7f3e931b58c2b2790d02d5d061592 Mon Sep 17 00:00:00 2001 From: Willy Tarreau Date: Tue, 31 Mar 2020 11:40:39 +0200 Subject: floppy: use symbolic register names in the x86 port Now we can use FD_STATUS and FD_DATA instead of 4 or 5, let's do this, and also use STATUS_DMA and STATUS_READY for the status bits. 
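For readers who do not have the FDC register map in their head, the symbolic names used below correspond to the magic numbers they replace (the values follow directly from the hunk; the definitions live in <linux/fdreg.h>):

    FD_STATUS    = 4      /* main status register, previously "port + 4" */
    FD_DATA      = 5      /* data/FIFO register,   previously "port + 5" */
    STATUS_DMA   = 0x20   /* non-DMA execution bit                       */
    STATUS_READY = 0x80   /* data register ready                         */

so the old "st != 0xa0" test becomes st != (STATUS_DMA | STATUS_READY) and the final "st == 0x20" test becomes st == STATUS_DMA.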
Link: https://lore.kernel.org/r/20200331094054.24441-9-w@1wt.eu Cc: x86@kernel.org Signed-off-by: Willy Tarreau Signed-off-by: Denis Efremov --- arch/x86/include/asm/floppy.h | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h index 20088cb08f5e..d43717b423cb 100644 --- a/arch/x86/include/asm/floppy.h +++ b/arch/x86/include/asm/floppy.h @@ -77,25 +77,26 @@ static irqreturn_t floppy_hardint(int irq, void *dev_id) st = 1; for (lcount = virtual_dma_count, lptr = virtual_dma_addr; lcount; lcount--, lptr++) { - st = inb(virtual_dma_port + 4) & 0xa0; - if (st != 0xa0) + st = inb(virtual_dma_port + FD_STATUS); + st &= STATUS_DMA | STATUS_READY; + if (st != (STATUS_DMA | STATUS_READY)) break; if (virtual_dma_mode) - outb_p(*lptr, virtual_dma_port + 5); + outb_p(*lptr, virtual_dma_port + FD_DATA); else - *lptr = inb_p(virtual_dma_port + 5); + *lptr = inb_p(virtual_dma_port + FD_DATA); } virtual_dma_count = lcount; virtual_dma_addr = lptr; - st = inb(virtual_dma_port + 4); + st = inb(virtual_dma_port + FD_STATUS); } #ifdef TRACE_FLPY_INT calls++; #endif - if (st == 0x20) + if (st == STATUS_DMA) return IRQ_HANDLED; - if (!(st & 0x20)) { + if (!(st & STATUS_DMA)) { virtual_dma_residue += virtual_dma_count; virtual_dma_count = 0; #ifdef TRACE_FLPY_INT -- cgit v1.2.3 From 5274e6c172c47241534e970df26a522497086624 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:35 -0700 Subject: x86/fpu/xstate: Rename validate_xstate_header() to validate_user_xstate_header() The function validate_xstate_header() validates an xstate header coming from userspace (PTRACE or sigreturn). To make it clear, rename it to validate_user_xstate_header(). 
Suggested-by: Dave Hansen Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-2-yu-cheng.yu@intel.com --- arch/x86/include/asm/fpu/xstate.h | 2 +- arch/x86/kernel/fpu/regset.c | 2 +- arch/x86/kernel/fpu/signal.c | 2 +- arch/x86/kernel/fpu/xstate.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index c6136d79f8c0..fc4db51f3b53 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -56,6 +56,6 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -extern int validate_xstate_header(const struct xstate_header *hdr); +int validate_user_xstate_header(const struct xstate_header *hdr); #endif diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index d652b939ccfb..bd1d0649f8ce 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -139,7 +139,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); if (!ret) - ret = validate_xstate_header(&xsave->header); + ret = validate_user_xstate_header(&xsave->header); } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 400a05e1c1c5..585e3651b98f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -366,7 +366,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); if (!ret && state_size > offsetof(struct xregs_state, header)) - ret = validate_xstate_header(&fpu->state.xsave.header); + ret = validate_user_xstate_header(&fpu->state.xsave.header); } if (ret) goto err_out; diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 32b153d38748..8ed64397c78b 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -472,7 +472,7 @@ int using_compacted_format(void) } /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ -int validate_xstate_header(const struct xstate_header *hdr) +int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) @@ -1147,7 +1147,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) memcpy(&hdr, kbuf + offset, size); - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { @@ -1201,7 +1201,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) if (__copy_from_user(&hdr, ubuf + offset, size)) return -EFAULT; - if (validate_xstate_header(&hdr)) + if (validate_user_xstate_header(&hdr)) return -EINVAL; for (i = 0; i < XFEATURE_MAX; i++) { -- cgit v1.2.3 From 8ab22804efefea9ecf3c68aa00f1fa69c70fcfad Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:36 -0700 Subject: x86/fpu/xstate: Define new macros for supervisor and user xstates XCNTXT_MASK is 'all supported xfeatures' before introducing supervisor xstates. 
Rename it to XFEATURE_MASK_USER_SUPPORTED to make clear that these are user xstates. Replace XFEATURE_MASK_SUPERVISOR with the following: - XFEATURE_MASK_SUPERVISOR_SUPPORTED: Currently nothing. ENQCMD and Control-flow Enforcement Technology (CET) will be introduced in separate series. - XFEATURE_MASK_SUPERVISOR_UNSUPPORTED: Currently only Processor Trace. - XFEATURE_MASK_SUPERVISOR_ALL: the combination of above. Co-developed-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-3-yu-cheng.yu@intel.com --- arch/x86/include/asm/fpu/xstate.h | 36 +++++++++++++++++++++++------------- arch/x86/kernel/fpu/init.c | 3 ++- arch/x86/kernel/fpu/xstate.c | 26 +++++++++++++------------- 3 files changed, 38 insertions(+), 27 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index fc4db51f3b53..b08fa823425f 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,19 +21,29 @@ #define XSAVE_YMM_SIZE 256 #define XSAVE_YMM_OFFSET (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET) -/* Supervisor features */ -#define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) - -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_FP | \ - XFEATURE_MASK_SSE | \ - XFEATURE_MASK_YMM | \ - XFEATURE_MASK_OPMASK | \ - XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM | \ - XFEATURE_MASK_PKRU | \ - XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR) +/* All currently supported user features */ +#define XFEATURE_MASK_USER_SUPPORTED (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + +/* All currently supported supervisor features */ +#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (0) + +/* + * Unsupported supervisor features. When a supervisor feature in this mask is + * supported in the future, move it to the supported supervisor feature mask. + */ +#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT) + +/* All supervisor states including supported and unsupported states. */ +#define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \ + XFEATURE_MASK_SUPERVISOR_UNSUPPORTED) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6ce7e0a23268..61ddc3a5e5c2 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -224,7 +224,8 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ u64 __init fpu__get_supported_xfeatures_mask(void) { - return XCNTXT_MASK; + return XFEATURE_MASK_USER_SUPPORTED | + XFEATURE_MASK_SUPERVISOR_SUPPORTED; } /* Legacy code to initialize eager fpu mode. */ diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 8ed64397c78b..9997df717339 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -208,14 +208,13 @@ void fpu__init_cpu_xstate(void) if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) return; /* - * Make it clear that XSAVES supervisor states are not yet - * implemented should anyone expect it to work by changing - * bits in XFEATURE_MASK_* macros and XCR0. + * Unsupported supervisor xstates should not be found in + * the xfeatures mask. 
*/ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR), - "x86/fpu: XSAVES supervisor states are not yet implemented.\n"); + WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED), + "x86/fpu: Found unsupported supervisor xstates.\n"); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR; + xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); @@ -438,7 +437,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr) * format. Checking a supervisor state's uncompacted offset is * an error. */ - if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) { + if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) { WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr); return -1; } @@ -475,7 +474,7 @@ int using_compacted_format(void) int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR)) + if (hdr->xfeatures & ~(xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED)) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -768,7 +767,8 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR); + update_regset_xstate_info(fpu_user_xstate_size, + xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -996,7 +996,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; /* * Copy xregs_state->header: @@ -1080,7 +1080,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR; + header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; /* * Copy xregs_state->header: @@ -1173,7 +1173,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) * The state that came in from userspace was user-state only. * Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: @@ -1229,7 +1229,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) * The state that came in from userspace was user-state only. 
* Mask all the user states out of 'xfeatures': */ - xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR; + xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL; /* * Add back in the features that came in from userspace: -- cgit v1.2.3 From 59566b0b622e3e6ea928c0b8cac8a5601b00b383 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 30 Apr 2020 20:21:47 -0400 Subject: x86/ftrace: Have ftrace trampolines turn read-only at the end of system boot up Booting one of my machines, it triggered the following crash: Kernel/User page tables isolation: enabled ftrace: allocating 36577 entries in 143 pages Starting tracer 'function' BUG: unable to handle page fault for address: ffffffffa000005c #PF: supervisor write access in kernel mode #PF: error_code(0x0003) - permissions violation PGD 2014067 P4D 2014067 PUD 2015063 PMD 7b253067 PTE 7b252061 Oops: 0003 [#1] PREEMPT SMP PTI CPU: 0 PID: 0 Comm: swapper Not tainted 5.4.0-test+ #24 Hardware name: To Be Filled By O.E.M. To Be Filled By O.E.M./To be filled by O.E.M., BIOS SDBLI944.86P 05/08/2007 RIP: 0010:text_poke_early+0x4a/0x58 Code: 34 24 48 89 54 24 08 e8 bf 72 0b 00 48 8b 34 24 48 8b 4c 24 08 84 c0 74 0b 48 89 df f3 a4 48 83 c4 10 5b c3 9c 58 fa 48 89 df a4 50 9d 48 83 c4 10 5b e9 d6 f9 ff ff 0 41 57 49 RSP: 0000:ffffffff82003d38 EFLAGS: 00010046 RAX: 0000000000000046 RBX: ffffffffa000005c RCX: 0000000000000005 RDX: 0000000000000005 RSI: ffffffff825b9a90 RDI: ffffffffa000005c RBP: ffffffffa000005c R08: 0000000000000000 R09: ffffffff8206e6e0 R10: ffff88807b01f4c0 R11: ffffffff8176c106 R12: ffffffff8206e6e0 R13: ffffffff824f2440 R14: 0000000000000000 R15: ffffffff8206eac0 FS: 0000000000000000(0000) GS:ffff88807d400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa000005c CR3: 0000000002012000 CR4: 00000000000006b0 Call Trace: text_poke_bp+0x27/0x64 ? mutex_lock+0x36/0x5d arch_ftrace_update_trampoline+0x287/0x2d5 ? ftrace_replace_code+0x14b/0x160 ? ftrace_update_ftrace_func+0x65/0x6c __register_ftrace_function+0x6d/0x81 ftrace_startup+0x23/0xc1 register_ftrace_function+0x20/0x37 func_set_flag+0x59/0x77 __set_tracer_option.isra.19+0x20/0x3e trace_set_options+0xd6/0x13e apply_trace_boot_options+0x44/0x6d register_tracer+0x19e/0x1ac early_trace_init+0x21b/0x2c9 start_kernel+0x241/0x518 ? load_ucode_intel_bsp+0x21/0x52 secondary_startup_64+0xa4/0xb0 I was able to trigger it on other machines, when I added to the kernel command line of both "ftrace=function" and "trace_options=func_stack_trace". The cause is the "ftrace=function" would register the function tracer and create a trampoline, and it will set it as executable and read-only. Then the "trace_options=func_stack_trace" would then update the same trampoline to include the stack tracer version of the function tracer. But since the trampoline already exists, it updates it with text_poke_bp(). The problem is that text_poke_bp() called while system_state == SYSTEM_BOOTING, it will simply do a memcpy() and not the page mapping, as it would think that the text is still read-write. But in this case it is not, and we take a fault and crash. Instead, lets keep the ftrace trampolines read-write during boot up, and then when the kernel executable text is set to read-only, the ftrace trampolines get set to read-only as well. Link: https://lkml.kernel.org/r/20200430202147.4dc6e2de@oasis.local.home Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Josh Poimboeuf Cc: "H. 
Peter Anvin" Cc: stable@vger.kernel.org Fixes: 768ae4406a5c ("x86/ftrace: Use text_poke()") Acked-by: Peter Zijlstra Signed-off-by: Steven Rostedt (VMware) --- arch/x86/include/asm/ftrace.h | 6 ++++++ arch/x86/kernel/ftrace.c | 29 ++++++++++++++++++++++++++++- arch/x86/mm/init_64.c | 3 +++ include/linux/ftrace.h | 23 +++++++++++++++++++++++ kernel/trace/ftrace_internal.h | 22 ---------------------- 5 files changed, 60 insertions(+), 23 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 85be2f506272..89af0d2c62aa 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -56,6 +56,12 @@ struct dyn_arch_ftrace { #ifndef __ASSEMBLY__ +#if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) +extern void set_ftrace_ops_ro(void); +#else +static inline void set_ftrace_ops_ro(void) { } +#endif + #define ARCH_HAS_SYSCALL_MATCH_SYM_NAME static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) { diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 37a0aeaf89e7..b0e641793be4 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -407,7 +407,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) set_vm_flush_reset_perms(trampoline); - set_memory_ro((unsigned long)trampoline, npages); + if (likely(system_state != SYSTEM_BOOTING)) + set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: @@ -415,6 +416,32 @@ fail: return 0; } +void set_ftrace_ops_ro(void) +{ + struct ftrace_ops *ops; + unsigned long start_offset; + unsigned long end_offset; + unsigned long npages; + unsigned long size; + + do_for_each_ftrace_op(ops, ftrace_ops_list) { + if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) + continue; + + if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { + start_offset = (unsigned long)ftrace_regs_caller; + end_offset = (unsigned long)ftrace_regs_caller_end; + } else { + start_offset = (unsigned long)ftrace_caller; + end_offset = (unsigned long)ftrace_epilogue; + } + size = end_offset - start_offset; + size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(size, PAGE_SIZE); + set_memory_ro((unsigned long)ops->trampoline, npages); + } while_for_each_ftrace_op(ops); +} + static unsigned long calc_trampoline_call_offset(bool save_regs) { unsigned long start_offset; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3b289c2f75cd..8b5f73f5e207 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -54,6 +54,7 @@ #include #include #include +#include #include "mm_internal.h" @@ -1291,6 +1292,8 @@ void mark_rodata_ro(void) all_end = roundup((unsigned long)_brk_end, PMD_SIZE); set_memory_nx(text_end, (all_end - text_end) >> PAGE_SHIFT); + set_ftrace_ops_ro(); + #ifdef CONFIG_CPA_DEBUG printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end); set_memory_rw(start, (end-start) >> PAGE_SHIFT); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ab4bd15cbcdb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -210,6 +210,29 @@ struct ftrace_ops { #endif }; +extern struct ftrace_ops __rcu *ftrace_ops_list; +extern struct ftrace_ops ftrace_list_end; + +/* + * Traverse the ftrace_global_list, invoking all entries. 
The reason that we + * can use rcu_dereference_raw_check() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw_check() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +#define do_for_each_ftrace_op(op, list) \ + op = rcu_dereference_raw_check(list); \ + do + +/* + * Optimized for just a single item in the list (as that is the normal case). + */ +#define while_for_each_ftrace_op(op) \ + while (likely(op = rcu_dereference_raw_check((op)->next)) && \ + unlikely((op) != &ftrace_list_end)) + /* * Type of the current tracing. */ diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h index 0456e0a3dab1..382775edf690 100644 --- a/kernel/trace/ftrace_internal.h +++ b/kernel/trace/ftrace_internal.h @@ -4,28 +4,6 @@ #ifdef CONFIG_FUNCTION_TRACER -/* - * Traverse the ftrace_global_list, invoking all entries. The reason that we - * can use rcu_dereference_raw_check() is that elements removed from this list - * are simply leaked, so there is no need to interact with a grace-period - * mechanism. The rcu_dereference_raw_check() calls are needed to handle - * concurrent insertions into the ftrace_global_list. - * - * Silly Alpha and silly pointer-speculation compiler optimizations! - */ -#define do_for_each_ftrace_op(op, list) \ - op = rcu_dereference_raw_check(list); \ - do - -/* - * Optimized for just a single item in the list (as that is the normal case). - */ -#define while_for_each_ftrace_op(op) \ - while (likely(op = rcu_dereference_raw_check((op)->next)) && \ - unlikely((op) != &ftrace_list_end)) - -extern struct ftrace_ops __rcu *ftrace_ops_list; -extern struct ftrace_ops ftrace_list_end; extern struct mutex ftrace_lock; extern struct ftrace_ops global_ops; -- cgit v1.2.3 From 524bb73bc15c56f5587e33c817e103a259b019d2 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:37 -0700 Subject: x86/fpu/xstate: Separate user and supervisor xfeatures mask Before the introduction of XSAVES supervisor states, 'xfeatures_mask' is used at various places to determine XSAVE buffer components and XCR0 bits. It contains only user xstates. To support supervisor xstates, it is necessary to separate user and supervisor xstates: - First, change 'xfeatures_mask' to 'xfeatures_mask_all', which represents the full set of bits that should ever be set in a kernel XSAVE buffer. - Introduce xfeatures_mask_supervisor() and xfeatures_mask_user() to extract relevant xfeatures from xfeatures_mask_all. 
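Some background on why the two masks are needed (not part of this patch): user state components are enabled through XCR0, while XSAVES supervisor state components are enabled through the IA32_XSS MSR, so the two subsets of xfeatures_mask_all end up being programmed into different registers. A rough sketch using the new helpers:

    xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());    /* XCR0 */
    wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());           /* IA32_XSS */

Only the XCR0 write is part of this patch (see the fpu__init_cpu_xstate() hunk below); the IA32_XSS side only matters once a supervisor state is actually supported, which is not yet the case here.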
Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-4-yu-cheng.yu@intel.com --- arch/x86/include/asm/fpu/internal.h | 2 +- arch/x86/include/asm/fpu/xstate.h | 13 ++++++- arch/x86/kernel/fpu/signal.c | 16 +++++--- arch/x86/kernel/fpu/xstate.c | 73 ++++++++++++++++++++++--------------- 4 files changed, 67 insertions(+), 37 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 44c48e34d799..ccb1bb32ad7d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -92,7 +92,7 @@ static inline void fpstate_init_xstate(struct xregs_state *xsave) * XRSTORS requires these bits set in xcomp_bv, or it will * trigger #GP: */ - xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask; + xsave->header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | xfeatures_mask_all; } static inline void fpstate_init_fxstate(struct fxregs_state *fx) diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index b08fa823425f..92104b298d77 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -51,7 +51,18 @@ #define REX_PREFIX #endif -extern u64 xfeatures_mask; +extern u64 xfeatures_mask_all; + +static inline u64 xfeatures_mask_supervisor(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_SUPPORTED; +} + +static inline u64 xfeatures_mask_user(void) +{ + return xfeatures_mask_all & XFEATURE_MASK_USER_SUPPORTED; +} + extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; extern void __init update_regset_xstate_info(unsigned int size, diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 585e3651b98f..3df0cfae535f 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -252,13 +252,17 @@ sanitize_restored_xstate(union fpregs_state *state, */ static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { + u64 init_bv; + if (use_xsave()) { if (fx_only) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); } else { - u64 init_bv = xfeatures_mask & ~xbv; + init_bv = xfeatures_mask_user() & ~xbv; + if (unlikely(init_bv)) copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_xregs(buf, xbv); @@ -358,7 +362,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (use_xsave() && !fx_only) { - u64 init_bv = xfeatures_mask & ~xfeatures; + u64 init_bv = xfeatures_mask_user() & ~xfeatures; if (using_compacted_format()) { ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); @@ -389,7 +393,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpregs_lock(); if (use_xsave()) { - u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + u64 init_bv; + + init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } @@ -465,7 +471,7 @@ void fpu__init_prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; - fx_sw_reserved.xfeatures = xfeatures_mask; + fx_sw_reserved.xfeatures = xfeatures_mask_user(); fx_sw_reserved.xstate_size = fpu_user_xstate_size; if (IS_ENABLED(CONFIG_IA32_EMULATION) || diff --git 
a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 9997df717339..fa71af643025 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -54,9 +54,10 @@ static short xsave_cpuid_features[] __initdata = { }; /* - * Mask of xstate features supported by the CPU and the kernel: + * This represents the full set of bits that should ever be set in a kernel + * XSAVE buffer, both supervisor and user xstates. */ -u64 xfeatures_mask __read_mostly; +u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; @@ -76,7 +77,7 @@ unsigned int fpu_user_xstate_size; */ int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name) { - u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask; + u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all; if (unlikely(feature_name)) { long xfeature_idx, max_idx; @@ -150,7 +151,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * None of the feature bits are in init state. So nothing else * to do for us, as the memory layout is up to date. */ - if ((xfeatures & xfeatures_mask) == xfeatures_mask) + if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all) return; /* @@ -177,7 +178,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu) * in a special way already: */ feature_bit = 0x2; - xfeatures = (xfeatures_mask & ~xfeatures) >> 2; + xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2; /* * Update all the remaining memory layouts according to their @@ -205,19 +206,28 @@ void fpstate_sanitize_xstate(struct fpu *fpu) */ void fpu__init_cpu_xstate(void) { - if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask) + u64 unsup_bits; + + if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all) return; /* * Unsupported supervisor xstates should not be found in * the xfeatures mask. */ - WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED), - "x86/fpu: Found unsupported supervisor xstates.\n"); + unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n", + unsup_bits); - xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; + xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED; cr4_set_bits(X86_CR4_OSXSAVE); - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + + /* + * XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features + * managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user + * states can be set here. + */ + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); } /* @@ -225,9 +235,9 @@ void fpu__init_cpu_xstate(void) * functions here: one for user xstates and the other for * system xstates. For now, they are the same. 
*/ -static int xfeature_enabled(enum xfeature xfeature) +static bool xfeature_enabled(enum xfeature xfeature) { - return !!(xfeatures_mask & (1UL << xfeature)); + return xfeatures_mask_all & BIT_ULL(xfeature); } /* @@ -414,7 +424,7 @@ static void __init setup_init_fpu_buf(void) if (boot_cpu_has(X86_FEATURE_XSAVES)) init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT | - xfeatures_mask; + xfeatures_mask_all; /* * Init all the features state with header.xfeatures being 0x0 @@ -474,7 +484,7 @@ int using_compacted_format(void) int validate_user_xstate_header(const struct xstate_header *hdr) { /* No unknown or supervisor features may be set */ - if (hdr->xfeatures & ~(xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED)) + if (hdr->xfeatures & ~xfeatures_mask_user()) return -EINVAL; /* Userspace must use the uncompacted format */ @@ -609,7 +619,7 @@ static void do_extra_xstate_size_checks(void) /* - * Get total size of enabled xstates in XCR0/xfeatures_mask. + * Get total size of enabled xstates in XCR0 | IA32_XSS. * * Note the SDM's wording here. "sub-function 0" only enumerates * the size of the *user* states. If we use it to size a buffer @@ -699,7 +709,7 @@ static int __init init_xstate_size(void) */ static void fpu__init_disable_system_xstate(void) { - xfeatures_mask = 0; + xfeatures_mask_all = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); } @@ -734,16 +744,21 @@ void __init fpu__init_system_xstate(void) return; } + /* + * Find user xstates supported by the processor. + */ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); - xfeatures_mask = eax + ((u64)edx << 32); + xfeatures_mask_all = eax + ((u64)edx << 32); - if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { + /* Place supervisor features in xfeatures_mask_all here */ + if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) { /* * This indicates that something really unexpected happened * with the enumeration. Disable XSAVE and try to continue * booting without it. This is too early to BUG(). */ - pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask); + pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", + xfeatures_mask_all); goto out_disable; } @@ -752,10 +767,10 @@ void __init fpu__init_system_xstate(void) */ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { if (!boot_cpu_has(xsave_cpuid_features[i])) - xfeatures_mask &= ~BIT(i); + xfeatures_mask_all &= ~BIT_ULL(i); } - xfeatures_mask &= fpu__get_supported_xfeatures_mask(); + xfeatures_mask_all &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ fpu__init_cpu_xstate(); @@ -767,8 +782,7 @@ void __init fpu__init_system_xstate(void) * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: */ - update_regset_xstate_info(fpu_user_xstate_size, - xfeatures_mask & XFEATURE_MASK_USER_SUPPORTED); + update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user()); fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); @@ -776,7 +790,7 @@ void __init fpu__init_system_xstate(void) print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", - xfeatures_mask, + xfeatures_mask_all, fpu_kernel_xstate_size, boot_cpu_has(X86_FEATURE_XSAVES) ? 
"compacted" : "standard"); return; @@ -795,7 +809,7 @@ void fpu__resume_cpu(void) * Restore XCR0 on xsave capable CPUs: */ if (boot_cpu_has(X86_FEATURE_XSAVE)) - xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask); + xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user()); } /* @@ -840,10 +854,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) /* * We should not ever be requesting features that we - * have not enabled. Remember that xfeatures_mask is - * what we write to the XCR0 register. + * have not enabled. */ - WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), + WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to @@ -996,7 +1009,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; + header.xfeatures &= xfeatures_mask_user(); /* * Copy xregs_state->header: @@ -1080,7 +1093,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i */ memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; - header.xfeatures &= XFEATURE_MASK_USER_SUPPORTED; + header.xfeatures &= xfeatures_mask_user(); /* * Copy xregs_state->header: -- cgit v1.2.3 From b860eb8dce5906b14e3a7f3c771e0b3d6ef61b94 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Tue, 12 May 2020 07:54:39 -0700 Subject: x86/fpu/xstate: Define new functions for clearing fpregs and xstates Currently, fpu__clear() clears all fpregs and xstates. Once XSAVES supervisor states are introduced, supervisor settings (e.g. CET xstates) must remain active for signals; It is necessary to have separate functions: - Create fpu__clear_user_states(): clear only user settings for signals; - Create fpu__clear_all(): clear both user and supervisor settings in flush_thread(). Also modify copy_init_fpstate_to_fpregs() to take a mask from above two functions. Remove obvious side-comment in fpu__clear(), while at it. [ bp: Make the second argument of fpu__clear() bool after requesting it a bunch of times during review. - Add a comment about copy_init_fpstate_to_fpregs() locking needs. 
] Co-developed-by: Yu-cheng Yu Signed-off-by: Fenghua Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Reviewed-by: Tony Luck Link: https://lkml.kernel.org/r/20200512145444.15483-6-yu-cheng.yu@intel.com --- arch/x86/include/asm/fpu/internal.h | 3 ++- arch/x86/kernel/fpu/core.c | 53 ++++++++++++++++++++++++------------- arch/x86/kernel/fpu/signal.c | 4 +-- arch/x86/kernel/process.c | 2 +- arch/x86/kernel/signal.c | 2 +- 5 files changed, 41 insertions(+), 23 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index ccb1bb32ad7d..a42fcb4b690d 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -31,7 +31,8 @@ extern void fpu__save(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); extern int fpu__copy(struct task_struct *dst, struct task_struct *src); -extern void fpu__clear(struct fpu *fpu); +extern void fpu__clear_user_states(struct fpu *fpu); +extern void fpu__clear_all(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 12c70840980e..06c818967bb6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -291,15 +291,13 @@ void fpu__drop(struct fpu *fpu) } /* - * Clear FPU registers by setting them up from - * the init fpstate: + * Clear FPU registers by setting them up from the init fpstate. + * Caller must do fpregs_[un]lock() around it. */ -static inline void copy_init_fpstate_to_fpregs(void) +static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { - fpregs_lock(); - if (use_xsave()) - copy_kernel_to_xregs(&init_fpstate.xsave, -1); + copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else @@ -307,9 +305,6 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); - - fpregs_mark_activate(); - fpregs_unlock(); } /* @@ -318,18 +313,40 @@ static inline void copy_init_fpstate_to_fpregs(void) * Called by sys_execve(), by the signal handler code and by various * error paths. */ -void fpu__clear(struct fpu *fpu) +static void fpu__clear(struct fpu *fpu, bool user_only) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); /* Almost certainly an anomaly */ + WARN_ON_FPU(fpu != ¤t->thread.fpu); - fpu__drop(fpu); + if (!static_cpu_has(X86_FEATURE_FPU)) { + fpu__drop(fpu); + fpu__initialize(fpu); + return; + } - /* - * Make sure fpstate is cleared and initialized. 
- */ - fpu__initialize(fpu); - if (static_cpu_has(X86_FEATURE_FPU)) - copy_init_fpstate_to_fpregs(); + fpregs_lock(); + + if (user_only) { + if (!fpregs_state_valid(fpu, smp_processor_id()) && + xfeatures_mask_supervisor()) + copy_kernel_to_xregs(&fpu->state.xsave, + xfeatures_mask_supervisor()); + copy_init_fpstate_to_fpregs(xfeatures_mask_user()); + } else { + copy_init_fpstate_to_fpregs(xfeatures_mask_all); + } + + fpregs_mark_activate(); + fpregs_unlock(); +} + +void fpu__clear_user_states(struct fpu *fpu) +{ + fpu__clear(fpu, true); +} + +void fpu__clear_all(struct fpu *fpu) +{ + fpu__clear(fpu, false); } /* diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 3df0cfae535f..cd6eafba12da 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -289,7 +289,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) IS_ENABLED(CONFIG_IA32_EMULATION)); if (!buf) { - fpu__clear(fpu); + fpu__clear_user_states(fpu); return 0; } @@ -416,7 +416,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) err_out: if (ret) - fpu__clear(fpu); + fpu__clear_user_states(fpu); return ret; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9da70b279dad..de182b84723a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -191,7 +191,7 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - fpu__clear(&tsk->thread.fpu); + fpu__clear_all(&tsk->thread.fpu); } void disable_TSC(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 83b74fb38c8f..0052bbe5dfd4 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -732,7 +732,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. */ - fpu__clear(fpu); + fpu__clear_user_states(fpu); } signal_setup_done(failed, ksig, stepping); } -- cgit v1.2.3 From 37486135d3a7b03acc7755b63627a130437f066a Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Tue, 12 May 2020 18:59:06 -0500 Subject: KVM: x86: Fix pkru save/restore when guest CR4.PKE=0, move it to x86.c Though rdpkru and wrpkru are contingent upon CR4.PKE, the PKRU resource isn't. It can be read with XSAVE and written with XRSTOR. So, if we don't set the guest PKRU value here(kvm_load_guest_xsave_state), the guest can read the host value. In case of kvm_load_host_xsave_state, guest with CR4.PKE clear could potentially use XRSTOR to change the host PKRU value. While at it, move pkru state save/restore to common code and the host_pkru field to kvm_vcpu_arch. This will let SVM support protection keys. 
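The common-code flow added by this patch amounts to the following sketch, condensed from the x86.c hunks below (host_pkru is the new kvm_vcpu_arch field; everything else already exists):

	/* kvm_arch_vcpu_load(): remember the host's PKRU */
	vcpu->arch.host_pkru = read_pkru();

	/* kvm_load_guest_xsave_state(): install the guest PKRU when it is reachable */
	if (static_cpu_has(X86_FEATURE_PKU) &&
	    (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
	     (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
	    vcpu->arch.pkru != vcpu->arch.host_pkru)
		__write_pkru(vcpu->arch.pkru);

	/* kvm_load_host_xsave_state(): save the guest value, restore the host's */
	if (static_cpu_has(X86_FEATURE_PKU) &&
	    (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
	     (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
		vcpu->arch.pkru = rdpkru();
		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
			__write_pkru(vcpu->arch.host_pkru);
	}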
Cc: stable@vger.kernel.org Reported-by: Jim Mattson Signed-off-by: Babu Moger Message-Id: <158932794619.44260.14508381096663848853.stgit@naples-babu.amd.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/vmx.c | 18 ------------------ arch/x86/kvm/x86.c | 17 +++++++++++++++++ 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9e8263b1e6fe..0a6b35353fc7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -578,6 +578,7 @@ struct kvm_vcpu_arch { unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr8; + u32 host_pkru; u32 pkru; u32 hflags; u64 efer; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index e45cf89c5821..89c766fad889 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1372,7 +1372,6 @@ void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx_vcpu_pi_load(vcpu, cpu); - vmx->host_pkru = read_pkru(); vmx->host_debugctlmsr = get_debugctlmsr(); } @@ -6564,11 +6563,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) kvm_load_guest_xsave_state(vcpu); - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && - vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vcpu->arch.pkru); - pt_guest_enter(vmx); if (vcpu_to_pmu(vcpu)->version) @@ -6658,18 +6652,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) pt_guest_exit(vmx); - /* - * eager fpu is enabled if PKEY is supported and CR4 is switched - * back on host, so it is safe to read guest PKRU from current - * XSAVE. - */ - if (static_cpu_has(X86_FEATURE_PKU) && - kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = rdpkru(); - if (vcpu->arch.pkru != vmx->host_pkru) - __write_pkru(vmx->host_pkru); - } - kvm_load_host_xsave_state(vcpu); vmx->nested.nested_run_pending = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 98176b80c481..d11eba8b85c6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -837,11 +837,25 @@ void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu) vcpu->arch.ia32_xss != host_xss) wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss); } + + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) && + vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.pkru); } EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state); void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu) { + if (static_cpu_has(X86_FEATURE_PKU) && + (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) || + (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) { + vcpu->arch.pkru = rdpkru(); + if (vcpu->arch.pkru != vcpu->arch.host_pkru) + __write_pkru(vcpu->arch.host_pkru); + } + if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) { if (vcpu->arch.xcr0 != host_xcr0) @@ -3549,6 +3563,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops.vcpu_load(vcpu, cpu); + /* Save host pkru register if supported */ + vcpu->arch.host_pkru = read_pkru(); + /* Apply any externally detected TSC adjustments (due to suspend) */ if (unlikely(vcpu->arch.tsc_offset_adjustment)) { adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); -- cgit v1.2.3 From d2060bd42e4482b15c35f961a294ee57f369027d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 22 Apr 2020 19:25:39 -0700 Subject: KVM: nVMX: Open a window for pending nested VMX preemption timer Add a kvm_x86_ops hook to detect a nested pending "hypervisor timer" and use it to 
effectively open a window for servicing the expired timer. Like pending SMIs on VMX, opening a window simply means requesting an immediate exit. This fixes a bug where an expired VMX preemption timer (for L2) will be delayed and/or lost if a pending exception is injected into L2. The pending exception is rightly prioritized by vmx_check_nested_events() and injected into L2, with the preemption timer left pending. Because no window opened, L2 is free to run uninterrupted. Fixes: f4124500c2c13 ("KVM: nVMX: Fully emulate preemption timer") Reported-by: Jim Mattson Cc: Oliver Upton Cc: Peter Shier Signed-off-by: Sean Christopherson Message-Id: <20200423022550.15113-3-sean.j.christopherson@intel.com> [Check it in kvm_vcpu_has_events too, to ensure that the preemption timer is serviced promptly even if the vCPU is halted and L1 is not intercepting HLT. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/vmx/nested.c | 10 ++++++++-- arch/x86/kvm/x86.c | 9 +++++++++ 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b3a5da27c2a5..e6671c61fd65 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1256,6 +1256,7 @@ struct kvm_x86_ops { struct kvm_x86_nested_ops { int (*check_events)(struct kvm_vcpu *vcpu); + bool (*hv_timer_pending)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, unsigned user_data_size); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index de678be3c92d..51220a14a633 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3687,6 +3687,12 @@ static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu) vcpu->arch.exception.payload); } +static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu) +{ + return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && + to_vmx(vcpu)->nested.preemption_timer_expired; +} + static int vmx_check_nested_events(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3742,8 +3748,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu) return 0; } - if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) && - vmx->nested.preemption_timer_expired) { + if (nested_vmx_preemption_timer_pending(vcpu)) { if (block_nested_events) return -EBUSY; nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0); @@ -6448,6 +6453,7 @@ __init int nested_vmx_hardware_setup(struct kvm_x86_ops *ops, struct kvm_x86_nested_ops vmx_nested_ops = { .check_events = vmx_check_nested_events, + .hv_timer_pending = nested_vmx_preemption_timer_pending, .get_state = vmx_get_nested_state, .set_state = vmx_set_nested_state, .get_vmcs12_pages = nested_get_vmcs12_pages, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 542a00008caa..e874182d113c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8355,6 +8355,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_x86_ops.enable_nmi_window(vcpu); if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) kvm_x86_ops.enable_irq_window(vcpu); + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + req_immediate_exit = true; WARN_ON(vcpu->arch.exception.pending); } @@ -10211,6 +10215,11 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_hv_has_stimer_pending(vcpu)) return true; + if (is_guest_mode(vcpu) && + 
kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + return true; + return false; } -- cgit v1.2.3 From 88c604b66eb6a74841cd40f5bdf639112ad69115 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Wed, 22 Apr 2020 19:25:41 -0700 Subject: KVM: x86: Make return for {interrupt_nmi,smi}_allowed() a bool instead of int Return an actual bool for kvm_x86_ops' {interrupt_nmi}_allowed() hook to better reflect the return semantics, and to avoid creating an even bigger mess when the related VMX code is refactored in upcoming patches. Signed-off-by: Sean Christopherson Message-Id: <20200423022550.15113-5-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +++--- arch/x86/kvm/svm/svm.c | 16 ++++++++-------- arch/x86/kvm/vmx/vmx.c | 14 +++++++------- 3 files changed, 18 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e6671c61fd65..44268642b3c6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1138,8 +1138,8 @@ struct kvm_x86_ops { void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); - int (*interrupt_allowed)(struct kvm_vcpu *vcpu); - int (*nmi_allowed)(struct kvm_vcpu *vcpu); + bool (*interrupt_allowed)(struct kvm_vcpu *vcpu); + bool (*nmi_allowed)(struct kvm_vcpu *vcpu); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); @@ -1237,7 +1237,7 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); - int (*smi_allowed)(struct kvm_vcpu *vcpu); + bool (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); int (*enable_smi_window)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index f97f29e1eb49..c6e5374db825 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3065,11 +3065,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) set_cr_intercept(svm, INTERCEPT_CR8_WRITE); } -static int svm_nmi_allowed(struct kvm_vcpu *vcpu) +static bool svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; - int ret; + bool ret; ret = !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && !(svm->vcpu.arch.hflags & HF_NMI_MASK); @@ -3098,14 +3098,14 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } -static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) +static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *vmcb = svm->vmcb; if (!gif_set(svm) || (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) - return 0; + return false; if (is_guest_mode(vcpu) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)) return !!(svm->vcpu.arch.hflags & HF_HIF_MASK); @@ -3767,23 +3767,23 @@ static void svm_setup_mce(struct kvm_vcpu *vcpu) vcpu->arch.mcg_cap &= 0x1ff; } -static int svm_smi_allowed(struct kvm_vcpu *vcpu) +static bool svm_smi_allowed(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); /* Per APM Vol.2 15.22.2 "Response to SMI" */ if (!gif_set(svm)) - return 0; + return false; if (is_guest_mode(&svm->vcpu) && svm->nested.intercept & (1ULL << INTERCEPT_SMI)) { /* TODO: Might need to set 
exit_info_1 and exit_info_2 here */ svm->vmcb->control.exit_code = SVM_EXIT_SMI; svm->nested.exit_required = true; - return 0; + return false; } - return 1; + return true; } static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 46aa3ca01929..4ca1f1f4ef1a 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4510,21 +4510,21 @@ void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) } } -static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) +static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu) { if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; + return false; if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked) - return 0; + return false; return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); } -static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu) { if (to_vmx(vcpu)->nested.nested_run_pending) return false; @@ -7675,12 +7675,12 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } -static int vmx_smi_allowed(struct kvm_vcpu *vcpu) +static bool vmx_smi_allowed(struct kvm_vcpu *vcpu) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) - return 0; - return 1; + return false; + return true; } static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) -- cgit v1.2.3 From c300ab9f08df9e4b9f39d53a0691e234330df124 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 23 Apr 2020 14:08:58 -0400 Subject: KVM: x86: Replace late check_nested_events() hack with more precise fix Add an argument to interrupt_allowed and nmi_allowed, to checking if interrupt injection is blocked. Use the hook to handle the case where an interrupt arrives between check_nested_events() and the injection logic. Drop the retry of check_nested_events() that hack-a-fixed the same condition. Blocking injection is also a bit of a hack, e.g. KVM should do exiting and non-exiting interrupt processing in a single pass, but it's a more precise hack. The old comment is also misleading, e.g. KVM_REQ_EVENT is purely an optimization, setting it on every run loop (which KVM doesn't do) should not affect functionality, only performance. Signed-off-by: Sean Christopherson Message-Id: <20200423022550.15113-13-sean.j.christopherson@intel.com> [Extend to SVM, add SMI and NMI. Even though NMI and SMI cannot come asynchronously right now, making the fix generic is easy and removes a special case. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 6 +++--- arch/x86/kvm/svm/svm.c | 25 ++++++++++++++++++++----- arch/x86/kvm/vmx/vmx.c | 17 ++++++++++++++--- arch/x86/kvm/x86.c | 36 ++++++++++++------------------------ 4 files changed, 49 insertions(+), 35 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 44268642b3c6..efe6199c596c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1138,8 +1138,8 @@ struct kvm_x86_ops { void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); - bool (*interrupt_allowed)(struct kvm_vcpu *vcpu); - bool (*nmi_allowed)(struct kvm_vcpu *vcpu); + bool (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + bool (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); @@ -1237,7 +1237,7 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); - bool (*smi_allowed)(struct kvm_vcpu *vcpu); + bool (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); int (*enable_smi_window)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index fba8bdcfed0e..45c6e4b87eee 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3083,13 +3083,17 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) return ret; } -static bool svm_nmi_allowed(struct kvm_vcpu *vcpu) +static bool svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) return false; - return !svm_nmi_blocked(vcpu); + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) + return false; + + return !svm_nmi_blocked(vcpu); } static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) @@ -3138,13 +3142,20 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); } -static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu) +static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) return false; - return !svm_interrupt_blocked(vcpu); + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. + */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) + return false; + + return !svm_interrupt_blocked(vcpu); } static void enable_irq_window(struct kvm_vcpu *vcpu) @@ -3812,12 +3823,16 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu) return is_smm(vcpu); } -static bool svm_smi_allowed(struct kvm_vcpu *vcpu) +static bool svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) return false; + /* An SMI must not be injected into L2 if it's supposed to VM-Exit. 
*/ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) + return false; + return !svm_smi_blocked(vcpu); } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 71cd210ec368..e1f5fc919fd9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4523,11 +4523,15 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } -static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu) +static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return false; + /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) + return false; + return !vmx_nmi_blocked(vcpu); } @@ -4541,11 +4545,18 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } -static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu) +static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) return false; + /* + * An IRQ must not be injected into L2 if it's supposed to VM-Exit, + * e.g. if the IRQ arrived asynchronously after checking nested events. + */ + if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return false; + return !vmx_interrupt_blocked(vcpu); } @@ -7688,7 +7699,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } -static bool vmx_smi_allowed(struct kvm_vcpu *vcpu) +static bool vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cb250f16bdf5..ec20368ac025 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7766,32 +7766,20 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) if (kvm_event_needs_reinjection(vcpu)) return 0; - if (vcpu->arch.smi_pending && kvm_x86_ops.smi_allowed(vcpu)) { + if (vcpu->arch.smi_pending && + kvm_x86_ops.smi_allowed(vcpu, true)) { vcpu->arch.smi_pending = false; ++vcpu->arch.smi_count; enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && kvm_x86_ops.nmi_allowed(vcpu)) { + } else if (vcpu->arch.nmi_pending && + kvm_x86_ops.nmi_allowed(vcpu, true)) { --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops.set_nmi(vcpu); - } else if (kvm_cpu_has_injectable_intr(vcpu)) { - /* - * Because interrupts can be injected asynchronously, we are - * calling check_nested_events again here to avoid a race condition. - * See https://lkml.org/lkml/2014/7/2/60 for discussion about this - * proposal and current concerns. Perhaps we should be setting - * KVM_REQ_EVENT only on certain events and not unconditionally? 
- */ - if (is_guest_mode(vcpu)) { - r = kvm_x86_ops.nested_ops->check_events(vcpu); - if (r != 0) - return r; - } - if (kvm_x86_ops.interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), - false); - kvm_x86_ops.set_irq(vcpu); - } + } else if (kvm_cpu_has_injectable_intr(vcpu) && + kvm_x86_ops.interrupt_allowed(vcpu, true)) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); + kvm_x86_ops.set_irq(vcpu); } return 0; @@ -10203,12 +10191,12 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (kvm_test_request(KVM_REQ_NMI, vcpu) || (vcpu->arch.nmi_pending && - kvm_x86_ops.nmi_allowed(vcpu))) + kvm_x86_ops.nmi_allowed(vcpu, false))) return true; if (kvm_test_request(KVM_REQ_SMI, vcpu) || (vcpu->arch.smi_pending && - kvm_x86_ops.smi_allowed(vcpu))) + kvm_x86_ops.smi_allowed(vcpu, false))) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -10260,7 +10248,7 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu) { - return kvm_x86_ops.interrupt_allowed(vcpu); + return kvm_x86_ops.interrupt_allowed(vcpu, false); } unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu) @@ -10425,7 +10413,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu) * If interrupts are off we cannot even use an artificial * halt state. */ - return kvm_x86_ops.interrupt_allowed(vcpu); + return kvm_arch_interrupt_allowed(vcpu); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 56ba77a459a72a7d95be74355a40a91e1f6dd7f7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2020 21:32:25 -0700 Subject: KVM: x86: Save L1 TSC offset in 'struct kvm_vcpu_arch' Save L1's TSC offset in 'struct kvm_vcpu_arch' and drop the kvm_x86_ops hook read_l1_tsc_offset(). This avoids a retpoline (when configured) when reading L1's effective TSC, which is done at least once on every VM-Exit. No functional change intended. 
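The resulting fast path is roughly the following, condensed from the x86.c hunks below (no names beyond what the patch itself adds):

	static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
	{
		vcpu->arch.l1_tsc_offset = offset;	/* cache L1's value */
		vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
	}

	u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
	{
		/* no vendor callback (and thus no retpoline) on this path any more */
		return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
	}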
Signed-off-by: Sean Christopherson Message-Id: <20200502043234.12481-2-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 11 ----------- arch/x86/kvm/vmx/vmx.c | 12 ------------ arch/x86/kvm/x86.c | 9 ++++----- 4 files changed, 5 insertions(+), 29 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index efe6199c596c..3ff671e35d76 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -708,6 +708,7 @@ struct kvm_vcpu_arch { struct gfn_to_pfn_cache cache; } st; + u64 l1_tsc_offset; u64 tsc_offset; u64 last_guest_tsc; u64 last_host_tsc; @@ -1165,7 +1166,6 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - u64 (*read_l1_tsc_offset)(struct kvm_vcpu *vcpu); /* Returns actual tsc_offset set in active VMCS */ u64 (*write_l1_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 83175933c0a1..18a63641b612 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -951,16 +951,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 svm_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - if (is_guest_mode(vcpu)) - return svm->nested.hsave->control.tsc_offset; - - return vcpu->arch.tsc_offset; -} - static u64 svm_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -4075,7 +4065,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .read_l1_tsc_offset = svm_read_l1_tsc_offset, .write_l1_tsc_offset = svm_write_l1_tsc_offset, .load_mmu_pgd = svm_load_mmu_pgd, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d044a05a38bd..31a8d04a6c41 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1740,17 +1740,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) vmx_update_msr_bitmap(&vmx->vcpu); } -static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu) -{ - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (is_guest_mode(vcpu) && - (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)) - return vcpu->arch.tsc_offset - vmcs12->tsc_offset; - - return vcpu->arch.tsc_offset; -} - static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vmcs12 *vmcs12 = get_vmcs12(vcpu); @@ -7881,7 +7870,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .read_l1_tsc_offset = vmx_read_l1_tsc_offset, .write_l1_tsc_offset = vmx_write_l1_tsc_offset, .load_mmu_pgd = vmx_load_mmu_pgd, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec20368ac025..3622586a26a7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1910,7 +1910,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) { - u64 curr_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); + u64 curr_offset = vcpu->arch.l1_tsc_offset; vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } @@ -1952,14 +1952,13 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { - u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); - - return tsc_offset + kvm_scale_tsc(vcpu, host_tsc); + return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc); } EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); 
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { + vcpu->arch.l1_tsc_offset = offset; vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset); } @@ -2084,7 +2083,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { - u64 tsc_offset = kvm_x86_ops.read_l1_tsc_offset(vcpu); + u64 tsc_offset = vcpu->arch.l1_tsc_offset; kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment); } -- cgit v1.2.3 From f98c1e77127de7d9ff558570c25d02ef077df50f Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2020 21:32:30 -0700 Subject: KVM: VMX: Add proper cache tracking for CR4 Move CR4 caching into the standard register caching mechanism in order to take advantage of the availability checks provided by regs_avail. This avoids multiple VMREADs and retpolines (when configured) during nested VMX transitions as kvm_read_cr4_bits() is invoked multiple times on each transition, e.g. when stuffing CR0 and CR3. As an added bonus, this eliminates a kvm_x86_ops hook, saves a retpoline on SVM when reading CR4, and squashes the confusing naming discrepancy of "cache_reg" vs. "decache_cr4_guest_bits". No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200502043234.12481-7-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/kvm_cache_regs.h | 5 +++-- arch/x86/kvm/svm/svm.c | 5 ----- arch/x86/kvm/vmx/vmx.c | 18 +++++++++--------- arch/x86/kvm/vmx/vmx.h | 1 + 5 files changed, 14 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3ff671e35d76..3e2f582160fe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -168,6 +168,7 @@ enum kvm_reg { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, + VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, VCPU_EXREG_EXIT_INFO_1, @@ -1092,7 +1093,6 @@ struct kvm_x86_ops { struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); - void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 62558b9bdda7..921a539bcb96 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -129,8 +129,9 @@ static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu) static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS; - if (tmask & vcpu->arch.cr4_guest_owned_bits) - kvm_x86_ops.decache_cr4_guest_bits(vcpu); + if ((tmask & vcpu->arch.cr4_guest_owned_bits) && + !kvm_register_is_available(vcpu, VCPU_EXREG_CR4)) + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR4); return vcpu->arch.cr4 & mask; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 18a63641b612..805297e9a1cb 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1525,10 +1525,6 @@ static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) { } -static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ -} - static void update_cr0_intercept(struct vcpu_svm *svm) { ulong gcr0 = svm->vcpu.arch.cr0; @@ -4007,7 +4003,6 @@ static struct kvm_x86_ops 
svm_x86_ops __initdata = { .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, - .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, .set_cr0 = svm_set_cr0, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 31a8d04a6c41..83e3fe083679 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2204,6 +2204,8 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { + unsigned long guest_owned_bits; + kvm_register_mark_available(vcpu, reg); switch (reg) { @@ -2221,6 +2223,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; + case VCPU_EXREG_CR4: + guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; + + vcpu->arch.cr4 &= ~guest_owned_bits; + vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; + break; default: WARN_ON_ONCE(1); break; @@ -2922,14 +2930,6 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; } -static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; - - vcpu->arch.cr4 &= ~cr4_guest_owned_bits; - vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits; -} - static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -3128,6 +3128,7 @@ int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; vcpu->arch.cr4 = cr4; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR4); if (!enable_unrestricted_guest) { if (enable_ept) { @@ -7809,7 +7810,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, - .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 373674d455e1..04bb557acdd2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -453,6 +453,7 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3) + | (1 << VCPU_EXREG_CR4) | (1 << VCPU_EXREG_EXIT_INFO_1) | (1 << VCPU_EXREG_EXIT_INFO_2)); vcpu->arch.regs_dirty = 0; -- cgit v1.2.3 From bd31fe495d0d1a67fe6f44f06dfef637f202241d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2020 21:32:31 -0700 Subject: KVM: VMX: Add proper cache tracking for CR0 Move CR0 caching into the standard register caching mechanism in order to take advantage of the availability checks provided by regs_avail. This avoids multiple VMREADs in the (uncommon) case where kvm_read_cr0() is called multiple times in a single VM-Exit, and more importantly eliminates a kvm_x86_ops hook, saves a retpoline on SVM when reading CR0, and squashes the confusing naming discrepancy of "cache_reg" vs. "decache_cr0_guest_bits". No functional change intended. 
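With CR0 folded into the register cache, the read side becomes the following (a condensed view of the kvm_cache_regs.h hunk below):

	static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
	{
		ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;

		/* Only call into vendor code when the cached value is stale. */
		if ((tmask & vcpu->arch.cr0_guest_owned_bits) &&
		    !kvm_register_is_available(vcpu, VCPU_EXREG_CR0))
			kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0);

		return vcpu->arch.cr0 & mask;
	}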
Signed-off-by: Sean Christopherson Message-Id: <20200502043234.12481-8-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/kvm_cache_regs.h | 5 +++-- arch/x86/kvm/svm/svm.c | 5 ----- arch/x86/kvm/vmx/vmx.c | 16 +++++++--------- arch/x86/kvm/vmx/vmx.h | 1 + 5 files changed, 12 insertions(+), 17 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3e2f582160fe..ad882127079f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -167,6 +167,7 @@ enum kvm_reg { NR_VCPU_REGS, VCPU_EXREG_PDPTR = NR_VCPU_REGS, + VCPU_EXREG_CR0, VCPU_EXREG_CR3, VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, @@ -1092,7 +1093,6 @@ struct kvm_x86_ops { void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); - void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); int (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 921a539bcb96..ff2d0e9ca3bc 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -116,8 +116,9 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; - if (tmask & vcpu->arch.cr0_guest_owned_bits) - kvm_x86_ops.decache_cr0_guest_bits(vcpu); + if ((tmask & vcpu->arch.cr0_guest_owned_bits) && + !kvm_register_is_available(vcpu, VCPU_EXREG_CR0)) + kvm_x86_ops.cache_reg(vcpu, VCPU_EXREG_CR0); return vcpu->arch.cr0 & mask; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 805297e9a1cb..9e24b5b6fae1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1521,10 +1521,6 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) mark_dirty(svm->vmcb, VMCB_DT); } -static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ -} - static void update_cr0_intercept(struct vcpu_svm *svm) { ulong gcr0 = svm->vcpu.arch.cr0; @@ -4002,7 +3998,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_segment = svm_set_segment, .get_cpl = svm_get_cpl, .get_cs_db_l_bits = kvm_get_cs_db_l_bits, - .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, .set_cr0 = svm_set_cr0, .set_cr4 = svm_set_cr4, .set_efer = svm_set_efer, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 83e3fe083679..a7dca7e305b4 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -2219,6 +2219,12 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) if (enable_ept) ept_save_pdptrs(vcpu); break; + case VCPU_EXREG_CR0: + guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; + + vcpu->arch.cr0 &= ~guest_owned_bits; + vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; + break; case VCPU_EXREG_CR3: if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu))) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); @@ -2922,14 +2928,6 @@ static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu) vpid_sync_context(to_vmx(vcpu)->vpid); } -static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) -{ - ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits; - - vcpu->arch.cr0 &= ~cr0_guest_owned_bits; - vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 
-} - static void ept_load_pdptrs(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.walk_mmu; @@ -3019,6 +3017,7 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; + kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); @@ -7809,7 +7808,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .set_segment = vmx_set_segment, .get_cpl = vmx_get_cpl, .get_cs_db_l_bits = vmx_get_cs_db_l_bits, - .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, .set_cr0 = vmx_set_cr0, .set_cr4 = vmx_set_cr4, .set_efer = vmx_set_efer, diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 04bb557acdd2..298ddef79d00 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -452,6 +452,7 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) | (1 << VCPU_EXREG_RFLAGS) | (1 << VCPU_EXREG_PDPTR) | (1 << VCPU_EXREG_SEGMENTS) + | (1 << VCPU_EXREG_CR0) | (1 << VCPU_EXREG_CR3) | (1 << VCPU_EXREG_CR4) | (1 << VCPU_EXREG_EXIT_INFO_1) -- cgit v1.2.3 From e93fd3b3e89e9664039281fe7e56e6f764f2a909 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 May 2020 21:32:34 -0700 Subject: KVM: x86/mmu: Capture TDP level when updating CPUID Snapshot the TDP level now that it's invariant (SVM) or dependent only on host capabilities and guest CPUID (VMX). This avoids having to call kvm_x86_ops.get_tdp_level() when initializing a TDP MMU and/or calculating the page role, and thus avoids the associated retpoline. Drop the WARN in vmx_get_tdp_level() as updating CPUID while L2 is active is legal, if dodgy. No functional change intended. Signed-off-by: Sean Christopherson Message-Id: <20200502043234.12481-11-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 -- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ad882127079f..505cf19ace80 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -687,6 +687,7 @@ struct kvm_vcpu_arch { struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES]; int maxphyaddr; + int tdp_level; /* emulate context */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 6828be99b908..44dfaefdad0e 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -124,8 +124,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) MSR_IA32_MISC_ENABLE_MWAIT); } - /* Update physical-address width */ + /* Note, maxphyaddr must be updated before tdp_level. 
*/ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); + vcpu->arch.tdp_level = kvm_x86_ops.get_tdp_level(vcpu); kvm_mmu_reset_context(vcpu); kvm_pmu_refresh(vcpu); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index e618472c572b..10cb8db54cd0 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4894,7 +4894,7 @@ kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu, bool base_only) union kvm_mmu_role role = kvm_calc_mmu_role_common(vcpu, base_only); role.base.ad_disabled = (shadow_accessed_mask == 0); - role.base.level = kvm_x86_ops.get_tdp_level(vcpu); + role.base.level = vcpu->arch.tdp_level; role.base.direct = true; role.base.gpte_is_8_bytes = true; @@ -4915,7 +4915,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->sync_page = nonpaging_sync_page; context->invlpg = NULL; context->update_pte = nonpaging_update_pte; - context->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); + context->shadow_root_level = vcpu->arch.tdp_level; context->direct_map = true; context->get_guest_pgd = get_cr3; context->get_pdptr = kvm_pdptr_read; @@ -5680,7 +5680,7 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) * SVM's 32-bit NPT support, TDP paging doesn't use PAE paging and can * skip allocating the PDP table. */ - if (tdp_enabled && kvm_x86_ops.get_tdp_level(vcpu) > PT32E_ROOT_LEVEL) + if (tdp_enabled && vcpu->arch.tdp_level > PT32E_ROOT_LEVEL) return 0; page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_DMA32); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 712db507c819..a89a166d1cb8 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -86,7 +86,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu->get_guest_pgd = nested_svm_get_tdp_cr3; vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit; - vcpu->arch.mmu->shadow_root_level = kvm_x86_ops.get_tdp_level(vcpu); + vcpu->arch.mmu->shadow_root_level = vcpu->arch.tdp_level; reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; } diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7a43fbe05e2d..93b2a708b1da 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3025,8 +3025,6 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) static int vmx_get_tdp_level(struct kvm_vcpu *vcpu) { - WARN_ON(is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu))); - if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48)) return 5; return 4; -- cgit v1.2.3 From c95473e175dd1234b7440daa6eb2670ebf529653 Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:41 -0700 Subject: x86/fpu/xstate: Update copy_kernel_to_xregs_err() for supervisor states The function copy_kernel_to_xregs_err() uses XRSTOR which can work with standard or compacted format without supervisor xstates. However, when supervisor xstates are present, XRSTORS must be used. Fix it by using XRSTORS when supervisor state handling is enabled. I also considered if there were additional cases where XRSTOR might be mistakenly called instead of XRSTORS. There are only three XRSTOR sites in the kernel: 1. copy_kernel_to_xregs_booting(), already switches between XRSTOR and XRSTORS based on X86_FEATURE_XSAVES. 2. copy_user_to_xregs(), which *needs* XRSTOR because it is copying from userspace and must never copy supervisor state with XRSTORS. 3. copy_kernel_to_xregs_err() mistakenly used XRSTOR only. Fix it. 
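The fix itself is a small switch in copy_kernel_to_xregs_err(); the full hunk follows below:

	if (static_cpu_has(X86_FEATURE_XSAVES))
		/* supervisor states may be present: XRSTORS is required */
		XSTATE_OP(XRSTORS, xstate, lmask, hmask, err);
	else
		XSTATE_OP(XRSTOR, xstate, lmask, hmask, err);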
[ bp: Massage commit message. ] Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Reviewed-by: Dave Hansen Link: https://lkml.kernel.org/r/20200512145444.15483-8-yu-cheng.yu@intel.com --- arch/x86/include/asm/fpu/internal.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index a42fcb4b690d..42159f45bf9c 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -400,7 +400,10 @@ static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) u32 hmask = mask >> 32; int err; - XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + if (static_cpu_has(X86_FEATURE_XSAVES)) + XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); + else + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); return err; } -- cgit v1.2.3 From a9a3ed1eff3601b63aea4fb462d8b3b92c7c1e7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 22 Apr 2020 18:11:30 +0200 Subject: x86: Fix early boot crash on gcc-10, third try MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... or the odyssey of trying to disable the stack protector for the function which generates the stack canary value. The whole story started with Sergei reporting a boot crash with a kernel built with gcc-10: Kernel panic — not syncing: stack-protector: Kernel stack is corrupted in: start_secondary CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.6.0-rc5—00235—gfffb08b37df9 #139 Hardware name: Gigabyte Technology Co., Ltd. To be filled by O.E.M./H77M—D3H, BIOS F12 11/14/2013 Call Trace: dump_stack panic ? start_secondary __stack_chk_fail start_secondary secondary_startup_64 -—-[ end Kernel panic — not syncing: stack—protector: Kernel stack is corrupted in: start_secondary This happens because gcc-10 tail-call optimizes the last function call in start_secondary() - cpu_startup_entry() - and thus emits a stack canary check which fails because the canary value changes after the boot_init_stack_canary() call. To fix that, the initial attempt was to mark the one function which generates the stack canary with: __attribute__((optimize("-fno-stack-protector"))) ... start_secondary(void *unused) however, using the optimize attribute doesn't work cumulatively as the attribute does not add to but rather replaces previously supplied optimization options - roughly all -fxxx options. The key one among them being -fno-omit-frame-pointer and thus leading to not present frame pointer - frame pointer which the kernel needs. The next attempt to prevent compilers from tail-call optimizing the last function call cpu_startup_entry(), shy of carving out start_secondary() into a separate compilation unit and building it with -fno-stack-protector, was to add an empty asm(""). This current solution was short and sweet, and reportedly, is supported by both compilers but we didn't get very far this time: future (LTO?) optimization passes could potentially eliminate this, which leads us to the third attempt: having an actual memory barrier there which the compiler cannot ignore or move around etc. That should hold for a long time, but hey we said that about the other two solutions too so... 
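[ Illustration only, not part of the patch: a minimal sketch of how the
  tail call trips the canary check. bringup() is a made-up stand-in for
  start_secondary()/cpu_bringup_and_idle(); the helpers it calls are the
  real ones touched below. ]

  static void bringup(void)
  {
        /* prologue: the compiler stashes the current canary on the stack */
        boot_init_stack_canary();                /* per-CPU canary changes */
        cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); /* never returns          */
        /*
         * If gcc-10 turns the call above into a tail call ("jmp"), it has
         * to emit the epilogue canary check before the jump; the stale
         * copy saved in the prologue no longer matches the per-CPU value
         * and the check panics. The barrier forces a real call, so no
         * check runs on this path.
         */
        prevent_tail_call_optimization();
  }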
Reported-by: Sergei Trofimovich Signed-off-by: Borislav Petkov Tested-by: Kalle Valo Cc: Link: https://lkml.kernel.org/r/20200314164451.346497-1-slyfox@gentoo.org --- arch/x86/include/asm/stackprotector.h | 7 ++++++- arch/x86/kernel/smpboot.c | 8 ++++++++ arch/x86/xen/smp_pv.c | 1 + include/linux/compiler.h | 6 ++++++ init/main.c | 2 ++ 5 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 91e29b6a86a5..9804a7957f4e 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -55,8 +55,13 @@ /* * Initialize the stackprotector canary value. * - * NOTE: this must only be called from functions that never return, + * NOTE: this must only be called from functions that never return * and it must always be inlined. + * + * In addition, it should be called from a compilation unit for which + * stack protector is disabled. Alternatively, the caller should not end + * with a function call which gets tail-call optimized as that would + * lead to checking a modified canary value. */ static __always_inline void boot_init_stack_canary(void) { diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c89e4d9ad28..2f24c334a938 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -266,6 +266,14 @@ static void notrace start_secondary(void *unused) wmb(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + + /* + * Prevent tail call to cpu_startup_entry() because the stack protector + * guard has been changed a couple of function calls up, in + * boot_init_stack_canary() and must not be checked before tail calling + * another function. + */ + prevent_tail_call_optimization(); } /** diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 8fb8a50a28b4..f2adb63b2d7c 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -93,6 +93,7 @@ asmlinkage __visible void cpu_bringup_and_idle(void) cpu_bringup(); boot_init_stack_canary(); cpu_startup_entry(CPUHP_AP_ONLINE_IDLE); + prevent_tail_call_optimization(); } void xen_smp_intr_free_pv(unsigned int cpu) diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 034b0a644efc..448c91bf543b 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -356,4 +356,10 @@ static inline void *offset_to_ptr(const int *off) /* &a[0] degrades to a pointer: a different type from an array */ #define __must_be_array(a) BUILD_BUG_ON_ZERO(__same_type((a), &(a)[0])) +/* + * This is needed in functions which generate the stack canary, see + * arch/x86/kernel/smpboot.c::start_secondary() for an example. + */ +#define prevent_tail_call_optimization() mb() + #endif /* __LINUX_COMPILER_H */ diff --git a/init/main.c b/init/main.c index 1a5da2c2660c..ad3812b5ae65 100644 --- a/init/main.c +++ b/init/main.c @@ -1036,6 +1036,8 @@ asmlinkage __visible void __init start_kernel(void) /* Do the rest non-__init'ed, we're now alive */ arch_call_rest_init(); + + prevent_tail_call_optimization(); } /* Call all constructor functions linked into the kernel. 
*/ -- cgit v1.2.3 From 68cda40d9f3c4cb880108eb22f974d9e3d5dc6c5 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 May 2020 15:05:29 -0700 Subject: KVM: nVMX: Tweak handling of failure code for nested VM-Enter failure Use an enum for passing around the failure code for a failed VM-Enter that results in VM-Exit to provide a level of indirection from the final resting place of the failure code, vmcs.EXIT_QUALIFICATION. The exit qualification field is an unsigned long, e.g. passing around 'u32 exit_qual' throws up red flags as it suggests KVM may be dropping bits when reporting errors to L1. This is a red herring because the only defined failure codes are 0, 2, 3, and 4, i.e. don't come remotely close to overflowing a u32. Setting vmcs.EXIT_QUALIFICATION on entry failure is further complicated by the MSR load list, which returns the (1-based) entry that failed, and the number of MSRs to load is a 32-bit VMCS field. At first blush, it would appear that overflowing a u32 is possible, but the number of MSRs that can be loaded is hardcapped at 4096 (limited by MSR_IA32_VMX_MISC). In other words, there are two completely disparate types of data that eventually get stuffed into vmcs.EXIT_QUALIFICATION, neither of which is an 'unsigned long' in nature. This was presumably the reasoning for switching to 'u32' when the related code was refactored in commit ca0bde28f2ed6 ("kvm: nVMX: Split VMCS checks from nested_vmx_run()"). Using an enum for the failure code addresses the technically-possible- but-will-never-happen scenario where Intel defines a failure code that doesn't fit in a 32-bit integer. The enum variables and values will either be automatically sized (gcc 5.4 behavior) or be subjected to some combination of truncation. The former case will simply work, while the latter will trigger a compile-time warning unless the compiler is being particularly unhelpful. Separating the failure code from the failed MSR entry allows for disassociating both from vmcs.EXIT_QUALIFICATION, which avoids the conundrum where KVM has to choose between 'u32 exit_qual' and tracking values as 'unsigned long' that have no business being tracked as such. To cement the split, set vmcs12->exit_qualification directly from the entry error code or failed MSR index instead of bouncing through a local variable. Opportunistically rename the variables in load_vmcs12_host_state() and vmx_set_nested_state() to call out that they're ignored, set exit_reason on demand on nested VM-Enter failure, and add a comment in nested_vmx_load_msr() to call out that returning 'i + 1' can't wrap. No functional change intended. 
Reported-by: Vitaly Kuznetsov Cc: Jim Mattson Signed-off-by: Sean Christopherson Message-Id: <20200511220529.11402-1-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 10 ++++++---- arch/x86/kvm/vmx/nested.c | 47 +++++++++++++++++++++++++++------------------- 2 files changed, 34 insertions(+), 23 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 5e090d1f03f8..cd7de4b401fe 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -527,10 +527,12 @@ struct vmx_msr_entry { /* * Exit Qualifications for entry failure during or after loading guest state */ -#define ENTRY_FAIL_DEFAULT 0 -#define ENTRY_FAIL_PDPTE 2 -#define ENTRY_FAIL_NMI 3 -#define ENTRY_FAIL_VMCS_LINK_PTR 4 +enum vm_entry_failure_code { + ENTRY_FAIL_DEFAULT = 0, + ENTRY_FAIL_PDPTE = 2, + ENTRY_FAIL_NMI = 3, + ENTRY_FAIL_VMCS_LINK_PTR = 4, +}; /* * Exit Qualifications for EPT Violations diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 90ba278dd572..b2cd8d399644 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -922,6 +922,7 @@ static u32 nested_vmx_load_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) } return 0; fail: + /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */ return i + 1; } @@ -1117,7 +1118,7 @@ static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu) * @entry_failure_code. */ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, - u32 *entry_failure_code) + enum vm_entry_failure_code *entry_failure_code) { if (CC(!nested_cr3_valid(vcpu, cr3))) { *entry_failure_code = ENTRY_FAIL_DEFAULT; @@ -2469,7 +2470,7 @@ static void prepare_vmcs02_rare(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * is assigned to entry_failure_code on failure. 
*/ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *entry_failure_code) + enum vm_entry_failure_code *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs; @@ -2929,11 +2930,11 @@ static int nested_check_guest_non_reg_state(struct vmcs12 *vmcs12) static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - u32 *exit_qual) + enum vm_entry_failure_code *entry_failure_code) { bool ia32e; - *exit_qual = ENTRY_FAIL_DEFAULT; + *entry_failure_code = ENTRY_FAIL_DEFAULT; if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) || CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))) @@ -2948,7 +2949,7 @@ static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu, return -EINVAL; if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) { - *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR; + *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR; return -EINVAL; } @@ -3240,9 +3241,9 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + enum vm_entry_failure_code entry_failure_code; bool evaluate_pending_interrupts; - u32 exit_reason = EXIT_REASON_INVALID_STATE; - u32 exit_qual; + u32 exit_reason, failed_index; if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) kvm_vcpu_flush_tlb_current(vcpu); @@ -3290,24 +3291,33 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, return NVMX_VMENTRY_VMFAIL; } - if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) + if (nested_vmx_check_guest_state(vcpu, vmcs12, + &entry_failure_code)) { + exit_reason = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit; + } } enter_guest_mode(vcpu); if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) vcpu->arch.tsc_offset += vmcs12->tsc_offset; - if (prepare_vmcs02(vcpu, vmcs12, &exit_qual)) + if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) { + exit_reason = EXIT_REASON_INVALID_STATE; + vmcs12->exit_qualification = entry_failure_code; goto vmentry_fail_vmexit_guest_mode; + } if (from_vmentry) { - exit_reason = EXIT_REASON_MSR_LOAD_FAIL; - exit_qual = nested_vmx_load_msr(vcpu, - vmcs12->vm_entry_msr_load_addr, - vmcs12->vm_entry_msr_load_count); - if (exit_qual) + failed_index = nested_vmx_load_msr(vcpu, + vmcs12->vm_entry_msr_load_addr, + vmcs12->vm_entry_msr_load_count); + if (failed_index) { + exit_reason = EXIT_REASON_MSR_LOAD_FAIL; + vmcs12->exit_qualification = failed_index; goto vmentry_fail_vmexit_guest_mode; + } } else { /* * The MMU is not initialized to point at the right entities yet and @@ -3371,7 +3381,6 @@ vmentry_fail_vmexit: load_vmcs12_host_state(vcpu, vmcs12); vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY; - vmcs12->exit_qualification = exit_qual; if (enable_shadow_vmcs || vmx->nested.hv_evmcs) vmx->nested.need_vmcs12_to_shadow_sync = true; return NVMX_VMENTRY_VMEXIT; @@ -4065,8 +4074,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + enum vm_entry_failure_code ignored; struct kvm_segment seg; - u32 entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -4101,7 +4110,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * Only PDPTE load can fail as the value of cr3 was checked on 
entry and * couldn't have changed. */ - if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) @@ -6001,7 +6010,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12; - u32 exit_qual; + enum vm_entry_failure_code ignored; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; int ret; @@ -6142,7 +6151,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, if (nested_vmx_check_controls(vcpu, vmcs12) || nested_vmx_check_host_state(vcpu, vmcs12) || - nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual)) + nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) goto error_guest_mode; vmx->nested.dirty_vmcs12 = true; -- cgit v1.2.3 From a71936ab46f1da1539d97a98dfb2f94ee383d687 Mon Sep 17 00:00:00 2001 From: Xiaoyao Li Date: Wed, 29 Apr 2020 23:43:12 +0800 Subject: kvm: x86: Cleanup vcpu->arch.guest_xstate_size vcpu->arch.guest_xstate_size lost its only user since commit df1daba7d1cb ("KVM: x86: support XSAVES usage in the host"), so clean it up. Signed-off-by: Xiaoyao Li Message-Id: <20200429154312.1411-1-xiaoyao.li@intel.com> Reviewed-by: Vitaly Kuznetsov Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/cpuid.c | 4 +--- arch/x86/kvm/x86.c | 2 -- 3 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 505cf19ace80..86295a01f5ca 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -657,7 +657,6 @@ struct kvm_vcpu_arch { u64 xcr0; u64 guest_supported_xcr0; - u32 guest_xstate_size; struct kvm_pio_request pio; void *pio_data; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 44dfaefdad0e..cd708b0b460a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -86,12 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0xD, 0); if (!best) { vcpu->arch.guest_supported_xcr0 = 0; - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; } else { vcpu->arch.guest_supported_xcr0 = (best->eax | ((u64)best->edx << 32)) & supported_xcr0; - vcpu->arch.guest_xstate_size = best->ebx = - xstate_required_size(vcpu->arch.xcr0, false); + best->ebx = xstate_required_size(vcpu->arch.xcr0, false); } best = kvm_find_cpuid_entry(vcpu, 0xD, 1); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3622586a26a7..6b958d6c9427 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9387,8 +9387,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) } fx_init(vcpu); - vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET; - vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; -- cgit v1.2.3 From e662ec3e0705cfee5b36aecd62adbc36df85eab3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 Apr 2020 17:54:21 -0700 Subject: KVM: x86/mmu: Move max hugepage level to a separate #define Rename PT_MAX_HUGEPAGE_LEVEL to KVM_MAX_HUGEPAGE_LEVEL and make it a separate define in anticipation of dropping KVM's PT_*_LEVEL enums in favor of the kernel's PG_LEVEL_* enums. No functional change intended. 
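[ Worked example, not new code: what the levels mean in bytes, assuming
  PAGE_SHIFT == 12 as on x86. The macro definitions are copied from the
  kvm_host.h context below. ]

  #define KVM_HPAGE_GFN_SHIFT(x)  (((x) - 1) * 9)
  #define KVM_HPAGE_SHIFT(x)      (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
  #define KVM_HPAGE_SIZE(x)       (1UL << KVM_HPAGE_SHIFT(x))

  /*
   * level 1:  1UL << 12  ==  4 KiB
   * level 2:  1UL << 21  ==  2 MiB
   * level 3:  1UL << 30  ==  1 GiB
   *
   * so KVM_MAX_HUGEPAGE_LEVEL == 3 caps KVM at 1 GiB mappings, and
   * KVM_NR_PAGE_SIZES works out to 3.
   */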
Signed-off-by: Sean Christopherson Message-Id: <20200428005422.4235-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 5 ++--- arch/x86/kvm/mmu/mmu.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 86295a01f5ca..77a97a72bb3e 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -114,10 +114,9 @@ enum { PT_PAGE_TABLE_LEVEL = 1, PT_DIRECTORY_LEVEL = 2, PT_PDPE_LEVEL = 3, - /* set max level to the biggest one */ - PT_MAX_HUGEPAGE_LEVEL = PT_PDPE_LEVEL, }; -#define KVM_NR_PAGE_SIZES (PT_MAX_HUGEPAGE_LEVEL - \ +#define KVM_MAX_HUGEPAGE_LEVEL PT_PDPE_LEVEL +#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - \ PT_PAGE_TABLE_LEVEL + 1) #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 10cb8db54cd0..7a712e903e93 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1199,7 +1199,7 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, struct kvm_lpage_info *linfo; int i; - for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PT_DIRECTORY_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->disallow_lpage += count; WARN_ON(linfo->disallow_lpage < 0); @@ -1763,7 +1763,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int i; bool write_protected = false; - for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } @@ -1952,7 +1952,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, + KVM_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, &iterator) ret |= handler(kvm, iterator.rmap, memslot, @@ -4214,7 +4214,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, { int max_level; - for (max_level = PT_MAX_HUGEPAGE_LEVEL; + for (max_level = KVM_MAX_HUGEPAGE_LEVEL; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { int page_num = KVM_PAGES_PER_HPAGE(max_level); @@ -5641,7 +5641,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } static __always_inline bool @@ -5649,7 +5649,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, - PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); + KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } static __always_inline bool @@ -5867,7 +5867,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + PT_PAGE_TABLE_LEVEL, + KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true); } } @@ -5889,7 +5890,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, spin_lock(&kvm->mmu_lock); flush = slot_handle_level(kvm, memslot, 
slot_rmap_write_protect, - start_level, PT_MAX_HUGEPAGE_LEVEL, false); + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); spin_unlock(&kvm->mmu_lock); /* -- cgit v1.2.3 From 3bae0459bcd559506a2ca5807040ff722de5b136 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 27 Apr 2020 17:54:22 -0700 Subject: KVM: x86/mmu: Drop KVM's hugepage enums in favor of the kernel's enums Replace KVM's PT_PAGE_TABLE_LEVEL, PT_DIRECTORY_LEVEL and PT_PDPE_LEVEL with the kernel's PG_LEVEL_4K, PG_LEVEL_2M and PG_LEVEL_1G. KVM's enums are borderline impossible to remember and result in code that is visually difficult to audit, e.g. if (!enable_ept) ept_lpage_level = 0; else if (cpu_has_vmx_ept_1g_page()) ept_lpage_level = PT_PDPE_LEVEL; else if (cpu_has_vmx_ept_2m_page()) ept_lpage_level = PT_DIRECTORY_LEVEL; else ept_lpage_level = PT_PAGE_TABLE_LEVEL; versus if (!enable_ept) ept_lpage_level = 0; else if (cpu_has_vmx_ept_1g_page()) ept_lpage_level = PG_LEVEL_1G; else if (cpu_has_vmx_ept_2m_page()) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; No functional change intended. Suggested-by: Barret Rhoden Signed-off-by: Sean Christopherson Message-Id: <20200428005422.4235-4-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 12 ++--- arch/x86/kvm/mmu/mmu.c | 107 +++++++++++++++++++--------------------- arch/x86/kvm/mmu/page_track.c | 4 +- arch/x86/kvm/mmu/paging_tmpl.h | 18 +++---- arch/x86/kvm/mmu_audit.c | 6 +-- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 6 +-- arch/x86/kvm/x86.c | 4 +- 8 files changed, 73 insertions(+), 86 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 77a97a72bb3e..b8d035694b87 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -110,14 +110,8 @@ #define UNMAPPED_GVA (~(gpa_t)0) /* KVM Hugepage definitions for x86 */ -enum { - PT_PAGE_TABLE_LEVEL = 1, - PT_DIRECTORY_LEVEL = 2, - PT_PDPE_LEVEL = 3, -}; -#define KVM_MAX_HUGEPAGE_LEVEL PT_PDPE_LEVEL -#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - \ - PT_PAGE_TABLE_LEVEL + 1) +#define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G +#define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1) #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) @@ -126,7 +120,7 @@ enum { static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) { - /* KVM_HPAGE_GFN_SHIFT(PT_PAGE_TABLE_LEVEL) must be 0. */ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. 
*/ return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 7a712e903e93..fd94668c0f0f 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -623,7 +623,7 @@ static int is_large_pte(u64 pte) static int is_last_spte(u64 pte, int level) { - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) return 1; if (is_large_pte(pte)) return 1; @@ -1199,7 +1199,7 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, struct kvm_lpage_info *linfo; int i; - for (i = PT_DIRECTORY_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->disallow_lpage += count; WARN_ON(linfo->disallow_lpage < 0); @@ -1228,7 +1228,7 @@ static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) slot = __gfn_to_memslot(slots, gfn); /* the non-leaf shadow pages are keeping readonly. */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); @@ -1256,7 +1256,7 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) gfn = sp->gfn; slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, gfn); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE); @@ -1401,7 +1401,7 @@ static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, unsigned long idx; idx = gfn_to_index(gfn, slot->base_gfn, level); - return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx]; + return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, @@ -1532,8 +1532,7 @@ static void drop_spte(struct kvm *kvm, u64 *sptep) static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) { if (is_large_pte(*sptep)) { - WARN_ON(page_header(__pa(sptep))->role.level == - PT_PAGE_TABLE_LEVEL); + WARN_ON(page_header(__pa(sptep))->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); --kvm->stat.lpages; return true; @@ -1685,7 +1684,7 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1711,7 +1710,7 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, while (mask) { rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PT_PAGE_TABLE_LEVEL, slot); + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head); /* clear the first set bit */ @@ -1763,7 +1762,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, int i; bool write_protected = false; - for (i = PT_PAGE_TABLE_LEVEL; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { + for (i = PG_LEVEL_4K; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { rmap_head = __gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } @@ -1951,7 +1950,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_start = hva_to_gfn_memslot(hva_start, memslot); gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + for_each_slot_rmap_range(memslot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, gfn_start, gfn_end - 1, &iterator) @@ -2346,7 +2345,7 @@ static bool kvm_sync_pages(struct kvm_vcpu *vcpu, 
gfn_t gfn, if (!s->unsync) continue; - WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(s->role.level != PG_LEVEL_4K); ret |= kvm_sync_page(vcpu, s, invalid_list); } @@ -2375,7 +2374,7 @@ static int mmu_pages_next(struct kvm_mmu_pages *pvec, int level = sp->role.level; parents->idx[level-1] = idx; - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) break; parents->parent[level-2] = sp; @@ -2397,7 +2396,7 @@ static int mmu_pages_first(struct kvm_mmu_pages *pvec, sp = pvec->page[0].sp; level = sp->role.level; - WARN_ON(level == PT_PAGE_TABLE_LEVEL); + WARN_ON(level == PG_LEVEL_4K); parents->parent[level-2] = sp; @@ -2545,11 +2544,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, * be inconsistent with guest page table. */ account_shadowed(vcpu->kvm, sp); - if (level == PT_PAGE_TABLE_LEVEL && - rmap_write_protect(vcpu, gfn)) + if (level == PG_LEVEL_4K && rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs_with_address(vcpu->kvm, gfn, 1); - if (level > PT_PAGE_TABLE_LEVEL && need_sync) + if (level > PG_LEVEL_4K && need_sync) flush |= kvm_sync_pages(vcpu, gfn, &invalid_list); } clear_page(sp->spt); @@ -2600,7 +2598,7 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator, static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator) { - if (iterator->level < PT_PAGE_TABLE_LEVEL) + if (iterator->level < PG_LEVEL_4K) return false; iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level); @@ -2721,7 +2719,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm, struct mmu_page_path parents; struct kvm_mmu_pages pages; - if (parent->role.level == PT_PAGE_TABLE_LEVEL) + if (parent->role.level == PG_LEVEL_4K) return 0; while (mmu_unsync_walk(parent, &pages)) { @@ -2920,7 +2918,7 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, if (sp->unsync) continue; - WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(sp->role.level != PG_LEVEL_4K); kvm_unsync_page(vcpu, sp); } @@ -3019,7 +3017,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (!speculative) spte |= spte_shadow_accessed_mask(spte); - if (level > PT_PAGE_TABLE_LEVEL && (pte_access & ACC_EXEC_MASK) && + if (level > PG_LEVEL_4K && (pte_access & ACC_EXEC_MASK) && is_nx_huge_page_enabled()) { pte_access &= ~ACC_EXEC_MASK; } @@ -3032,7 +3030,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, if (pte_access & ACC_USER_MASK) spte |= shadow_user_mask; - if (level > PT_PAGE_TABLE_LEVEL) + if (level > PG_LEVEL_4K) spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops.get_mt_mask(vcpu, gfn, @@ -3102,8 +3100,7 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * If we overwrite a PTE page pointer with a 2MB PMD, unlink * the parent of the now unreachable PTE. 
*/ - if (level > PT_PAGE_TABLE_LEVEL && - !is_large_pte(*sptep)) { + if (level > PG_LEVEL_4K && !is_large_pte(*sptep)) { struct kvm_mmu_page *child; u64 pte = *sptep; @@ -3227,7 +3224,7 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep) if (sp_ad_disabled(sp)) return; - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return; __direct_pte_prefetch(vcpu, sp, sptep); @@ -3240,12 +3237,8 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, pte_t *pte; int level; - BUILD_BUG_ON(PT_PAGE_TABLE_LEVEL != (int)PG_LEVEL_4K || - PT_DIRECTORY_LEVEL != (int)PG_LEVEL_2M || - PT_PDPE_LEVEL != (int)PG_LEVEL_1G); - if (!PageCompound(pfn_to_page(pfn)) && !kvm_is_zone_device_pfn(pfn)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; /* * Note, using the already-retrieved memslot and __gfn_to_hva_memslot() @@ -3259,7 +3252,7 @@ static int host_pfn_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn, pte = lookup_address_in_mm(vcpu->kvm->mm, hva, &level); if (unlikely(!pte)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; return level; } @@ -3273,28 +3266,28 @@ static int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t mask; int level; - if (unlikely(max_level == PT_PAGE_TABLE_LEVEL)) - return PT_PAGE_TABLE_LEVEL; + if (unlikely(max_level == PG_LEVEL_4K)) + return PG_LEVEL_4K; if (is_error_noslot_pfn(pfn) || kvm_is_reserved_pfn(pfn)) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, true); if (!slot) - return PT_PAGE_TABLE_LEVEL; + return PG_LEVEL_4K; max_level = min(max_level, max_page_level); - for ( ; max_level > PT_PAGE_TABLE_LEVEL; max_level--) { + for ( ; max_level > PG_LEVEL_4K; max_level--) { linfo = lpage_info_slot(gfn, slot, max_level); if (!linfo->disallow_lpage) break; } - if (max_level == PT_PAGE_TABLE_LEVEL) - return PT_PAGE_TABLE_LEVEL; + if (max_level == PG_LEVEL_4K) + return PG_LEVEL_4K; level = host_pfn_mapping_level(vcpu, gfn, pfn, slot); - if (level == PT_PAGE_TABLE_LEVEL) + if (level == PG_LEVEL_4K) return level; level = min(level, max_level); @@ -3316,7 +3309,7 @@ static void disallowed_hugepage_adjust(struct kvm_shadow_walk_iterator it, int level = *levelp; u64 spte = *it.sptep; - if (it.level == level && level > PT_PAGE_TABLE_LEVEL && + if (it.level == level && level > PG_LEVEL_4K && is_nx_huge_page_enabled() && is_shadow_present_pte(spte) && !is_large_pte(spte)) { @@ -3573,7 +3566,7 @@ static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) break; } @@ -4132,7 +4125,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, return r; if (lpage_disallowed) - max_level = PT_PAGE_TABLE_LEVEL; + max_level = PG_LEVEL_4K; if (fast_page_fault(vcpu, gpa, error_code)) return RET_PF_RETRY; @@ -4168,7 +4161,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, /* This path builds a PAE pagetable, we can map 2mb pages at maximum. 
*/ return direct_page_fault(vcpu, gpa & PAGE_MASK, error_code, prefault, - PT_DIRECTORY_LEVEL, false); + PG_LEVEL_2M, false); } int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, @@ -4215,7 +4208,7 @@ int kvm_tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, int max_level; for (max_level = KVM_MAX_HUGEPAGE_LEVEL; - max_level > PT_PAGE_TABLE_LEVEL; + max_level > PG_LEVEL_4K; max_level--) { int page_num = KVM_PAGES_PER_HPAGE(max_level); gfn_t base = (gpa >> PAGE_SHIFT) & ~(page_num - 1); @@ -4376,11 +4369,11 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, gpte &= level - mmu->last_nonleaf_level; /* - * PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set - * iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means - * level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. + * PG_LEVEL_4K always terminates. The RHS has bit 7 set + * iff level <= PG_LEVEL_4K, which for our purpose means + * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. */ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + gpte |= level - PG_LEVEL_4K - 1; return gpte & PT_PAGE_SIZE_MASK; } @@ -5193,7 +5186,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *new) { - if (sp->role.level != PT_PAGE_TABLE_LEVEL) { + if (sp->role.level != PG_LEVEL_4K) { ++vcpu->kvm->stat.mmu_pde_zapped; return; } @@ -5251,7 +5244,7 @@ static bool detect_write_flooding(struct kvm_mmu_page *sp) * Skip write-flooding detected for the sp whose level is 1, because * it can become unsync, then the guest page is not write-protected. */ - if (sp->role.level == PT_PAGE_TABLE_LEVEL) + if (sp->role.level == PG_LEVEL_4K) return false; atomic_inc(&sp->write_flooding_count); @@ -5582,9 +5575,9 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_page_level) if (tdp_enabled) max_page_level = tdp_page_level; else if (boot_cpu_has(X86_FEATURE_GBPAGES)) - max_page_level = PT_PDPE_LEVEL; + max_page_level = PG_LEVEL_1G; else - max_page_level = PT_DIRECTORY_LEVEL; + max_page_level = PG_LEVEL_2M; } EXPORT_SYMBOL_GPL(kvm_configure_mmu); @@ -5640,7 +5633,7 @@ static __always_inline bool slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } @@ -5648,7 +5641,7 @@ static __always_inline bool slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K + 1, KVM_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } @@ -5656,8 +5649,8 @@ static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { - return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, - PT_PAGE_TABLE_LEVEL, lock_flush_tlb); + return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, + PG_LEVEL_4K, lock_flush_tlb); } static void free_mmu_pages(struct kvm_mmu *mmu) @@ -5867,7 +5860,7 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) continue; slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, - PT_PAGE_TABLE_LEVEL, + PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true); } diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index ddc1ec3bdacd..a7bcde34d1f2 
100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -61,7 +61,7 @@ static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn, { int index, val; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); val = slot->arch.gfn_track[mode][index]; @@ -151,7 +151,7 @@ bool kvm_page_track_is_active(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return false; - index = gfn_to_index(gfn, slot->base_gfn, PT_PAGE_TABLE_LEVEL); + index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K); return !!READ_ONCE(slot->arch.gfn_track[mode][index]); } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ca39bd315f70..38c576495048 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -75,7 +75,7 @@ #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) -#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) +#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K) /* * The guest_walker structure emulates the behavior of the hardware page @@ -198,7 +198,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; - if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL)) + if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K)) goto no_present; return false; @@ -436,7 +436,7 @@ retry_walk: gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; - if (PTTYPE == 32 && walker->level > PT_PAGE_TABLE_LEVEL && is_cpuid_PSE36()) + if (PTTYPE == 32 && walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault); @@ -552,7 +552,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * we call mmu_set_spte() with host_writable = true because * pte_prefetch_gfn_to_pfn always gets a writable pfn. 
*/ - mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn, + mmu_set_spte(vcpu, spte, pte_access, 0, PG_LEVEL_4K, gfn, pfn, true, true); kvm_release_pfn_clean(pfn); @@ -575,7 +575,7 @@ static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, u64 mask; int r, index; - if (level == PT_PAGE_TABLE_LEVEL) { + if (level == PG_LEVEL_4K) { mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); @@ -600,7 +600,7 @@ static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, sp = page_header(__pa(sptep)); - if (sp->role.level > PT_PAGE_TABLE_LEVEL) + if (sp->role.level > PG_LEVEL_4K) return; if (sp->role.direct) @@ -828,7 +828,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable); if (lpage_disallowed || is_self_change_mapping) - max_level = PT_PAGE_TABLE_LEVEL; + max_level = PG_LEVEL_4K; else max_level = walker.level; @@ -884,7 +884,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; - WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL); + WARN_ON(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) offset = sp->role.quadrant << PT64_LEVEL_BITS; @@ -1070,7 +1070,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE; set_spte_ret |= set_spte(vcpu, &sp->spt[i], - pte_access, PT_PAGE_TABLE_LEVEL, + pte_access, PG_LEVEL_4K, gfn, spte_to_pfn(sp->spt[i]), true, false, host_writable); } diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ca39f62aabc6..9d2844f87f6d 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -100,7 +100,7 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) sp = page_header(__pa(sptep)); if (sp->unsync) { - if (level != PT_PAGE_TABLE_LEVEL) { + if (level != PG_LEVEL_4K) { audit_printk(vcpu->kvm, "unsync sp: %p " "level = %d\n", sp, level); return; @@ -176,7 +176,7 @@ static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp) { int i; - if (sp->role.level != PT_PAGE_TABLE_LEVEL) + if (sp->role.level != PG_LEVEL_4K) return; for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PT_PAGE_TABLE_LEVEL, slot); + rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9e24b5b6fae1..4c808cc059f1 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -888,7 +888,7 @@ static __init int svm_hardware_setup(void) if (npt_enabled && !npt) npt_enabled = false; - kvm_configure_mmu(npt_enabled, PT_PDPE_LEVEL); + kvm_configure_mmu(npt_enabled, PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? 
"en" : "dis"); if (nrips) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 93b2a708b1da..30faae51573d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -8011,11 +8011,11 @@ static __init int hardware_setup(void) if (!enable_ept) ept_lpage_level = 0; else if (cpu_has_vmx_ept_1g_page()) - ept_lpage_level = PT_PDPE_LEVEL; + ept_lpage_level = PG_LEVEL_1G; else if (cpu_has_vmx_ept_2m_page()) - ept_lpage_level = PT_DIRECTORY_LEVEL; + ept_lpage_level = PG_LEVEL_2M; else - ept_lpage_level = PT_PAGE_TABLE_LEVEL; + ept_lpage_level = PG_LEVEL_4K; kvm_configure_mmu(enable_ept, ept_lpage_level); /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6b958d6c9427..a52ef81fb87a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10046,7 +10046,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, { /* Still write protect RO slot */ if (new->flags & KVM_MEM_READONLY) { - kvm_mmu_slot_remove_write_access(kvm, new, PT_PAGE_TABLE_LEVEL); + kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); return; } @@ -10086,7 +10086,7 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } else { int level = kvm_dirty_log_manual_protect_and_init_set(kvm) ? - PT_DIRECTORY_LEVEL : PT_PAGE_TABLE_LEVEL; + PG_LEVEL_2M : PG_LEVEL_4K; /* * If we're with initial-all-set, we don't need -- cgit v1.2.3 From dd03bcaad0b1a62c8ea6297e6f2a5993c1c5cd30 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 16 Apr 2020 11:58:59 -0400 Subject: KVM: X86: Force ASYNC_PF_PER_VCPU to be power of two Forcing the ASYNC_PF_PER_VCPU to be power of two is much easier to be used rather than calling roundup_pow_of_two() from time to time. Do this by adding a BUILD_BUG_ON() inside the hash function. Another point is that generally async pf does not allow concurrency over ASYNC_PF_PER_VCPU after all (see kvm_setup_async_pf()), so it does not make much sense either to have it not a power of two or some of the entries will definitely be wasted. 
Signed-off-by: Peter Xu Message-Id: <20200416155859.267366-1-peterx@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b8d035694b87..04f44961858b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -763,7 +763,7 @@ struct kvm_vcpu_arch { struct { bool halted; - gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)]; + gfn_t gfns[ASYNC_PF_PER_VCPU]; struct gfn_to_hva_cache data; u64 msr_val; u32 id; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a52ef81fb87a..8e28f21fb39b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -257,7 +257,7 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++) + for (i = 0; i < ASYNC_PF_PER_VCPU; i++) vcpu->arch.apf.gfns[i] = ~0; } @@ -10310,12 +10310,14 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) static inline u32 kvm_async_pf_hash_fn(gfn_t gfn) { + BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU)); + return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU)); } static inline u32 kvm_async_pf_next_probe(u32 key) { - return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1); + return (key + 1) & (ASYNC_PF_PER_VCPU - 1); } static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) @@ -10333,7 +10335,7 @@ static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn) int i; u32 key = kvm_async_pf_hash_fn(gfn); - for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) && + for (i = 0; i < ASYNC_PF_PER_VCPU && (vcpu->arch.apf.gfns[key] != gfn && vcpu->arch.apf.gfns[key] != ~0); i++) key = kvm_async_pf_next_probe(key); -- cgit v1.2.3 From 2c4c41325540cf3abb12aef142c0e550f6afeffc Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 8 May 2020 16:53:48 -0700 Subject: KVM: x86: Print symbolic names of VMX VM-Exit flags in traces Use __print_flags() to display the names of VMX flags in VM-Exit traces and strip the flags when printing the basic exit reason, e.g. so that a failed VM-Entry due to invalid guest state gets recorded as "INVALID_STATE FAILED_VMENTRY" instead of "0x80000021". Opportunstically fix misaligned variables in the kvm_exit and kvm_nested_vmexit_inject tracepoints. 
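[ Illustrative decode, not new code; the value is the example quoted
  above. The basic reason occupies bits 15:0, modifier flags such as
  FAILED_VMENTRY the upper bits. ]

  u32 exit_reason = 0x80000021;
  u16 basic = exit_reason & 0xffff;     /* 0x21: EXIT_REASON_INVALID_STATE */
  u32 flags = exit_reason & ~0xffffu;   /* 0x80000000: FAILED_VMENTRY      */
  /* the trace now reads "INVALID_STATE FAILED_VMENTRY" instead of 0x80000021 */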
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Message-Id: <20200508235348.19427-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/vmx.h | 3 +++ arch/x86/kvm/trace.h | 32 +++++++++++++++++--------------- 2 files changed, 20 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index e95b72ec19bc..b8ff9e8ac0d5 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -150,6 +150,9 @@ { EXIT_REASON_UMWAIT, "UMWAIT" }, \ { EXIT_REASON_TPAUSE, "TPAUSE" } +#define VMX_EXIT_REASON_FLAGS \ + { VMX_EXIT_REASONS_FAILED_VMENTRY, "FAILED_VMENTRY" } + #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 249062f24b94..54a10c98d746 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -225,6 +225,14 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 +#define kvm_print_exit_reason(exit_reason, isa) \ + (isa == KVM_ISA_VMX) ? \ + __print_symbolic(exit_reason & 0xffff, VMX_EXIT_REASONS) : \ + __print_symbolic(exit_reason, SVM_EXIT_REASONS), \ + (isa == KVM_ISA_VMX && exit_reason & ~0xffff) ? " " : "", \ + (isa == KVM_ISA_VMX) ? \ + __print_flags(exit_reason & ~0xffff, " ", VMX_EXIT_REASON_FLAGS) : "" + /* * Tracepoint for kvm guest exit: */ @@ -250,12 +258,10 @@ TRACE_EVENT(kvm_exit, &__entry->info2); ), - TP_printk("vcpu %u reason %s rip 0x%lx info %llx %llx", + TP_printk("vcpu %u reason %s%s%s rip 0x%lx info %llx %llx", __entry->vcpu_id, - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), - __entry->guest_rip, __entry->info1, __entry->info2) + kvm_print_exit_reason(__entry->exit_reason, __entry->isa), + __entry->guest_rip, __entry->info1, __entry->info2) ); /* @@ -588,12 +594,10 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_int_info_err = exit_int_info_err; __entry->isa = isa; ), - TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " + TP_printk("rip: 0x%016llx reason: %s%s%s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, - (__entry->isa == KVM_ISA_VMX) ? - __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), + kvm_print_exit_reason(__entry->exit_code, __entry->isa), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); @@ -626,13 +630,11 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __entry->isa = isa; ), - TP_printk("reason: %s ext_inf1: 0x%016llx " + TP_printk("reason: %s%s%s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - (__entry->isa == KVM_ISA_VMX) ? 
- __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : - __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), - __entry->exit_info1, __entry->exit_info2, - __entry->exit_int_info, __entry->exit_int_info_err) + kvm_print_exit_reason(__entry->exit_code, __entry->isa), + __entry->exit_info1, __entry->exit_info2, + __entry->exit_int_info, __entry->exit_int_info_err) ); /* -- cgit v1.2.3 From 404d5d7bff0d419fe11c7eaebca9ec8f25258f95 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 28 Apr 2020 14:23:25 +0800 Subject: KVM: X86: Introduce more exit_fastpath_completion enum values Adds a fastpath_t typedef since enum lines are a bit long, and replace EXIT_FASTPATH_SKIP_EMUL_INS with two new exit_fastpath_completion enum values. - EXIT_FASTPATH_EXIT_HANDLED kvm will still go through it's full run loop, but it would skip invoking the exit handler. - EXIT_FASTPATH_REENTER_GUEST complete fastpath, guest can be re-entered without invoking the exit handler or going back to vcpu_run Tested-by: Haiwei Li Cc: Haiwei Li Signed-off-by: Wanpeng Li Message-Id: <1588055009-12677-4-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 +++- arch/x86/kvm/svm/svm.c | 15 +++++++-------- arch/x86/kvm/vmx/vmx.c | 26 ++++++++++++++++++-------- arch/x86/kvm/x86.c | 19 ++++++++++--------- arch/x86/kvm/x86.h | 2 +- 5 files changed, 39 insertions(+), 27 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 04f44961858b..c3906fb2b93f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -182,8 +182,10 @@ enum { enum exit_fastpath_completion { EXIT_FASTPATH_NONE, - EXIT_FASTPATH_SKIP_EMUL_INS, + EXIT_FASTPATH_REENTER_GUEST, + EXIT_FASTPATH_EXIT_HANDLED, }; +typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 4c808cc059f1..29e7f7bc284d 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2893,8 +2893,7 @@ static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) *info2 = control->exit_info_2; } -static int handle_exit(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion exit_fastpath) +static int handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_svm *svm = to_svm(vcpu); struct kvm_run *kvm_run = vcpu->run; @@ -2952,10 +2951,10 @@ static int handle_exit(struct kvm_vcpu *vcpu, __func__, svm->vmcb->control.exit_int_info, exit_code); - if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { - kvm_skip_emulated_instruction(vcpu); + if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - } else if (exit_code >= ARRAY_SIZE(svm_exit_handlers) + + if (exit_code >= ARRAY_SIZE(svm_exit_handlers) || !svm_exit_handlers[exit_code]) { vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code); dump_vmcb(vcpu); @@ -3324,7 +3323,7 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu) svm_complete_interrupts(svm); } -static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { if (!is_guest_mode(vcpu) && to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && @@ -3336,9 +3335,9 @@ static enum exit_fastpath_completion svm_exit_handlers_fastpath(struct kvm_vcpu void __svm_vcpu_run(unsigned long vmcb_pa, unsigned long *regs); -static enum exit_fastpath_completion svm_vcpu_run(struct kvm_vcpu *vcpu) +static 
fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) { - enum exit_fastpath_completion exit_fastpath; + fastpath_t exit_fastpath; struct vcpu_svm *svm = to_svm(vcpu); svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d883fbb63566..c7730d3aa706 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5926,8 +5926,7 @@ void dump_vmcs(void) * The guest has exited. See if we can fix it or if we need userspace * assistance. */ -static int vmx_handle_exit(struct kvm_vcpu *vcpu, - enum exit_fastpath_completion exit_fastpath) +static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exit_reason = vmx->exit_reason; @@ -6034,10 +6033,8 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu, } } - if (exit_fastpath == EXIT_FASTPATH_SKIP_EMUL_INS) { - kvm_skip_emulated_instruction(vcpu); + if (exit_fastpath != EXIT_FASTPATH_NONE) return 1; - } if (exit_reason >= kvm_vmx_max_exit_handlers) goto unexpected_vmexit; @@ -6628,7 +6625,7 @@ void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp) } } -static enum exit_fastpath_completion vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) +static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu) { switch (to_vmx(vcpu)->exit_reason) { case EXIT_REASON_MSR_WRITE: @@ -6640,12 +6637,13 @@ static enum exit_fastpath_completion vmx_exit_handlers_fastpath(struct kvm_vcpu bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched); -static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) +static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) { - enum exit_fastpath_completion exit_fastpath; + fastpath_t exit_fastpath; struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long cr3, cr4; +reenter_guest: /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && vmx->loaded_vmcs->soft_vnmi_blocked)) @@ -6807,6 +6805,18 @@ static enum exit_fastpath_completion vmx_vcpu_run(struct kvm_vcpu *vcpu) return EXIT_FASTPATH_NONE; exit_fastpath = vmx_exit_handlers_fastpath(vcpu); + if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) { + if (!kvm_vcpu_exit_request(vcpu)) { + /* + * FIXME: this goto should be a loop in vcpu_enter_guest, + * but it would incur the cost of a retpoline for now. + * Revisit once static calls are available. 
+ */ + goto reenter_guest; + } + exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; + } + return exit_fastpath; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 29a41aa98929..71749fcb229e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1608,27 +1608,28 @@ static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data return 1; } -enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) { u32 msr = kvm_rcx_read(vcpu); u64 data; - int ret = 0; + fastpath_t ret = EXIT_FASTPATH_NONE; switch (msr) { case APIC_BASE_MSR + (APIC_ICR >> 4): data = kvm_read_edx_eax(vcpu); - ret = handle_fastpath_set_x2apic_icr_irqoff(vcpu, data); + if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { + kvm_skip_emulated_instruction(vcpu); + ret = EXIT_FASTPATH_EXIT_HANDLED; + } break; default: - return EXIT_FASTPATH_NONE; + break; } - if (!ret) { + if (ret != EXIT_FASTPATH_NONE) trace_kvm_msr_write(msr, data); - return EXIT_FASTPATH_SKIP_EMUL_INS; - } - return EXIT_FASTPATH_NONE; + return ret; } EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); @@ -8205,7 +8206,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) bool req_int_win = dm_request_for_irq_injection(vcpu) && kvm_cpu_accept_dm_intr(vcpu); - enum exit_fastpath_completion exit_fastpath; + fastpath_t exit_fastpath; bool req_immediate_exit = false; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e02fe28254b6..6eb62e97e59f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -274,7 +274,7 @@ bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, bool kvm_vector_hashing_enabled(void); int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int emulation_type, void *insn, int insn_len); -enum exit_fastpath_completion handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); +fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu); extern u64 host_xcr0; extern u64 supported_xcr0; -- cgit v1.2.3 From 93dff2fed2fb4a513196b7df05742c6fcdfd5178 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 8 May 2020 13:36:43 -0700 Subject: KVM: nVMX: Migrate the VMX-preemption timer The hrtimer used to emulate the VMX-preemption timer must be pinned to the same logical processor as the vCPU thread to be interrupted if we want to have any hope of adhering to the architectural specification of the VMX-preemption timer. Even with this change, the emulated VMX-preemption timer VM-exit occasionally arrives too late. 
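[ Annotated sketch of the re-pin pattern added below; the return values
  described are standard hrtimer_try_to_cancel() semantics, not new
  behaviour. ]

  struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;

  /*
   * hrtimer_try_to_cancel() returns 1 only when the timer was queued and
   * has now been dequeued (0: not armed, -1: callback running, so leave
   * it alone). Only then is it re-armed with its original expiry, pinned
   * to the CPU the vCPU thread has just migrated to.
   */
  if (hrtimer_try_to_cancel(timer) == 1)
        hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);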
Signed-off-by: Jim Mattson Reviewed-by: Peter Shier Reviewed-by: Oliver Upton Message-Id: <20200508203643.85477-4-jmattson@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/irq.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 11 +++++++++++ 3 files changed, 15 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c3906fb2b93f..a3ce391c9e7a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1247,6 +1247,8 @@ struct kvm_x86_ops { bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu); + + void (*migrate_timers)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index e330e7d125f7..54f7ea68083b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -159,6 +159,8 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) { __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); + if (kvm_x86_ops.migrate_timers) + kvm_x86_ops.migrate_timers(vcpu); } bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 68f4efb2b9c2..6a03c27ff314 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7814,6 +7814,16 @@ static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu) return to_vmx(vcpu)->nested.vmxon; } +static void vmx_migrate_timers(struct kvm_vcpu *vcpu) +{ + if (is_guest_mode(vcpu)) { + struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer; + + if (hrtimer_try_to_cancel(timer) == 1) + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } +} + static void hardware_unsetup(void) { if (nested) @@ -7957,6 +7967,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = { .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault, .apic_init_signal_blocked = vmx_apic_init_signal_blocked, + .migrate_timers = vmx_migrate_timers, }; static __init int hardware_setup(void) -- cgit v1.2.3 From cb953129bfe5c0f2da835a0469930873fb7e71df Mon Sep 17 00:00:00 2001 From: David Matlack Date: Fri, 8 May 2020 11:22:40 -0700 Subject: kvm: add halt-polling cpu usage stats Two new stats for exposing halt-polling cpu usage: halt_poll_success_ns halt_poll_fail_ns Thus sum of these 2 stats is the total cpu time spent polling. "success" means the VCPU polled until a virtual interrupt was delivered. "fail" means the VCPU had to schedule out (either because the maximum poll time was reached or it needed to yield the CPU). To avoid touching every arch's kvm_vcpu_stat struct, only update and export halt-polling cpu usage stats if we're on x86. Exporting cpu usage as a u64 and in nanoseconds means we will overflow at ~500 years, which seems reasonably large. 
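The new counters land alongside the existing KVM debugfs statistics. A minimal userspace sketch, assuming the conventional /sys/kernel/debug/kvm/ location for the aggregated stat files (an assumption about the environment, not something this patch introduces), can report how much of the polling time actually avoided a schedule-out:

#include <stdio.h>

/* Hypothetical helper: read one aggregated KVM stat from debugfs. */
static unsigned long long read_kvm_stat(const char *name)
{
	char path[128];
	unsigned long long val = 0;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/kernel/debug/kvm/%s", name);
	f = fopen(path, "r");
	if (!f)
		return 0;
	if (fscanf(f, "%llu", &val) != 1)
		val = 0;
	fclose(f);
	return val;
}

int main(void)
{
	unsigned long long ok    = read_kvm_stat("halt_poll_success_ns");
	unsigned long long fail  = read_kvm_stat("halt_poll_fail_ns");
	unsigned long long total = ok + fail;

	printf("halt polling: %llu ns total, %.1f%% successful\n",
	       total, total ? 100.0 * ok / total : 0.0);
	return 0;
}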
Signed-off-by: David Matlack Signed-off-by: Jon Cargille Reviewed-by: Jim Mattson Message-Id: <20200508182240.68440-1-jcargill@google.com> Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 2 ++ arch/arm64/kvm/guest.c | 2 ++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/mips.c | 2 ++ arch/powerpc/kvm/booke.c | 2 ++ arch/s390/include/asm/kvm_host.h | 2 ++ arch/s390/kvm/kvm-s390.c | 2 ++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 2 ++ virt/kvm/kvm_main.c | 18 +++++++++++++++--- 10 files changed, 33 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 32c8a675e5a4..3833736dd064 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -415,6 +415,8 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { u64 halt_successful_poll; u64 halt_attempted_poll; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; u64 halt_poll_invalid; u64 halt_wakeup; u64 hvc_exit_stat; diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 863a0d158fb8..55ebb9ea74f6 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -40,6 +40,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("mmio_exit_user", mmio_exit_user), VCPU_STAT("mmio_exit_kernel", mmio_exit_kernel), VCPU_STAT("exits", exits), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), { NULL } }; diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 2c343c346b79..e28b5a946e26 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -174,6 +174,8 @@ struct kvm_vcpu_stat { #endif u64 halt_successful_poll; u64 halt_attempted_poll; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; u64 halt_poll_invalid; u64 halt_wakeup; }; diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 9787cdec33e6..99ed08aff31e 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -72,6 +72,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_attempted_poll", halt_attempted_poll), VCPU_STAT("halt_poll_invalid", halt_poll_invalid), VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), {NULL} }; diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index c2984cb6dfa7..888afe8d35cc 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -54,6 +54,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_wakeup", halt_wakeup), VCPU_STAT("doorbell", dbell_exits), VCPU_STAT("guest doorbell", gdbell_exits), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VM_STAT("remote_tlb_flush", remote_tlb_flush), { NULL } }; diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d6bcd34f3ec3..176f74cc2257 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -375,6 +375,8 @@ struct kvm_vcpu_stat { u64 halt_poll_invalid; u64 halt_no_poll_steal; u64 halt_wakeup; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; u64 instruction_lctl; u64 instruction_lctlg; u64 instruction_stctl; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 389ff1b7cd43..a560a368f92c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -75,6 +75,8 @@ struct 
kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("halt_poll_invalid", halt_poll_invalid), VCPU_STAT("halt_no_poll_steal", halt_no_poll_steal), VCPU_STAT("halt_wakeup", halt_wakeup), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VCPU_STAT("instruction_lctlg", instruction_lctlg), VCPU_STAT("instruction_lctl", instruction_lctl), VCPU_STAT("instruction_stctl", instruction_stctl), diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a3ce391c9e7a..fd78bd44b2d6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1031,6 +1031,8 @@ struct kvm_vcpu_stat { u64 irq_injections; u64 nmi_injections; u64 req_event; + u64 halt_poll_success_ns; + u64 halt_poll_fail_ns; }; struct x86_instruction_info; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fb7c1a8f1a5..471fccf7f850 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -217,6 +217,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { VCPU_STAT("nmi_injections", nmi_injections), VCPU_STAT("req_event", req_event), VCPU_STAT("l1d_flush", l1d_flush), + VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns), + VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns), VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped), VM_STAT("mmu_pte_write", mmu_pte_write), VM_STAT("mmu_pte_updated", mmu_pte_updated), diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 11844fad60fd..da6da386b591 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2667,18 +2667,27 @@ out: return ret; } +static inline void +update_halt_poll_stats(struct kvm_vcpu *vcpu, u64 poll_ns, bool waited) +{ + if (waited) + vcpu->stat.halt_poll_fail_ns += poll_ns; + else + vcpu->stat.halt_poll_success_ns += poll_ns; +} + /* * The vCPU has executed a HLT instruction with in-kernel mode enabled. */ void kvm_vcpu_block(struct kvm_vcpu *vcpu) { - ktime_t start, cur; + ktime_t start, cur, poll_end; bool waited = false; u64 block_ns; kvm_arch_vcpu_blocking(vcpu); - start = cur = ktime_get(); + start = cur = poll_end = ktime_get(); if (vcpu->halt_poll_ns && !kvm_arch_no_poll(vcpu)) { ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns); @@ -2694,7 +2703,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) ++vcpu->stat.halt_poll_invalid; goto out; } - cur = ktime_get(); + poll_end = cur = ktime_get(); } while (single_task_running() && ktime_before(cur, stop)); } @@ -2714,6 +2723,9 @@ out: kvm_arch_vcpu_unblocking(vcpu); block_ns = ktime_to_ns(cur) - ktime_to_ns(start); + update_halt_poll_stats( + vcpu, ktime_to_ns(ktime_sub(poll_end, start)), waited); + if (!kvm_arch_no_poll(vcpu)) { if (!vcpu_valid_wakeup(vcpu)) { shrink_halt_poll_ns(vcpu); -- cgit v1.2.3 From eeedf1533687b8e81865fdbde79eddf7c4b76c9a Mon Sep 17 00:00:00 2001 From: Yu-cheng Yu Date: Tue, 12 May 2020 07:54:42 -0700 Subject: x86/fpu: Introduce copy_supervisor_to_kernel() The XSAVES instruction takes a mask and saves only the features specified in that mask. The kernel normally specifies that all features be saved. XSAVES also unconditionally uses the "compacted format" which means that all specified features are saved next to each other in memory. If a feature is removed from the mask, all the features after it will "move up" into earlier locations in the buffer. Introduce copy_supervisor_to_kernel(), which saves only supervisor states and then moves those states into the standard location where they are normally found. 
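Because compacted-format offsets depend on the Requested Feature BitMap, the same supervisor state sits at a lower offset in a supervisor-only XSAVES buffer than in the kernel's regular buffer; that is why the patch adds a second offset table and a memmove pass. A standalone sketch of the offset computation follows; the component sizes and alignment flags are placeholders supplied by the caller (in the kernel they come from CPUID leaf 0xD):

#include <stdint.h>

#define XCOMP_BASE	(512 + 64)	/* legacy FXSAVE area + XSAVE header */

/*
 * Illustrative only: compacted-format offset of @feature for a given RFBM.
 * Absent components take no space, so removing a feature from the mask
 * makes every later component "move up" toward the start of the buffer.
 */
static uint32_t xstate_compacted_offset(uint64_t rfbm, int feature,
					const uint32_t *size,
					const uint8_t *aligned64)
{
	uint32_t off = XCOMP_BASE;
	int i;

	for (i = 2; i < feature; i++) {	/* extended features start at bit 2 */
		if (!(rfbm & (1ULL << i)))
			continue;
		if (aligned64[i])
			off = (off + 63) & ~63u;
		off += size[i];
	}
	if (aligned64[feature])
		off = (off + 63) & ~63u;
	return off;
}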
Signed-off-by: Yu-cheng Yu Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200512145444.15483-9-yu-cheng.yu@intel.com --- arch/x86/include/asm/fpu/xstate.h | 1 + arch/x86/kernel/fpu/xstate.c | 84 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 92104b298d77..422d8369012a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -75,6 +75,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf); int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf); +void copy_supervisor_to_kernel(struct xregs_state *xsave); /* Validate an xstate header supplied by userspace (ptrace or sigreturn) */ int validate_user_xstate_header(const struct xstate_header *hdr); diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index a68213ed5be6..587e03f0094d 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -62,6 +62,7 @@ u64 xfeatures_mask_all __read_mostly; static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; +static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1}; /* * The XSAVE area of kernel can be in standard or compacted format; @@ -392,6 +393,33 @@ static void __init setup_xstate_comp_offsets(void) } } +/* + * Setup offsets of a supervisor-state-only XSAVES buffer: + * + * The offsets stored in xstate_comp_offsets[] only work for one specific + * value of the Requested Feature BitMap (RFBM). In cases where a different + * RFBM value is used, a different set of offsets is required. This set of + * offsets is for when RFBM=xfeatures_mask_supervisor(). + */ +static void __init setup_supervisor_only_offsets(void) +{ + unsigned int next_offset; + int i; + + next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE; + + for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) { + if (!xfeature_enabled(i) || !xfeature_is_supervisor(i)) + continue; + + if (xfeature_is_aligned(i)) + next_offset = ALIGN(next_offset, 64); + + xstate_supervisor_only_offsets[i] = next_offset; + next_offset += xstate_sizes[i]; + } +} + /* * Print out xstate component offsets and sizes */ @@ -790,6 +818,7 @@ void __init fpu__init_system_xstate(void) fpu__init_prepare_fx_sw_frame(); setup_init_fpu_buf(); setup_xstate_comp_offsets(); + setup_supervisor_only_offsets(); print_xstate_offset_size(); pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n", @@ -1262,6 +1291,61 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) return 0; } +/* + * Save only supervisor states to the kernel buffer. This blows away all + * old states, and is intended to be used only in __fpu__restore_sig(), where + * user states are restored from the user buffer. 
+ */ +void copy_supervisor_to_kernel(struct xregs_state *xstate) +{ + struct xstate_header *header; + u64 max_bit, min_bit; + u32 lmask, hmask; + int err, i; + + if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES))) + return; + + if (!xfeatures_mask_supervisor()) + return; + + max_bit = __fls(xfeatures_mask_supervisor()); + min_bit = __ffs(xfeatures_mask_supervisor()); + + lmask = xfeatures_mask_supervisor(); + hmask = xfeatures_mask_supervisor() >> 32; + XSTATE_OP(XSAVES, xstate, lmask, hmask, err); + + /* We should never fault when copying to a kernel buffer: */ + if (WARN_ON_FPU(err)) + return; + + /* + * At this point, the buffer has only supervisor states and must be + * converted back to normal kernel format. + */ + header = &xstate->header; + header->xcomp_bv |= xfeatures_mask_all; + + /* + * This only moves states up in the buffer. Start with + * the last state and move backwards so that states are + * not overwritten until after they are moved. Note: + * memmove() allows overlapping src/dst buffers. + */ + for (i = max_bit; i >= min_bit; i--) { + u8 *xbuf = (u8 *)xstate; + + if (!((header->xfeatures >> i) & 1)) + continue; + + /* Move xfeature 'i' into its normal location */ + memmove(xbuf + xstate_comp_offsets[i], + xbuf + xstate_supervisor_only_offsets[i], + xstate_sizes[i]); + } +} + #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * Report the amount of time elapsed in millisecond since last AVX512 -- cgit v1.2.3 From 3d81b3d1e55a518837c3d1f722c6d93abe34aa85 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Fri, 8 May 2020 12:58:17 +0200 Subject: x86/cpu: Use RDRAND and RDSEED mnemonics in archrandom.h Current minimum required version of binutils is 2.23, which supports RDRAND and RDSEED instruction mnemonics. Replace the byte-wise specification of RDRAND and RDSEED with these proper mnemonics. Signed-off-by: Uros Bizjak Signed-off-by: Borislav Petkov Reviewed-by: H. 
Peter Anvin (Intel) Acked-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200508105817.207887-1-ubizjak@gmail.com --- arch/x86/include/asm/archrandom.h | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 7a4bb1bd4bdb..ebc248e49549 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -15,16 +15,6 @@ #define RDRAND_RETRY_LOOPS 10 -#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" -#define RDSEED_INT ".byte 0x0f,0xc7,0xf8" -#ifdef CONFIG_X86_64 -# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" -# define RDSEED_LONG ".byte 0x48,0x0f,0xc7,0xf8" -#else -# define RDRAND_LONG RDRAND_INT -# define RDSEED_LONG RDSEED_INT -#endif - /* Unconditional execution of RDRAND and RDSEED */ static inline bool __must_check rdrand_long(unsigned long *v) @@ -32,9 +22,9 @@ static inline bool __must_check rdrand_long(unsigned long *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG + asm volatile("rdrand %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); @@ -46,9 +36,9 @@ static inline bool __must_check rdrand_int(unsigned int *v) bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT + asm volatile("rdrand %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); if (ok) return true; } while (--retry); @@ -58,18 +48,18 @@ static inline bool __must_check rdrand_int(unsigned int *v) static inline bool __must_check rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG + asm volatile("rdseed %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } static inline bool __must_check rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT + asm volatile("rdseed %[out]" CC_SET(c) - : CC_OUT(c) (ok), "=a" (*v)); + : CC_OUT(c) (ok), [out] "=r" (*v)); return ok; } -- cgit v1.2.3 From b052df3da821adfd6be26a6eb16624fb50e90e56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 5 Mar 2020 00:52:41 +0100 Subject: x86/entry: Get rid of ist_begin/end_non_atomic() This is completely overengineered and definitely not an interface which should be made available to anything else than this particular MCE case. 
Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.462640294@linutronix.de --- arch/x86/include/asm/traps.h | 2 -- arch/x86/kernel/cpu/mce/core.c | 6 ++++-- arch/x86/kernel/traps.c | 37 ------------------------------------- 3 files changed, 4 insertions(+), 41 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c26a7e1d8a2c..fe109fc9a1d2 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -120,8 +120,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void); extern void ist_enter(struct pt_regs *regs); extern void ist_exit(struct pt_regs *regs); -extern void ist_begin_non_atomic(struct pt_regs *regs); -extern void ist_end_non_atomic(void); #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 54165f3569e8..98bf91cd7d5d 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1352,13 +1352,15 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code) /* Fault was in user mode and we need to take some action */ if ((m.cs & 3) == 3) { - ist_begin_non_atomic(regs); + /* If this triggers there is no way to recover. Die hard. */ + BUG_ON(!on_thread_stack() || !user_mode(regs)); local_irq_enable(); + preempt_enable(); if (kill_it || do_memory_failure(&m)) force_sig(SIGBUS); + preempt_disable(); local_irq_disable(); - ist_end_non_atomic(); } else { if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0)) mce_panic("Failed kernel mode recovery", &m, msg); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..6740e8351486 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -117,43 +117,6 @@ void ist_exit(struct pt_regs *regs) rcu_nmi_exit(); } -/** - * ist_begin_non_atomic() - begin a non-atomic section in an IST exception - * @regs: regs passed to the IST exception handler - * - * IST exception handlers normally cannot schedule. As a special - * exception, if the exception interrupted userspace code (i.e. - * user_mode(regs) would return true) and the exception was not - * a double fault, it can be safe to schedule. ist_begin_non_atomic() - * begins a non-atomic section within an ist_enter()/ist_exit() region. - * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call ist_end_non_atomic() - * before ist_exit(). - */ -void ist_begin_non_atomic(struct pt_regs *regs) -{ - BUG_ON(!user_mode(regs)); - - /* - * Sanity check: we need to be on the normal thread stack. This - * will catch asm bugs and any attempt to use ist_preempt_enable - * from double_fault. - */ - BUG_ON(!on_thread_stack()); - - preempt_enable_no_resched(); -} - -/** - * ist_end_non_atomic() - begin a non-atomic section in an IST exception - * - * Ends a non-atomic section started with ist_begin_non_atomic(). - */ -void ist_end_non_atomic(void) -{ - preempt_disable(); -} - int is_valid_bugaddr(unsigned long addr) { unsigned short ud; -- cgit v1.2.3 From 0d00449c7a28a1514595630735df383dec606812 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 Feb 2020 09:46:43 +0100 Subject: x86: Replace ist_enter() with nmi_enter() A few exceptions (like #DB and #BP) can happen at any location in the code, this then means that tracers should treat events from these exceptions as NMI-like. 
The interrupted context could be holding locks with interrupts disabled for instance. Similarly, #MC is an actual NMI-like exception. All of them use ist_enter() which only concerns itself with RCU, but does not do any of the other setup that NMIs need. This means things like: printk() raw_spin_lock_irq(&logbuf_lock); <#DB/#BP/#MC> printk() raw_spin_lock_irq(&logbuf_lock); are entirely possible (well, not really since printk tries hard to play nice, but the concept stands). So replace ist_enter() with nmi_enter(). Also observe that any nmi_enter() caller must be both notrace and NOKPROBE, or in the noinstr text section. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Link: https://lkml.kernel.org/r/20200505134101.525508608@linutronix.de --- arch/x86/include/asm/traps.h | 3 -- arch/x86/kernel/cpu/mce/core.c | 5 +-- arch/x86/kernel/cpu/mce/p5.c | 5 +-- arch/x86/kernel/cpu/mce/winchip.c | 5 +-- arch/x86/kernel/traps.c | 71 +++++++++------------------------------ 5 files changed, 24 insertions(+), 65 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index fe109fc9a1d2..6f6c417e1e46 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -118,9 +118,6 @@ void smp_spurious_interrupt(struct pt_regs *regs); void smp_error_interrupt(struct pt_regs *regs); asmlinkage void smp_irq_move_cleanup_interrupt(void); -extern void ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs); - #ifdef CONFIG_VMAP_STACK void __noreturn handle_stack_overflow(const char *message, struct pt_regs *regs, diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 2f0ef95795f3..e9265e2f28c9 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1266,7 +1267,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) if (__mc_check_crashing_cpu(cpu)) return; - ist_enter(regs); + nmi_enter(); this_cpu_inc(mce_exception_count); @@ -1374,7 +1375,7 @@ void noinstr do_machine_check(struct pt_regs *regs, long error_code) } out_ist: - ist_exit(regs); + nmi_exit(); } EXPORT_SYMBOL_GPL(do_machine_check); diff --git a/arch/x86/kernel/cpu/mce/p5.c b/arch/x86/kernel/cpu/mce/p5.c index 4ae6df556526..5ee94aa1b766 100644 --- a/arch/x86/kernel/cpu/mce/p5.c +++ b/arch/x86/kernel/cpu/mce/p5.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -24,7 +25,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) { u32 loaddr, hi, lotype; - ist_enter(regs); + nmi_enter(); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +40,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mce/winchip.c b/arch/x86/kernel/cpu/mce/winchip.c index a30ea13cccc2..b3938c195365 100644 --- a/arch/x86/kernel/cpu/mce/winchip.c +++ b/arch/x86/kernel/cpu/mce/winchip.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -18,12 +19,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - ist_enter(regs); + nmi_enter(); pr_emerg("CPU0: Machine Check Exception.\n"); 
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs); + nmi_exit(); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 6740e8351486..f7cfb9d0ad02 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -37,10 +37,12 @@ #include #include #include +#include +#include + #include #include #include -#include #include #include #include @@ -82,41 +84,6 @@ static inline void cond_local_irq_disable(struct pt_regs *regs) local_irq_disable(); } -/* - * In IST context, we explicitly disable preemption. This serves two - * purposes: it makes it much less likely that we would accidentally - * schedule in IST context and it will force a warning if we somehow - * manage to schedule by accident. - */ -void ist_enter(struct pt_regs *regs) -{ - if (user_mode(regs)) { - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); - } else { - /* - * We might have interrupted pretty much anything. In - * fact, if we're a machine check, we can even interrupt - * NMI processing. We don't want in_nmi() to return true, - * but we need to notify RCU. - */ - rcu_nmi_enter(); - } - - preempt_disable(); - - /* This code is a bit fragile. Test it. */ - RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work"); -} -NOKPROBE_SYMBOL(ist_enter); - -void ist_exit(struct pt_regs *regs) -{ - preempt_enable_no_resched(); - - if (!user_mode(regs)) - rcu_nmi_exit(); -} - int is_valid_bugaddr(unsigned long addr) { unsigned short ud; @@ -326,7 +293,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign * The net result is that our #GP handler will think that we * entered from usermode with the bad user context. * - * No need for ist_enter here because we don't use RCU. + * No need for nmi_enter() here because we don't use RCU. */ if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY && regs->cs == __KERNEL_CS && @@ -361,7 +328,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign } #endif - ist_enter(regs); + nmi_enter(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -555,19 +522,13 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; /* - * Unlike any other non-IST entry, we can be called from a kprobe in - * non-CONTEXT_KERNEL kernel mode or even during context tracking - * state changes. Make sure that we wake up RCU even if we're coming - * from kernel code. - * - * This means that we can't schedule even if we came from a - * preemptible kernel context. That's okay. + * Unlike any other non-IST entry, we can be called from pretty much + * any location in the kernel through kprobes -- text_poke() will most + * likely be handled by poke_int3_handler() above. This means this + * handler is effectively NMI-like. 
*/ - if (!user_mode(regs)) { - rcu_nmi_enter(); - preempt_disable(); - } - RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); + if (!user_mode(regs)) + nmi_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, @@ -589,10 +550,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) cond_local_irq_disable(regs); exit: - if (!user_mode(regs)) { - preempt_enable_no_resched(); - rcu_nmi_exit(); - } + if (!user_mode(regs)) + nmi_exit(); } NOKPROBE_SYMBOL(do_int3); @@ -696,7 +655,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) unsigned long dr6; int si_code; - ist_enter(regs); + nmi_enter(); get_debugreg(dr6, 6); /* @@ -789,7 +748,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - ist_exit(regs); + nmi_exit(); } NOKPROBE_SYMBOL(do_debug); -- cgit v1.2.3 From ef68017eb5704eb2b0577c3aa6619e13caf2b59f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 28 Feb 2020 10:42:48 -0800 Subject: x86/kvm: Handle async page faults directly through do_page_fault() KVM overloads #PF to indicate two types of not-actually-page-fault events. Right now, the KVM guest code intercepts them by modifying the IDT and hooking the #PF vector. This makes the already fragile fault code even harder to understand, and it also pollutes call traces with async_page_fault and do_async_page_fault for normal page faults. Clean it up by moving the logic into do_page_fault() using a static branch. This gets rid of the platform trap_init override mechanism completely. [ tglx: Fixed up 32bit, removed error code from the async functions and massaged coding style ] Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.169270470@linutronix.de --- arch/x86/entry/entry_32.S | 8 -------- arch/x86/entry/entry_64.S | 4 ---- arch/x86/include/asm/kvm_para.h | 19 +++++++++++++++++-- arch/x86/include/asm/x86_init.h | 2 -- arch/x86/kernel/kvm.c | 39 +++++++++++++++++++++------------------ arch/x86/kernel/traps.c | 2 -- arch/x86/kernel/x86_init.c | 1 - arch/x86/mm/fault.c | 19 +++++++++++++++++++ 8 files changed, 57 insertions(+), 37 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b67bae7091d7..8ba0985f5016 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1693,14 +1693,6 @@ SYM_CODE_START(general_protection) jmp common_exception SYM_CODE_END(general_protection) -#ifdef CONFIG_KVM_GUEST -SYM_CODE_START(async_page_fault) - ASM_CLAC - pushl $do_async_page_fault - jmp common_exception_read_cr2 -SYM_CODE_END(async_page_fault) -#endif - SYM_CODE_START(rewind_stack_do_exit) /* Prevent any naive code from trying to unwind to our caller. 
*/ xorl %ebp, %ebp diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3063aa9090f9..9ab3ea6d02fc 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1202,10 +1202,6 @@ idtentry xendebug do_debug has_error_code=0 idtentry general_protection do_general_protection has_error_code=1 idtentry page_fault do_page_fault has_error_code=1 read_cr2=1 -#ifdef CONFIG_KVM_GUEST -idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1 -#endif - #ifdef CONFIG_X86_MCE idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 9b4df6eaa11a..5261363adda3 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -91,8 +91,18 @@ unsigned int kvm_arch_para_hints(void); void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); -extern void kvm_disable_steal_time(void); -void do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address); +void kvm_disable_steal_time(void); +bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); + +DECLARE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + +static __always_inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + if (static_branch_unlikely(&kvm_async_pf_enabled)) + return __kvm_handle_async_pf(regs, token); + else + return false; +} #ifdef CONFIG_PARAVIRT_SPINLOCKS void __init kvm_spinlock_init(void); @@ -130,6 +140,11 @@ static inline void kvm_disable_steal_time(void) { return; } + +static inline bool kvm_handle_async_pf(struct pt_regs *regs, u32 token) +{ + return false; +} #endif #endif /* _ASM_X86_KVM_PARA_H */ diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 96d9cd208610..6807153c0410 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -50,14 +50,12 @@ struct x86_init_resources { * @pre_vector_init: init code to run before interrupt vectors * are set up. * @intr_init: interrupt init code - * @trap_init: platform specific trap setup * @intr_mode_select: interrupt delivery mode selection * @intr_mode_init: interrupt delivery mode setup */ struct x86_init_irqs { void (*pre_vector_init)(void); void (*intr_init)(void); - void (*trap_init)(void); void (*intr_mode_select)(void); void (*intr_mode_init)(void); }; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6efe0410fb72..5ad3fcca2309 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -35,6 +35,8 @@ #include #include +DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); + static int kvmapf = 1; static int __init parse_no_kvmapf(char *arg) @@ -242,25 +244,27 @@ u32 kvm_read_and_reset_pf_reason(void) EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); -dotraplinkage void -do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address) +bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { + /* + * If we get a page fault right here, the pf_reason seems likely + * to be clobbered. Bummer. + */ switch (kvm_read_and_reset_pf_reason()) { default: - do_page_fault(regs, error_code, address); - break; + return false; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. 
*/ - kvm_async_pf_task_wait((u32)address, !user_mode(regs)); - break; + kvm_async_pf_task_wait(token, !user_mode(regs)); + return true; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - kvm_async_pf_task_wake((u32)address); + kvm_async_pf_task_wake(token); rcu_irq_exit(); - break; + return true; } } -NOKPROBE_SYMBOL(do_async_page_fault); +NOKPROBE_SYMBOL(__kvm_handle_async_pf); static void __init paravirt_ops_setup(void) { @@ -306,7 +310,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) static void kvm_guest_cpu_init(void) { if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); + u64 pa; + + WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled)); + + pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); #ifdef CONFIG_PREEMPTION pa |= KVM_ASYNC_PF_SEND_ALWAYS; @@ -592,12 +600,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu) } #endif -static void __init kvm_apf_trap_init(void) -{ - update_intr_gate(X86_TRAP_PF, async_page_fault); -} - - static void kvm_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { @@ -632,8 +634,6 @@ static void __init kvm_guest_init(void) register_reboot_notifier(&kvm_pv_reboot_nb); for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) raw_spin_lock_init(&async_pf_sleepers[i].lock); - if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF)) - x86_init.irqs.trap_init = kvm_apf_trap_init; if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { has_steal_clock = 1; @@ -649,6 +649,9 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) + static_branch_enable(&kvm_async_pf_enabled); + #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d54cffdc7cac..821fac47eef6 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -983,7 +983,5 @@ void __init trap_init(void) idt_setup_ist_traps(); - x86_init.irqs.trap_init(); - idt_setup_debugidt_traps(); } diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 85f1a90c55cd..123f1c1f1788 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = { .irqs = { .pre_vector_init = init_ISA_irqs, .intr_init = native_init_IRQ, - .trap_init = x86_init_noop, .intr_mode_select = apic_intr_mode_select, .intr_mode_init = apic_intr_mode_init }, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a51df516b87b..6486ccec1b0e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -30,6 +30,7 @@ #include /* store_idt(), ... */ #include /* exception stack */ #include /* VMALLOC_START, ... */ +#include /* kvm_handle_async_pf */ #define CREATE_TRACE_POINTS #include @@ -1523,6 +1524,24 @@ do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, unsigned long address) { prefetchw(¤t->mm->mmap_sem); + /* + * KVM has two types of events that are, logically, interrupts, but + * are unfortunately delivered using the #PF vector. These events are + * "you just accessed valid memory, but the host doesn't have it right + * now, so I'll put you to sleep if you continue" and "that memory + * you tried to access earlier is available now." 
+ * + * We are relying on the interrupted context being sane (valid RSP, + * relevant locks not held, etc.), which is fine as long as the + * interrupted context had IF=1. We are also relying on the KVM + * async pf type field and CR2 being read consistently instead of + * getting values from real and async page faults mixed up. + * + * Fingers crossed. + */ + if (kvm_handle_async_pf(regs, (u32)address)) + return; + trace_page_fault_entries(regs, hw_error_code, address); if (unlikely(kmmio_fault(regs, address))) -- cgit v1.2.3 From 6bca69ada4bc20fa27eb44a5e09da3363d1752af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 7 Mar 2020 00:42:06 +0100 Subject: x86/kvm: Sanitize kvm_async_pf_task_wait() While working on the entry consolidation I stumbled over the KVM async page fault handler and kvm_async_pf_task_wait() in particular. It took me a while to realize that the randomly sprinkled around rcu_irq_enter()/exit() invocations are just cargo cult programming. Several patches "fixed" RCU splats by curing the symptoms without noticing that the code is flawed from a design perspective. The main problem is that this async injection is not based on a proper handshake mechanism and only respects the minimal requirement, i.e. the guest is not in a state where it has interrupts disabled. Aside of that the actual code is a convoluted one fits it all swiss army knife. It is invoked from different places with different RCU constraints: 1) Host side: vcpu_enter_guest() kvm_x86_ops->handle_exit() kvm_handle_page_fault() kvm_async_pf_task_wait() The invocation happens from fully preemptible context. 2) Guest side: The async page fault interrupted: a) user space b) preemptible kernel code which is not in a RCU read side critical section c) non-preemtible kernel code or a RCU read side critical section or kernel code with CONFIG_PREEMPTION=n which allows not to differentiate between #2b and #2c. RCU is watching for: #1 The vCPU exited and current is definitely not the idle task #2a The #PF entry code on the guest went through enter_from_user_mode() which reactivates RCU #2b There is no preemptible, interrupts enabled code in the kernel which can run with RCU looking away. (The idle task is always non preemptible). I.e. all schedulable states (#1, #2a, #2b) do not need any of this RCU voodoo at all. In #2c RCU is eventually not watching, but as that state cannot schedule anyway there is no point to worry about it so it has to invoke rcu_irq_enter() before running that code. This can be optimized, but this will be done as an extra step in course of the entry code consolidation work. So the proper solution for this is to: - Split kvm_async_pf_task_wait() into schedule and halt based waiting interfaces which share the enqueueing code. - Add comments (condensed form of this changelog) to spare others the time waste and pain of reverse engineering all of this with the help of uncomprehensible changelogs and code history. - Invoke kvm_async_pf_task_wait_schedule() from kvm_handle_page_fault(), user mode and schedulable kernel side async page faults (#1, #2a, #2b) - Invoke kvm_async_pf_task_wait_halt() for the non schedulable kernel case (#2c). For this case also remove the rcu_irq_exit()/enter() pair around the halt as it is just a pointless exercise: - vCPUs can VMEXIT at any random point and can be scheduled out for an arbitrary amount of time by the host and this is not any different except that it voluntary triggers the exit via halt. - The interrupted context could have RCU watching already. 
So the rcu_irq_exit() before the halt is not gaining anything aside of confusing the reader. Claiming that this might prevent RCU stalls is just an illusion. Signed-off-by: Thomas Gleixner Reviewed-by: Alexandre Chartre Acked-by: Paolo Bonzini Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200505134059.262701431@linutronix.de --- arch/x86/include/asm/kvm_para.h | 4 +- arch/x86/kernel/kvm.c | 201 ++++++++++++++++++++++++++++------------ arch/x86/kvm/mmu/mmu.c | 2 +- 3 files changed, 144 insertions(+), 63 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 5261363adda3..118e5c2379f9 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -88,7 +88,7 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_hints(void); -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); +void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); void kvm_disable_steal_time(void); @@ -113,7 +113,7 @@ static inline void kvm_spinlock_init(void) #endif /* CONFIG_PARAVIRT_SPINLOCKS */ #else /* CONFIG_KVM_GUEST */ -#define kvm_async_pf_task_wait(T, I) do {} while(0) +#define kvm_async_pf_task_wait_schedule(T) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5ad3fcca2309..c6a82f9f537f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -75,7 +75,7 @@ struct kvm_task_sleep_node { struct swait_queue_head wq; u32 token; int cpu; - bool halted; + bool use_halt; }; static struct kvm_task_sleep_head { @@ -98,75 +98,145 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b, return NULL; } -/* - * @interrupt_kernel: Is this called from a routine which interrupts the kernel - * (other than user space)? - */ -void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) +static bool kvm_async_pf_queue_task(u32 token, bool use_halt, + struct kvm_task_sleep_node *n) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; - struct kvm_task_sleep_node n, *e; - DECLARE_SWAITQUEUE(wait); - - rcu_irq_enter(); + struct kvm_task_sleep_node *e; raw_spin_lock(&b->lock); e = _find_apf_task(b, token); if (e) { /* dummy entry exist -> wake up was delivered ahead of PF */ hlist_del(&e->link); - kfree(e); raw_spin_unlock(&b->lock); + kfree(e); + return false; + } - rcu_irq_exit(); + n->token = token; + n->cpu = smp_processor_id(); + n->use_halt = use_halt; + init_swait_queue_head(&n->wq); + hlist_add_head(&n->link, &b->list); + raw_spin_unlock(&b->lock); + return true; +} + +/* + * kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled + * @token: Token to identify the sleep node entry + * + * Invoked from the async pagefault handling code or from the VM exit page + * fault handler. In both cases RCU is watching. 
+ */ +void kvm_async_pf_task_wait_schedule(u32 token) +{ + struct kvm_task_sleep_node n; + DECLARE_SWAITQUEUE(wait); + + lockdep_assert_irqs_disabled(); + + if (!kvm_async_pf_queue_task(token, false, &n)) return; + + for (;;) { + prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); + if (hlist_unhashed(&n.link)) + break; + + local_irq_enable(); + schedule(); + local_irq_disable(); } + finish_swait(&n.wq, &wait); +} +EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule); - n.token = token; - n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || - (IS_ENABLED(CONFIG_PREEMPT_COUNT) - ? preempt_count() > 1 || rcu_preempt_depth() - : interrupt_kernel); - init_swait_queue_head(&n.wq); - hlist_add_head(&n.link, &b->list); - raw_spin_unlock(&b->lock); +/* + * Invoked from the async page fault handler. + */ +static void kvm_async_pf_task_wait_halt(u32 token) +{ + struct kvm_task_sleep_node n; + + if (!kvm_async_pf_queue_task(token, true, &n)) + return; for (;;) { - if (!n.halted) - prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE); if (hlist_unhashed(&n.link)) break; + /* + * No point in doing anything about RCU here. Any RCU read + * side critical section or RCU watching section can be + * interrupted by VMEXITs and the host is free to keep the + * vCPU scheduled out as long as it sees fit. This is not + * any different just because of the halt induced voluntary + * VMEXIT. + * + * Also the async page fault could have interrupted any RCU + * watching context, so invoking rcu_irq_exit()/enter() + * around this is not gaining anything. + */ + native_safe_halt(); + local_irq_disable(); + } +} - rcu_irq_exit(); +/* Invoked from the async page fault handler */ +static void kvm_async_pf_task_wait(u32 token, bool usermode) +{ + bool can_schedule; - if (!n.halted) { - local_irq_enable(); - schedule(); - local_irq_disable(); - } else { - /* - * We cannot reschedule. So halt. - */ - native_safe_halt(); - local_irq_disable(); - } + /* + * No need to check whether interrupts were disabled because the + * host will (hopefully) only inject an async page fault into + * interrupt enabled regions. + * + * If CONFIG_PREEMPTION is enabled then check whether the code + * which triggered the page fault is preemptible. This covers user + * mode as well because preempt_count() is obviously 0 there. + * + * The check for rcu_preempt_depth() is also required because + * voluntary scheduling inside a rcu read locked section is not + * allowed. + * + * The idle task is already covered by this because idle always + * has a preempt count > 0. + * + * If CONFIG_PREEMPTION is disabled only allow scheduling when + * coming from user mode as there is no indication whether the + * context which triggered the page fault could schedule or not. + */ + if (IS_ENABLED(CONFIG_PREEMPTION)) + can_schedule = preempt_count() + rcu_preempt_depth() == 0; + else + can_schedule = usermode; + /* + * If the kernel context is allowed to schedule then RCU is + * watching because no preemptible code in the kernel is inside RCU + * idle state. So it can be treated like user mode. User mode is + * safe because the #PF entry invoked enter_from_user_mode(). + * + * For the non schedulable case invoke rcu_irq_enter() for + * now. This will be moved out to the pagefault entry code later + * and only invoked when really needed. 
+ */ + if (can_schedule) { + kvm_async_pf_task_wait_schedule(token); + } else { rcu_irq_enter(); + kvm_async_pf_task_wait_halt(token); + rcu_irq_exit(); } - if (!n.halted) - finish_swait(&n.wq, &wait); - - rcu_irq_exit(); - return; } -EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait); static void apf_task_wake_one(struct kvm_task_sleep_node *n) { hlist_del_init(&n->link); - if (n->halted) + if (n->use_halt) smp_send_reschedule(n->cpu); else if (swq_has_sleeper(&n->wq)) swake_up_one(&n->wq); @@ -177,12 +247,13 @@ static void apf_task_wake_all(void) int i; for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) { - struct hlist_node *p, *next; struct kvm_task_sleep_head *b = &async_pf_sleepers[i]; + struct kvm_task_sleep_node *n; + struct hlist_node *p, *next; + raw_spin_lock(&b->lock); hlist_for_each_safe(p, next, &b->list) { - struct kvm_task_sleep_node *n = - hlist_entry(p, typeof(*n), link); + n = hlist_entry(p, typeof(*n), link); if (n->cpu == smp_processor_id()) apf_task_wake_one(n); } @@ -223,8 +294,9 @@ again: n->cpu = smp_processor_id(); init_swait_queue_head(&n->wq); hlist_add_head(&n->link, &b->list); - } else + } else { apf_task_wake_one(n); + } raw_spin_unlock(&b->lock); return; } @@ -246,23 +318,33 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - /* - * If we get a page fault right here, the pf_reason seems likely - * to be clobbered. Bummer. - */ - switch (kvm_read_and_reset_pf_reason()) { + u32 reason = kvm_read_and_reset_pf_reason(); + + switch (reason) { + case KVM_PV_REASON_PAGE_NOT_PRESENT: + case KVM_PV_REASON_PAGE_READY: + break; default: return false; - case KVM_PV_REASON_PAGE_NOT_PRESENT: + } + + /* + * If the host managed to inject an async #PF into an interrupt + * disabled region, then die hard as this is not going to end well + * and the host side is seriously broken. + */ + if (unlikely(!(regs->flags & X86_EFLAGS_IF))) + panic("Host injected async #PF in interrupt disabled region\n"); + + if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) { /* page is swapped out by the host. */ - kvm_async_pf_task_wait(token, !user_mode(regs)); - return true; - case KVM_PV_REASON_PAGE_READY: + kvm_async_pf_task_wait(token, user_mode(regs)); + } else { rcu_irq_enter(); kvm_async_pf_task_wake(token); rcu_irq_exit(); - return true; } + return true; } NOKPROBE_SYMBOL(__kvm_handle_async_pf); @@ -326,12 +408,12 @@ static void kvm_guest_cpu_init(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); - printk(KERN_INFO"KVM setup async PF for cpu %d\n", - smp_processor_id()); + pr_info("KVM setup async PF for cpu %d\n", smp_processor_id()); } if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) { unsigned long pa; + /* Size alignment is implied but just to make it explicit. 
*/ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __this_cpu_write(kvm_apic_eoi, 0); @@ -352,8 +434,7 @@ static void kvm_pv_disable_apf(void) wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(apf_reason.enabled, 0); - printk(KERN_INFO"Unregister pv shared memory for cpu %d\n", - smp_processor_id()); + pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id()); } static void kvm_pv_guest_cpu_reboot(void *unused) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 8071952e9cf2..dd900a648059 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4198,7 +4198,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, case KVM_PV_REASON_PAGE_NOT_PRESENT: vcpu->arch.apf.host_apf_reason = 0; local_irq_disable(); - kvm_async_pf_task_wait(fault_address, 0); + kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: -- cgit v1.2.3 From 0e5e3d4461a22d739fb2284a6e313fb6cecf2871 Mon Sep 17 00:00:00 2001 From: Benjamin Thiel Date: Sat, 16 May 2020 14:38:16 +0200 Subject: x86/audit: Fix a -Wmissing-prototypes warning for ia32_classify_syscall() Lift the prototype of ia32_classify_syscall() into its own header. Signed-off-by: Benjamin Thiel Signed-off-by: Borislav Petkov Link: https://lkml.kernel.org/r/20200516123816.2680-1-b.thiel@posteo.de --- arch/x86/ia32/audit.c | 1 + arch/x86/include/asm/audit.h | 7 +++++++ arch/x86/kernel/audit_64.c | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) create mode 100644 arch/x86/include/asm/audit.h (limited to 'arch/x86/include') diff --git a/arch/x86/ia32/audit.c b/arch/x86/ia32/audit.c index 3d21eab7aaed..6efe6cb3768a 100644 --- a/arch/x86/ia32/audit.c +++ b/arch/x86/ia32/audit.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include unsigned ia32_dir_class[] = { #include diff --git a/arch/x86/include/asm/audit.h b/arch/x86/include/asm/audit.h new file mode 100644 index 000000000000..36aec57ea7a3 --- /dev/null +++ b/arch/x86/include/asm/audit.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_AUDIT_H +#define _ASM_X86_AUDIT_H + +int ia32_classify_syscall(unsigned int syscall); + +#endif /* _ASM_X86_AUDIT_H */ diff --git a/arch/x86/kernel/audit_64.c b/arch/x86/kernel/audit_64.c index e1efe44ebefc..83d9cad4e68b 100644 --- a/arch/x86/kernel/audit_64.c +++ b/arch/x86/kernel/audit_64.c @@ -3,6 +3,7 @@ #include #include #include +#include static unsigned dir_class[] = { #include @@ -41,7 +42,6 @@ int audit_classify_arch(int arch) int audit_classify_syscall(int abi, unsigned syscall) { #ifdef CONFIG_IA32_EMULATION - extern int ia32_classify_syscall(unsigned); if (abi == AUDIT_ARCH_I386) return ia32_classify_syscall(syscall); #endif -- cgit v1.2.3 From 7357b1df744c2a3bcbe00cea0eef1509d004f488 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 22 Apr 2020 12:57:34 -0700 Subject: KVM: x86: hyperv: Remove duplicate definitions of Reference TSC Page The Hyper-V Reference TSC Page structure is defined twice. struct ms_hyperv_tsc_page has padding out to a full 4 Kbyte page size. But the padding is not needed because the declaration includes a union with HV_HYP_PAGE_SIZE. KVM uses the second definition, which is struct _HV_REFERENCE_TSC_PAGE, because it does not have the padding. Fix the duplication by removing the padding from ms_hyperv_tsc_page. Fix up the KVM code to use it. Remove the no longer used struct _HV_REFERENCE_TSC_PAGE. There is no functional change. 
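For context, both the KVM code that fills the page and the guest clocksource that consumes it follow the TLFS sequence protocol: the reference time is ((TSC * tsc_scale) >> 64) + tsc_offset, valid only if tsc_sequence is non-zero and unchanged across the reads. A simplified reader might look like the sketch below; the helper name, the zero-means-invalid check and the omitted memory barriers are simplifications, not the actual hyperv_timer code:

/*
 * Illustrative sketch of the ms_hyperv_tsc_page read protocol.  Real code
 * needs the appropriate read barriers and an ordered TSC read.
 */
static u64 read_reference_time(const volatile struct ms_hyperv_tsc_page *pg,
			       u64 (*read_tsc)(void))
{
	u64 scale, tsc;
	s64 offset;
	u32 seq;

	do {
		seq = pg->tsc_sequence;
		if (!seq)
			return 0;	/* page not valid: fall back to the reference MSR */
		scale  = pg->tsc_scale;
		offset = pg->tsc_offset;
		tsc    = read_tsc();
	} while (pg->tsc_sequence != seq);

	return (u64)(((unsigned __int128)tsc * scale) >> 64) + offset;
}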
Signed-off-by: Michael Kelley Acked-by: Paolo Bonzini Reviewed-by: Vitaly Kuznetsov Link: https://lore.kernel.org/r/20200422195737.10223-2-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/include/asm/hyperv-tlfs.h | 8 -------- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/hyperv.c | 4 ++-- 3 files changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 29336574d0bc..0e4d76920957 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -303,7 +303,6 @@ struct ms_hyperv_tsc_page { u32 reserved1; volatile u64 tsc_scale; volatile s64 tsc_offset; - u64 reserved2[509]; } __packed; /* @@ -433,13 +432,6 @@ enum HV_GENERIC_SET_FORMAT { */ #define HV_CLOCK_HZ (NSEC_PER_SEC/100) -typedef struct _HV_REFERENCE_TSC_PAGE { - __u32 tsc_sequence; - __u32 res1; - __u64 tsc_scale; - __s64 tsc_offset; -} __packed HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE; - /* Define the number of synthetic interrupt sources. */ #define HV_SYNIC_SINT_COUNT (16) /* Define the expected SynIC version. */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 42a2d0d3984a..4698343b9a05 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -865,7 +865,7 @@ struct kvm_hv { u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; - HV_REFERENCE_TSC_PAGE tsc_ref; + struct ms_hyperv_tsc_page tsc_ref; struct idr conn_to_evt; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index bcefa9d4e57e..1f3c6fd3cdaa 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -900,7 +900,7 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu, * These two equivalencies are implemented in this function. */ static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock, - HV_REFERENCE_TSC_PAGE *tsc_ref) + struct ms_hyperv_tsc_page *tsc_ref) { u64 max_mul; @@ -941,7 +941,7 @@ void kvm_hv_setup_tsc_page(struct kvm *kvm, u64 gfn; BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence)); - BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0); + BUILD_BUG_ON(offsetof(struct ms_hyperv_tsc_page, tsc_sequence) != 0); if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)) return; -- cgit v1.2.3 From a8a42d0284f18a64d540a604aecec9f961964e3b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 22 Apr 2020 12:57:35 -0700 Subject: x86/hyperv: Remove HV_PROCESSOR_POWER_STATE #defines The HV_PROCESSOR_POWER_STATE_C #defines date back to year 2010, but they are not in the TLFS v6.0 document and are not used anywhere in Linux. Remove them. 
Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20200422195737.10223-3-mikelley@microsoft.com Signed-off-by: Wei Liu --- arch/x86/include/asm/hyperv-tlfs.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 0e4d76920957..2dd1ceb2bcf8 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -390,11 +390,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 -#define HV_PROCESSOR_POWER_STATE_C0 0 -#define HV_PROCESSOR_POWER_STATE_C1 1 -#define HV_PROCESSOR_POWER_STATE_C2 2 -#define HV_PROCESSOR_POWER_STATE_C3 3 - #define HV_FLUSH_ALL_PROCESSORS BIT(0) #define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) #define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -- cgit v1.2.3 From c55a844f46f958b4df804eccd734b86795e40359 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 22 Apr 2020 12:57:36 -0700 Subject: x86/hyperv: Split hyperv-tlfs.h into arch dependent and independent files In preparation for adding ARM64 support, split hyperv-tlfs.h into architecture dependent and architecture independent files, similar to what has been done with mshyperv.h. Move architecture independent definitions into include/asm-generic/hyperv-tlfs.h. The split will avoid duplicating significant lines of code in the ARM64 version of hyperv-tlfs.h. The split has no functional impact. Some of the common definitions have "X64" in the symbol name. Change these to remove the "X64" in the architecture independent version of hyperv-tlfs.h, but add aliases with the "X64" in the x86 version so that x86 code will continue to compile. A later patch set will change all the references and allow removal of the aliases. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/20200422195737.10223-4-mikelley@microsoft.com Signed-off-by: Wei Liu --- MAINTAINERS | 1 + arch/x86/include/asm/hyperv-tlfs.h | 459 +++---------------------------------- include/asm-generic/hyperv-tlfs.h | 442 +++++++++++++++++++++++++++++++++++ 3 files changed, 479 insertions(+), 423 deletions(-) create mode 100644 include/asm-generic/hyperv-tlfs.h (limited to 'arch/x86/include') diff --git a/MAINTAINERS b/MAINTAINERS index b816a453b10e..bfb9f384b3f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7883,6 +7883,7 @@ F: drivers/pci/controller/pci-hyperv.c F: drivers/scsi/storvsc_drv.c F: drivers/uio/uio_hv_generic.c F: drivers/video/fbdev/hyperv_fb.c +F: include/asm-generic/hyperv-tlfs.h F: include/asm-generic/mshyperv.h F: include/clocksource/hyperv_timer.h F: include/linux/hyperv.h diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 2dd1ceb2bcf8..4e91f6118d5d 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -11,17 +11,6 @@ #include #include - -/* - * While not explicitly listed in the TLFS, Hyper-V always runs with a page size - * of 4096. These definitions are used when communicating with Hyper-V using - * guest physical pages and guest physical page addresses, since the guest page - * size may not be 4096 on all architectures. - */ -#define HV_HYP_PAGE_SHIFT 12 -#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) -#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) - /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent * is set by CPUID(HvCpuIdFunctionVersionAndFeatures). 
@@ -39,78 +28,41 @@ #define HYPERV_CPUID_MAX 0x4000ffff /* - * Feature identification. EAX indicates which features are available - * to the partition based upon the current partition privileges. - * These are HYPERV_CPUID_FEATURES.EAX bits. + * Aliases for Group A features that have X64 in the name. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EAX bits. */ -/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */ -#define HV_X64_MSR_VP_RUNTIME_AVAILABLE BIT(0) -/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/ -#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) -/* - * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM - * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available - */ -#define HV_X64_MSR_SYNIC_AVAILABLE BIT(2) -/* - * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through - * HV_X64_MSR_STIMER3_COUNT) available - */ -#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) -/* - * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR) - * are available - */ -#define HV_X64_MSR_APIC_ACCESS_AVAILABLE BIT(4) -/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/ -#define HV_X64_MSR_HYPERCALL_AVAILABLE BIT(5) -/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/ -#define HV_X64_MSR_VP_INDEX_AVAILABLE BIT(6) -/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/ -#define HV_X64_MSR_RESET_AVAILABLE BIT(7) -/* - * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE, - * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE, - * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available - */ -#define HV_X64_MSR_STAT_PAGES_AVAILABLE BIT(8) -/* Partition reference TSC MSR is available */ -#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) -/* Partition Guest IDLE MSR is available */ -#define HV_X64_MSR_GUEST_IDLE_AVAILABLE BIT(10) -/* - * There is a single feature flag that signifies if the partition has access - * to MSRs with local APIC and TSC frequencies. - */ -#define HV_X64_ACCESS_FREQUENCY_MSRS BIT(11) -/* AccessReenlightenmentControls privilege */ -#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) -/* AccessTscInvariantControls privilege */ -#define HV_X64_ACCESS_TSC_INVARIANT BIT(15) +#define HV_X64_MSR_VP_RUNTIME_AVAILABLE \ + HV_MSR_VP_RUNTIME_AVAILABLE +#define HV_X64_MSR_SYNIC_AVAILABLE \ + HV_MSR_SYNIC_AVAILABLE +#define HV_X64_MSR_APIC_ACCESS_AVAILABLE \ + HV_MSR_APIC_ACCESS_AVAILABLE +#define HV_X64_MSR_HYPERCALL_AVAILABLE \ + HV_MSR_HYPERCALL_AVAILABLE +#define HV_X64_MSR_VP_INDEX_AVAILABLE \ + HV_MSR_VP_INDEX_AVAILABLE +#define HV_X64_MSR_RESET_AVAILABLE \ + HV_MSR_RESET_AVAILABLE +#define HV_X64_MSR_GUEST_IDLE_AVAILABLE \ + HV_MSR_GUEST_IDLE_AVAILABLE +#define HV_X64_ACCESS_FREQUENCY_MSRS \ + HV_ACCESS_FREQUENCY_MSRS +#define HV_X64_ACCESS_REENLIGHTENMENT \ + HV_ACCESS_REENLIGHTENMENT +#define HV_X64_ACCESS_TSC_INVARIANT \ + HV_ACCESS_TSC_INVARIANT /* - * Feature identification: indicates which flags were specified at partition - * creation. The format is the same as the partition creation flag structure - * defined in section Partition Creation Flags. - * These are HYPERV_CPUID_FEATURES.EBX bits. + * Aliases for Group B features that have X64 in the name. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EBX bits. 
*/ -#define HV_X64_CREATE_PARTITIONS BIT(0) -#define HV_X64_ACCESS_PARTITION_ID BIT(1) -#define HV_X64_ACCESS_MEMORY_POOL BIT(2) -#define HV_X64_ADJUST_MESSAGE_BUFFERS BIT(3) -#define HV_X64_POST_MESSAGES BIT(4) -#define HV_X64_SIGNAL_EVENTS BIT(5) -#define HV_X64_CREATE_PORT BIT(6) -#define HV_X64_CONNECT_PORT BIT(7) -#define HV_X64_ACCESS_STATS BIT(8) -#define HV_X64_DEBUGGING BIT(11) -#define HV_X64_CPU_POWER_MANAGEMENT BIT(12) +#define HV_X64_POST_MESSAGES HV_POST_MESSAGES +#define HV_X64_SIGNAL_EVENTS HV_SIGNAL_EVENTS /* - * Feature identification. EDX indicates which miscellaneous features - * are available to the partition. - * These are HYPERV_CPUID_FEATURES.EDX bits. + * Group D Features. The bit assignments are custom to each architecture. + * On x86/x64 these are HYPERV_CPUID_FEATURES.EDX bits. */ /* The MWAIT instruction is available (per section MONITOR / MWAIT) */ #define HV_X64_MWAIT_AVAILABLE BIT(0) @@ -187,7 +139,7 @@ * processor, except for virtual processors that are reported as sibling SMT * threads. */ -#define HV_X64_NO_NONARCH_CORESHARING BIT(18) +#define HV_X64_NO_NONARCH_CORESHARING BIT(18) /* Nested features. These are HYPERV_CPUID_NESTED_FEATURES.EAX bits. */ #define HV_X64_NESTED_DIRECT_FLUSH BIT(17) @@ -295,42 +247,6 @@ union hv_x64_msr_hypercall_contents { } __packed; }; -/* - * TSC page layout. - */ -struct ms_hyperv_tsc_page { - volatile u32 tsc_sequence; - u32 reserved1; - volatile u64 tsc_scale; - volatile s64 tsc_offset; -} __packed; - -/* - * The guest OS needs to register the guest ID with the hypervisor. - * The guest ID is a 64 bit entity and the structure of this ID is - * specified in the Hyper-V specification: - * - * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx - * - * While the current guideline does not specify how Linux guest ID(s) - * need to be generated, our plan is to publish the guidelines for - * Linux and other guest operating systems that currently are hosted - * on Hyper-V. The implementation here conforms to this yet - * unpublished guidelines. - * - * - * Bit(s) - * 63 - Indicates if the OS is Open Source or not; 1 is Open Source - * 62:56 - Os Type; Linux is 0x100 - * 55:48 - Distro specific identification - * 47:16 - Linux kernel version number - * 15:0 - Distro specific identification - * - * - */ - -#define HV_LINUX_VENDOR_ID 0x8100 - struct hv_reenlightenment_control { __u64 vector:8; __u64 reserved1:8; @@ -354,31 +270,12 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) -/* - * Crash notification (HV_X64_MSR_CRASH_CTL) flags. - */ -#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) -#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) #define HV_IPI_LOW_VECTOR 0x10 #define HV_IPI_HIGH_VECTOR 0xff -/* Declare the various hypercall operations. 
*/ -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 -#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 -#define HVCALL_SEND_IPI 0x000b -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 -#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 -#define HVCALL_SEND_IPI_EX 0x0015 -#define HVCALL_POST_MESSAGE 0x005c -#define HVCALL_SIGNAL_EVENT 0x005d -#define HVCALL_RETARGET_INTERRUPT 0x007e -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af -#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 - #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE 0x00000001 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_MASK \ @@ -390,63 +287,6 @@ struct hv_tsc_emulation_status { #define HV_X64_MSR_TSC_REFERENCE_ENABLE 0x00000001 #define HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT 12 -#define HV_FLUSH_ALL_PROCESSORS BIT(0) -#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) -#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) -#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) - -enum HV_GENERIC_SET_FORMAT { - HV_GENERIC_SET_SPARSE_4K, - HV_GENERIC_SET_ALL, -}; - -#define HV_PARTITION_ID_SELF ((u64)-1) - -#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) -#define HV_HYPERCALL_FAST_BIT BIT(16) -#define HV_HYPERCALL_VARHEAD_OFFSET 17 -#define HV_HYPERCALL_REP_COMP_OFFSET 32 -#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) -#define HV_HYPERCALL_REP_START_OFFSET 48 -#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) - -/* hypercall status code */ -#define HV_STATUS_SUCCESS 0 -#define HV_STATUS_INVALID_HYPERCALL_CODE 2 -#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 -#define HV_STATUS_INVALID_ALIGNMENT 4 -#define HV_STATUS_INVALID_PARAMETER 5 -#define HV_STATUS_INSUFFICIENT_MEMORY 11 -#define HV_STATUS_INVALID_PORT_ID 17 -#define HV_STATUS_INVALID_CONNECTION_ID 18 -#define HV_STATUS_INSUFFICIENT_BUFFERS 19 - -/* - * The Hyper-V TimeRefCount register and the TSC - * page provide a guest VM clock with 100ns tick rate - */ -#define HV_CLOCK_HZ (NSEC_PER_SEC/100) - -/* Define the number of synthetic interrupt sources. */ -#define HV_SYNIC_SINT_COUNT (16) -/* Define the expected SynIC version. */ -#define HV_SYNIC_VERSION_1 (0x1) -/* Valid SynIC vectors are 16-255. */ -#define HV_SYNIC_FIRST_VALID_VECTOR (16) - -#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) -#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) -#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) -#define HV_SYNIC_SINT_MASKED (1ULL << 16) -#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) -#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) - -#define HV_SYNIC_STIMER_COUNT (4) - -/* Define synthetic interrupt controller message constants. */ -#define HV_MESSAGE_SIZE (256) -#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) -#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) /* Define hypervisor message types. */ enum hv_message_type { @@ -457,76 +297,25 @@ enum hv_message_type { HVMSG_GPA_INTERCEPT = 0x80000001, /* Timer notification messages. */ - HVMSG_TIMER_EXPIRED = 0x80000010, + HVMSG_TIMER_EXPIRED = 0x80000010, /* Error messages. */ HVMSG_INVALID_VP_REGISTER_VALUE = 0x80000020, HVMSG_UNRECOVERABLE_EXCEPTION = 0x80000021, - HVMSG_UNSUPPORTED_FEATURE = 0x80000022, + HVMSG_UNSUPPORTED_FEATURE = 0x80000022, /* Trace buffer complete messages. */ HVMSG_EVENTLOG_BUFFERCOMPLETE = 0x80000040, /* Platform-specific processor intercept messages. 
*/ - HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, + HVMSG_X64_IOPORT_INTERCEPT = 0x80010000, HVMSG_X64_MSR_INTERCEPT = 0x80010001, - HVMSG_X64_CPUID_INTERCEPT = 0x80010002, + HVMSG_X64_CPUID_INTERCEPT = 0x80010002, HVMSG_X64_EXCEPTION_INTERCEPT = 0x80010003, - HVMSG_X64_APIC_EOI = 0x80010004, - HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 + HVMSG_X64_APIC_EOI = 0x80010004, + HVMSG_X64_LEGACY_FP_ERROR = 0x80010005 }; -/* Define synthetic interrupt controller message flags. */ -union hv_message_flags { - __u8 asu8; - struct { - __u8 msg_pending:1; - __u8 reserved:7; - } __packed; -}; - -/* Define port identifier type. */ -union hv_port_id { - __u32 asu32; - struct { - __u32 id:24; - __u32 reserved:8; - } __packed u; -}; - -/* Define synthetic interrupt controller message header. */ -struct hv_message_header { - __u32 message_type; - __u8 payload_size; - union hv_message_flags message_flags; - __u8 reserved[2]; - union { - __u64 sender; - union hv_port_id port; - }; -} __packed; - -/* Define synthetic interrupt controller message format. */ -struct hv_message { - struct hv_message_header header; - union { - __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; - } u; -} __packed; - -/* Define the synthetic interrupt message page layout. */ -struct hv_message_page { - struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; -} __packed; - -/* Define timer message payload structure. */ -struct hv_timer_message_payload { - __u32 timer_index; - __u32 reserved; - __u64 expiration_time; /* When the timer expired */ - __u64 delivery_time; /* When the message was delivered */ -} __packed; - struct hv_nested_enlightenments_control { struct { __u32 directhypercall:1; @@ -754,187 +543,11 @@ struct hv_enlightened_vmcs { #define HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL 0xFFFF -/* Define synthetic interrupt controller flag constants. */ -#define HV_EVENT_FLAGS_COUNT (256 * 8) -#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) - -/* - * Synthetic timer configuration. - */ -union hv_stimer_config { - u64 as_uint64; - struct { - u64 enable:1; - u64 periodic:1; - u64 lazy:1; - u64 auto_enable:1; - u64 apic_vector:8; - u64 direct_mode:1; - u64 reserved_z0:3; - u64 sintx:4; - u64 reserved_z1:44; - } __packed; -}; - - -/* Define the synthetic interrupt controller event flags format. */ -union hv_synic_event_flags { - unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; -}; - -/* Define SynIC control register. */ -union hv_synic_scontrol { - u64 as_uint64; - struct { - u64 enable:1; - u64 reserved:63; - } __packed; -}; - -/* Define synthetic interrupt source. 
*/ -union hv_synic_sint { - u64 as_uint64; - struct { - u64 vector:8; - u64 reserved1:8; - u64 masked:1; - u64 auto_eoi:1; - u64 polling:1; - u64 reserved2:45; - } __packed; -}; - -/* Define the format of the SIMP register */ -union hv_synic_simp { - u64 as_uint64; - struct { - u64 simp_enabled:1; - u64 preserved:11; - u64 base_simp_gpa:52; - } __packed; -}; - -/* Define the format of the SIEFP register */ -union hv_synic_siefp { - u64 as_uint64; - struct { - u64 siefp_enabled:1; - u64 preserved:11; - u64 base_siefp_gpa:52; - } __packed; -}; - -struct hv_vpset { - u64 format; - u64 valid_bank_mask; - u64 bank_contents[]; -} __packed; - -/* HvCallSendSyntheticClusterIpi hypercall */ -struct hv_send_ipi { - u32 vector; - u32 reserved; - u64 cpu_mask; -} __packed; - -/* HvCallSendSyntheticClusterIpiEx hypercall */ -struct hv_send_ipi_ex { - u32 vector; - u32 reserved; - struct hv_vpset vp_set; -} __packed; - -/* HvFlushGuestPhysicalAddressSpace hypercalls */ -struct hv_guest_mapping_flush { - u64 address_space; - u64 flags; -} __packed; - -/* - * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited - * by the bitwidth of "additional_pages" in union hv_gpa_page_range. - */ -#define HV_MAX_FLUSH_PAGES (2048) - -/* HvFlushGuestPhysicalAddressList hypercall */ -union hv_gpa_page_range { - u64 address_space; - struct { - u64 additional_pages:11; - u64 largepage:1; - u64 basepfn:52; - } page; -}; - -/* - * All input flush parameters should be in single page. The max flush - * count is equal with how many entries of union hv_gpa_page_range can - * be populated into the input parameter page. - */ -#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ - sizeof(union hv_gpa_page_range)) - -struct hv_guest_mapping_flush_list { - u64 address_space; - u64 flags; - union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; -}; - -/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ -struct hv_tlb_flush { - u64 address_space; - u64 flags; - u64 processor_mask; - u64 gva_list[]; -} __packed; - -/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ -struct hv_tlb_flush_ex { - u64 address_space; - u64 flags; - struct hv_vpset hv_vp_set; - u64 gva_list[]; -} __packed; - struct hv_partition_assist_pg { u32 tlb_lock_count; }; -union hv_msi_entry { - u64 as_uint64; - struct { - u32 address; - u32 data; - } __packed; -}; - -struct hv_interrupt_entry { - u32 source; /* 1 for MSI(-X) */ - u32 reserved1; - union hv_msi_entry msi_entry; -} __packed; -/* - * flags for hv_device_interrupt_target.flags - */ -#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 -#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 - -struct hv_device_interrupt_target { - u32 vector; - u32 flags; - union { - u64 vp_mask; - struct hv_vpset vp_set; - }; -} __packed; +#include -/* HvRetargetDeviceInterrupt hypercall */ -struct hv_retarget_device_interrupt { - u64 partition_id; /* use "self" */ - u64 device_id; - struct hv_interrupt_entry int_entry; - u64 reserved2; - struct hv_device_interrupt_target int_target; -} __packed __aligned(8); #endif diff --git a/include/asm-generic/hyperv-tlfs.h b/include/asm-generic/hyperv-tlfs.h new file mode 100644 index 000000000000..1f92ef92eb56 --- /dev/null +++ b/include/asm-generic/hyperv-tlfs.h @@ -0,0 +1,442 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * This file contains definitions from Hyper-V Hypervisor Top-Level Functional + * Specification (TLFS): + * https://docs.microsoft.com/en-us/virtualization/hyper-v-on-windows/reference/tlfs + */ + 
+#ifndef _ASM_GENERIC_HYPERV_TLFS_H +#define _ASM_GENERIC_HYPERV_TLFS_H + +#include +#include +#include + +/* + * While not explicitly listed in the TLFS, Hyper-V always runs with a page size + * of 4096. These definitions are used when communicating with Hyper-V using + * guest physical pages and guest physical page addresses, since the guest page + * size may not be 4096 on all architectures. + */ +#define HV_HYP_PAGE_SHIFT 12 +#define HV_HYP_PAGE_SIZE BIT(HV_HYP_PAGE_SHIFT) +#define HV_HYP_PAGE_MASK (~(HV_HYP_PAGE_SIZE - 1)) + +/* + * Hyper-V provides two categories of flags relevant to guest VMs. The + * "Features" category indicates specific functionality that is available + * to guests on this particular instance of Hyper-V. The "Features" + * are presented in four groups, each of which is 32 bits. The group A + * and B definitions are common across architectures and are listed here. + * However, not all flags are relevant on all architectures. + * + * Groups C and D vary across architectures and are listed in the + * architecture specific portion of hyperv-tlfs.h. Some of these flags exist + * on multiple architectures, but the bit positions are different so they + * cannot appear in the generic portion of hyperv-tlfs.h. + * + * The "Enlightenments" category provides recommendations on whether to use + * specific enlightenments that are available. The Enlighenments are a single + * group of 32 bits, but they vary across architectures and are listed in + * the architecture specific portion of hyperv-tlfs.h. + */ + +/* + * Group A Features. + */ + +/* VP Runtime register available */ +#define HV_MSR_VP_RUNTIME_AVAILABLE BIT(0) +/* Partition Reference Counter available*/ +#define HV_MSR_TIME_REF_COUNT_AVAILABLE BIT(1) +/* Basic SynIC register available */ +#define HV_MSR_SYNIC_AVAILABLE BIT(2) +/* Synthetic Timer registers available */ +#define HV_MSR_SYNTIMER_AVAILABLE BIT(3) +/* Virtual APIC assist and VP assist page registers available */ +#define HV_MSR_APIC_ACCESS_AVAILABLE BIT(4) +/* Hypercall and Guest OS ID registers available*/ +#define HV_MSR_HYPERCALL_AVAILABLE BIT(5) +/* Access virtual processor index register available*/ +#define HV_MSR_VP_INDEX_AVAILABLE BIT(6) +/* Virtual system reset register available*/ +#define HV_MSR_RESET_AVAILABLE BIT(7) +/* Access statistics page registers available */ +#define HV_MSR_STAT_PAGES_AVAILABLE BIT(8) +/* Partition reference TSC register is available */ +#define HV_MSR_REFERENCE_TSC_AVAILABLE BIT(9) +/* Partition Guest IDLE register is available */ +#define HV_MSR_GUEST_IDLE_AVAILABLE BIT(10) +/* Partition local APIC and TSC frequency registers available */ +#define HV_ACCESS_FREQUENCY_MSRS BIT(11) +/* AccessReenlightenmentControls privilege */ +#define HV_ACCESS_REENLIGHTENMENT BIT(13) +/* AccessTscInvariantControls privilege */ +#define HV_ACCESS_TSC_INVARIANT BIT(15) + +/* + * Group B features. + */ +#define HV_CREATE_PARTITIONS BIT(0) +#define HV_ACCESS_PARTITION_ID BIT(1) +#define HV_ACCESS_MEMORY_POOL BIT(2) +#define HV_ADJUST_MESSAGE_BUFFERS BIT(3) +#define HV_POST_MESSAGES BIT(4) +#define HV_SIGNAL_EVENTS BIT(5) +#define HV_CREATE_PORT BIT(6) +#define HV_CONNECT_PORT BIT(7) +#define HV_ACCESS_STATS BIT(8) +#define HV_DEBUGGING BIT(11) +#define HV_CPU_POWER_MANAGEMENT BIT(12) + + +/* + * TSC page layout. + */ +struct ms_hyperv_tsc_page { + volatile u32 tsc_sequence; + u32 reserved1; + volatile u64 tsc_scale; + volatile s64 tsc_offset; +} __packed; + +/* + * The guest OS needs to register the guest ID with the hypervisor. 
+ * The guest ID is a 64 bit entity and the structure of this ID is + * specified in the Hyper-V specification: + * + * msdn.microsoft.com/en-us/library/windows/hardware/ff542653%28v=vs.85%29.aspx + * + * While the current guideline does not specify how Linux guest ID(s) + * need to be generated, our plan is to publish the guidelines for + * Linux and other guest operating systems that currently are hosted + * on Hyper-V. The implementation here conforms to this yet + * unpublished guidelines. + * + * + * Bit(s) + * 63 - Indicates if the OS is Open Source or not; 1 is Open Source + * 62:56 - Os Type; Linux is 0x100 + * 55:48 - Distro specific identification + * 47:16 - Linux kernel version number + * 15:0 - Distro specific identification + * + * + */ + +#define HV_LINUX_VENDOR_ID 0x8100 + +/* + * Crash notification flags. + */ +#define HV_CRASH_CTL_CRASH_NOTIFY_MSG BIT_ULL(62) +#define HV_CRASH_CTL_CRASH_NOTIFY BIT_ULL(63) + +/* Declare the various hypercall operations. */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 +#define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_SEND_IPI 0x000b +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 +#define HVCALL_SEND_IPI_EX 0x0015 +#define HVCALL_POST_MESSAGE 0x005c +#define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_RETARGET_INTERRUPT 0x007e +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af +#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARSE_4K, + HV_GENERIC_SET_ALL, +}; + +#define HV_PARTITION_ID_SELF ((u64)-1) +#define HV_VP_INDEX_SELF ((u32)-2) + +#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0) +#define HV_HYPERCALL_FAST_BIT BIT(16) +#define HV_HYPERCALL_VARHEAD_OFFSET 17 +#define HV_HYPERCALL_REP_COMP_OFFSET 32 +#define HV_HYPERCALL_REP_COMP_1 BIT_ULL(32) +#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32) +#define HV_HYPERCALL_REP_START_OFFSET 48 +#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48) + +/* hypercall status code */ +#define HV_STATUS_SUCCESS 0 +#define HV_STATUS_INVALID_HYPERCALL_CODE 2 +#define HV_STATUS_INVALID_HYPERCALL_INPUT 3 +#define HV_STATUS_INVALID_ALIGNMENT 4 +#define HV_STATUS_INVALID_PARAMETER 5 +#define HV_STATUS_INSUFFICIENT_MEMORY 11 +#define HV_STATUS_INVALID_PORT_ID 17 +#define HV_STATUS_INVALID_CONNECTION_ID 18 +#define HV_STATUS_INSUFFICIENT_BUFFERS 19 + +/* + * The Hyper-V TimeRefCount register and the TSC + * page provide a guest VM clock with 100ns tick rate + */ +#define HV_CLOCK_HZ (NSEC_PER_SEC/100) + +/* Define the number of synthetic interrupt sources. */ +#define HV_SYNIC_SINT_COUNT (16) +/* Define the expected SynIC version. */ +#define HV_SYNIC_VERSION_1 (0x1) +/* Valid SynIC vectors are 16-255. */ +#define HV_SYNIC_FIRST_VALID_VECTOR (16) + +#define HV_SYNIC_CONTROL_ENABLE (1ULL << 0) +#define HV_SYNIC_SIMP_ENABLE (1ULL << 0) +#define HV_SYNIC_SIEFP_ENABLE (1ULL << 0) +#define HV_SYNIC_SINT_MASKED (1ULL << 16) +#define HV_SYNIC_SINT_AUTO_EOI (1ULL << 17) +#define HV_SYNIC_SINT_VECTOR_MASK (0xFF) + +#define HV_SYNIC_STIMER_COUNT (4) + +/* Define synthetic interrupt controller message constants. 
*/ +#define HV_MESSAGE_SIZE (256) +#define HV_MESSAGE_PAYLOAD_BYTE_COUNT (240) +#define HV_MESSAGE_PAYLOAD_QWORD_COUNT (30) + +/* Define synthetic interrupt controller message flags. */ +union hv_message_flags { + __u8 asu8; + struct { + __u8 msg_pending:1; + __u8 reserved:7; + } __packed; +}; + +/* Define port identifier type. */ +union hv_port_id { + __u32 asu32; + struct { + __u32 id:24; + __u32 reserved:8; + } __packed u; +}; + +/* Define synthetic interrupt controller message header. */ +struct hv_message_header { + __u32 message_type; + __u8 payload_size; + union hv_message_flags message_flags; + __u8 reserved[2]; + union { + __u64 sender; + union hv_port_id port; + }; +} __packed; + +/* Define synthetic interrupt controller message format. */ +struct hv_message { + struct hv_message_header header; + union { + __u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT]; + } u; +} __packed; + +/* Define the synthetic interrupt message page layout. */ +struct hv_message_page { + struct hv_message sint_message[HV_SYNIC_SINT_COUNT]; +} __packed; + +/* Define timer message payload structure. */ +struct hv_timer_message_payload { + __u32 timer_index; + __u32 reserved; + __u64 expiration_time; /* When the timer expired */ + __u64 delivery_time; /* When the message was delivered */ +} __packed; + + +/* Define synthetic interrupt controller flag constants. */ +#define HV_EVENT_FLAGS_COUNT (256 * 8) +#define HV_EVENT_FLAGS_LONG_COUNT (256 / sizeof(unsigned long)) + +/* + * Synthetic timer configuration. + */ +union hv_stimer_config { + u64 as_uint64; + struct { + u64 enable:1; + u64 periodic:1; + u64 lazy:1; + u64 auto_enable:1; + u64 apic_vector:8; + u64 direct_mode:1; + u64 reserved_z0:3; + u64 sintx:4; + u64 reserved_z1:44; + } __packed; +}; + + +/* Define the synthetic interrupt controller event flags format. */ +union hv_synic_event_flags { + unsigned long flags[HV_EVENT_FLAGS_LONG_COUNT]; +}; + +/* Define SynIC control register. */ +union hv_synic_scontrol { + u64 as_uint64; + struct { + u64 enable:1; + u64 reserved:63; + } __packed; +}; + +/* Define synthetic interrupt source. */ +union hv_synic_sint { + u64 as_uint64; + struct { + u64 vector:8; + u64 reserved1:8; + u64 masked:1; + u64 auto_eoi:1; + u64 polling:1; + u64 reserved2:45; + } __packed; +}; + +/* Define the format of the SIMP register */ +union hv_synic_simp { + u64 as_uint64; + struct { + u64 simp_enabled:1; + u64 preserved:11; + u64 base_simp_gpa:52; + } __packed; +}; + +/* Define the format of the SIEFP register */ +union hv_synic_siefp { + u64 as_uint64; + struct { + u64 siefp_enabled:1; + u64 preserved:11; + u64 base_siefp_gpa:52; + } __packed; +}; + +struct hv_vpset { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; +} __packed; + +/* HvCallSendSyntheticClusterIpi hypercall */ +struct hv_send_ipi { + u32 vector; + u32 reserved; + u64 cpu_mask; +} __packed; + +/* HvCallSendSyntheticClusterIpiEx hypercall */ +struct hv_send_ipi_ex { + u32 vector; + u32 reserved; + struct hv_vpset vp_set; +} __packed; + +/* HvFlushGuestPhysicalAddressSpace hypercalls */ +struct hv_guest_mapping_flush { + u64 address_space; + u64 flags; +} __packed; + +/* + * HV_MAX_FLUSH_PAGES = "additional_pages" + 1. It's limited + * by the bitwidth of "additional_pages" in union hv_gpa_page_range. 
+ */ +#define HV_MAX_FLUSH_PAGES (2048) + +/* HvFlushGuestPhysicalAddressList hypercall */ +union hv_gpa_page_range { + u64 address_space; + struct { + u64 additional_pages:11; + u64 largepage:1; + u64 basepfn:52; + } page; +}; + +/* + * All input flush parameters should be in single page. The max flush + * count is equal with how many entries of union hv_gpa_page_range can + * be populated into the input parameter page. + */ +#define HV_MAX_FLUSH_REP_COUNT ((HV_HYP_PAGE_SIZE - 2 * sizeof(u64)) / \ + sizeof(union hv_gpa_page_range)) + +struct hv_guest_mapping_flush_list { + u64 address_space; + u64 flags; + union hv_gpa_page_range gpa_list[HV_MAX_FLUSH_REP_COUNT]; +}; + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_tlb_flush { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +} __packed; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_tlb_flush_ex { + u64 address_space; + u64 flags; + struct hv_vpset hv_vp_set; + u64 gva_list[]; +} __packed; + +/* HvRetargetDeviceInterrupt hypercall */ +union hv_msi_entry { + u64 as_uint64; + struct { + u32 address; + u32 data; + } __packed; +}; + +struct hv_interrupt_entry { + u32 source; /* 1 for MSI(-X) */ + u32 reserved1; + union hv_msi_entry msi_entry; +} __packed; + +/* + * flags for hv_device_interrupt_target.flags + */ +#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1 +#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2 + +struct hv_device_interrupt_target { + u32 vector; + u32 flags; + union { + u64 vp_mask; + struct hv_vpset vp_set; + }; +} __packed; + +struct hv_retarget_device_interrupt { + u64 partition_id; /* use "self" */ + u64 device_id; + struct hv_interrupt_entry int_entry; + u64 reserved2; + struct hv_device_interrupt_target int_target; +} __packed __aligned(8); + +#endif -- cgit v1.2.3 From 9b47c5275614a16fd64359fab73fe6c736bf57a0 Mon Sep 17 00:00:00 2001 From: Arvind Sankar Date: Mon, 18 May 2020 15:07:10 -0400 Subject: efi/libstub: Add definitions for console input and events Add the required typedefs etc for using con_in's simple text input protocol, and for using the boottime event services. Also add the prototype for the "stall" boot service. 
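As one possible use of these definitions (a minimal sketch, not part of this patch: the helper name is hypothetical, and it assumes the stub's efi_system_table() accessor plus the efi_bs_call()/efi_call_proto()/efi_table_attr() wrappers), the stub could wait for a keystroke with a timeout by pairing a timer event with con_in's wait_for_key event:

  static efi_status_t wait_for_key_with_timeout(unsigned long usec,
                                                efi_input_key_t *key)
  {
          efi_simple_text_input_protocol_t *con_in;
          efi_event_t events[2], timer;
          unsigned long index;
          efi_status_t status;

          con_in = efi_table_attr(efi_system_table(), con_in);
          if (!con_in)
                  return EFI_UNSUPPORTED;
          efi_set_event_at(events, 0, efi_table_attr(con_in, wait_for_key));

          status = efi_bs_call(create_event, EFI_EVT_TIMER, 0, NULL, NULL, &timer);
          if (status != EFI_SUCCESS)
                  return status;

          /* set_timer counts in 100ns units, hence EFI_100NSEC_PER_USEC */
          status = efi_bs_call(set_timer, timer, EfiTimerRelative,
                               EFI_100NSEC_PER_USEC * usec);
          if (status != EFI_SUCCESS)
                  goto out_close;
          efi_set_event_at(events, 1, timer);

          status = efi_bs_call(wait_for_event, 2, events, &index);
          if (status == EFI_SUCCESS)
                  status = (index == 0) ? efi_call_proto(con_in, read_keystroke, key)
                                        : EFI_TIMEOUT;

  out_close:
          efi_bs_call(close_event, timer);
          return status;
  }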
Signed-off-by: Arvind Sankar Link: https://lore.kernel.org/r/20200518190716.751506-19-nivedita@alum.mit.edu Signed-off-by: Ard Biesheuvel --- arch/x86/include/asm/efi.h | 10 +++++ arch/x86/xen/efi.c | 2 +- drivers/firmware/efi/libstub/efistub.h | 77 +++++++++++++++++++++++++++++++--- include/linux/efi.h | 3 +- 4 files changed, 85 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 6b9ab0d8b2a7..89dcc7aa7e2c 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -9,6 +9,7 @@ #include #include #include +#include extern unsigned long efi_fw_vendor, efi_config_table; @@ -293,6 +294,15 @@ static inline u32 efi64_convert_status(efi_status_t status) #define __efi64_argmap_allocate_pool(type, size, buffer) \ ((type), (size), efi64_zero_upper(buffer)) +#define __efi64_argmap_create_event(type, tpl, f, c, event) \ + ((type), (tpl), (f), (c), efi64_zero_upper(event)) + +#define __efi64_argmap_set_timer(event, type, time) \ + ((event), (type), lower_32_bits(time), upper_32_bits(time)) + +#define __efi64_argmap_wait_for_event(num, event, index) \ + ((num), (event), efi64_zero_upper(index)) + #define __efi64_argmap_handle_protocol(handle, protocol, interface) \ ((handle), (protocol), efi64_zero_upper(interface)) diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c index 1abe455d926a..205a9bc981b0 100644 --- a/arch/x86/xen/efi.c +++ b/arch/x86/xen/efi.c @@ -29,7 +29,7 @@ static efi_system_table_t efi_systab_xen __initdata = { .fw_vendor = EFI_INVALID_TABLE_ADDR, /* Initialized later. */ .fw_revision = 0, /* Initialized later. */ .con_in_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ - .con_in = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ + .con_in = NULL, /* Not used under Xen. */ .con_out_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ .con_out = NULL, /* Not used under Xen. */ .stderr_handle = EFI_INVALID_TABLE_ADDR, /* Not used under Xen. */ diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 3a323a009836..c7c03099367f 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -111,6 +111,16 @@ void efi_set_u64_split(u64 data, u32 *lo, u32 *hi) #define EFI_LOCATE_BY_REGISTER_NOTIFY 1 #define EFI_LOCATE_BY_PROTOCOL 2 +/* + * boottime->stall takes the time period in microseconds + */ +#define EFI_USEC_PER_SEC 1000000 + +/* + * boottime->set_timer takes the time in 100ns units + */ +#define EFI_100NSEC_PER_USEC ((u64)10) + struct efi_boot_memmap { efi_memory_desc_t **map; unsigned long *map_size; @@ -122,6 +132,39 @@ struct efi_boot_memmap { typedef struct efi_generic_dev_path efi_device_path_protocol_t; +typedef void *efi_event_t; +/* Note that notifications won't work in mixed mode */ +typedef void (__efiapi *efi_event_notify_t)(efi_event_t, void *); + +#define EFI_EVT_TIMER 0x80000000U +#define EFI_EVT_RUNTIME 0x40000000U +#define EFI_EVT_NOTIFY_WAIT 0x00000100U +#define EFI_EVT_NOTIFY_SIGNAL 0x00000200U + +/* + * boottime->wait_for_event takes an array of events as input. + * Provide a helper to set it up correctly for mixed mode. 
+ */ +static inline +void efi_set_event_at(efi_event_t *events, size_t idx, efi_event_t event) +{ + if (efi_is_native()) + events[idx] = event; + else + ((u32 *)events)[idx] = (u32)(unsigned long)event; +} + +#define EFI_TPL_APPLICATION 4 +#define EFI_TPL_CALLBACK 8 +#define EFI_TPL_NOTIFY 16 +#define EFI_TPL_HIGH_LEVEL 31 + +typedef enum { + EfiTimerCancel, + EfiTimerPeriodic, + EfiTimerRelative +} EFI_TIMER_DELAY; + /* * EFI Boot Services table */ @@ -140,11 +183,16 @@ union efi_boot_services { efi_status_t (__efiapi *allocate_pool)(int, unsigned long, void **); efi_status_t (__efiapi *free_pool)(void *); - void *create_event; - void *set_timer; - void *wait_for_event; + efi_status_t (__efiapi *create_event)(u32, unsigned long, + efi_event_notify_t, void *, + efi_event_t *); + efi_status_t (__efiapi *set_timer)(efi_event_t, + EFI_TIMER_DELAY, u64); + efi_status_t (__efiapi *wait_for_event)(unsigned long, + efi_event_t *, + unsigned long *); void *signal_event; - void *close_event; + efi_status_t (__efiapi *close_event)(efi_event_t); void *check_event; void *install_protocol_interface; void *reinstall_protocol_interface; @@ -171,7 +219,7 @@ union efi_boot_services { efi_status_t (__efiapi *exit_boot_services)(efi_handle_t, unsigned long); void *get_next_monotonic_count; - void *stall; + efi_status_t (__efiapi *stall)(unsigned long); void *set_watchdog_timer; void *connect_controller; efi_status_t (__efiapi *disconnect_controller)(efi_handle_t, @@ -256,6 +304,25 @@ union efi_uga_draw_protocol { } mixed_mode; }; +typedef struct { + u16 scan_code; + efi_char16_t unicode_char; +} efi_input_key_t; + +union efi_simple_text_input_protocol { + struct { + void *reset; + efi_status_t (__efiapi *read_keystroke)(efi_simple_text_input_protocol_t *, + efi_input_key_t *); + efi_event_t wait_for_key; + }; + struct { + u32 reset; + u32 read_keystroke; + u32 wait_for_key; + } mixed_mode; +}; + union efi_simple_text_output_protocol { struct { void *reset; diff --git a/include/linux/efi.h b/include/linux/efi.h index 9b7c7ec319ac..974648db0c68 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -426,6 +426,7 @@ typedef struct { u32 tables; } efi_system_table_32_t; +typedef union efi_simple_text_input_protocol efi_simple_text_input_protocol_t; typedef union efi_simple_text_output_protocol efi_simple_text_output_protocol_t; typedef union { @@ -434,7 +435,7 @@ typedef union { unsigned long fw_vendor; /* physical addr of CHAR16 vendor string */ u32 fw_revision; unsigned long con_in_handle; - unsigned long con_in; + efi_simple_text_input_protocol_t *con_in; unsigned long con_out_handle; efi_simple_text_output_protocol_t *con_out; unsigned long stderr_handle; -- cgit v1.2.3 From c071b0f11e7fb944525b12c80e728af69648d967 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 22 May 2020 22:22:45 -0700 Subject: x86: bitops: fix build regression This is easily reproducible via CC=clang + CONFIG_STAGING=y + CONFIG_VT6656=m. It turns out that if your config tickles __builtin_constant_p via differences in choices to inline or not, these statements produce invalid assembly: $ cat foo.c long a(long b, long c) { asm("orb %1, %0" : "+q"(c): "r"(b)); return c; } $ gcc foo.c foo.c: Assembler messages: foo.c:2: Error: `%rax' not allowed with `orb' Use the `%b` "x86 Operand Modifier" to instead force register allocation to select a lower-8-bit GPR operand. The "q" constraint only has meaning on -m32 otherwise is treated as "r". Not all GPRs have low-8-bit aliases for -m32. 
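For illustration, a standalone variant of the reproducer above with the modifier applied (a sketch only: the destination here is a byte in memory, as in the kernel's CONST_MASK_ADDR() case, and the "q" input constraint keeps -m32 register allocation to GPRs that do have low-8-bit aliases):

  $ cat foo.c
  long a(long b, unsigned char *c)
  {
          /* %b1 prints the low-8-bit alias (e.g. %al, %dil) of the register holding b */
          asm("orb %b1, %0" : "+m"(*c) : "q"(b));
          return *c;
  }
  $ gcc -c foo.c   # assembles cleanly; no byte-suffix register error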
Fixes: 1651e700664b4 ("x86: Fix bitops.h warning with a moved cast") Reported-by: kernelci.org bot Suggested-by: Andy Shevchenko Suggested-by: Brian Gerst Suggested-by: H. Peter Anvin Suggested-by: Ilie Halip Signed-off-by: Nick Desaulniers Signed-off-by: Andrew Morton Tested-by: Sedat Dilek Tested-by: Nathan Chancellor [build, clang-11] Reviewed-by: Nathan Chancellor Reviewed-By: Brian Gerst Reviewed-by: Jesse Brandeburg Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Marco Elver Cc: "Paul E. McKenney" Cc: Andrey Ryabinin Cc: Luc Van Oostenryck Cc: Masahiro Yamada Cc: Daniel Axtens Cc: "Peter Zijlstra (Intel)" Link: http://lkml.kernel.org/r/20200508183230.229464-1-ndesaulniers@google.com Link: https://github.com/ClangBuiltLinux/linux/issues/961 Link: https://lore.kernel.org/lkml/20200504193524.GA221287@google.com/ Link: https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html#x86Operandmodifiers Signed-off-by: Linus Torvalds --- arch/x86/include/asm/bitops.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 53f246e9df5a..0367efdc5b7a 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -52,9 +52,9 @@ static __always_inline void arch_set_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "orb %1,%0" + asm volatile(LOCK_PREFIX "orb %b1,%0" : CONST_MASK_ADDR(nr, addr) - : "iq" (CONST_MASK(nr) & 0xff) + : "iq" (CONST_MASK(nr)) : "memory"); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" @@ -72,9 +72,9 @@ static __always_inline void arch_clear_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "andb %1,%0" + asm volatile(LOCK_PREFIX "andb %b1,%0" : CONST_MASK_ADDR(nr, addr) - : "iq" (CONST_MASK(nr) ^ 0xff)); + : "iq" (~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); @@ -123,9 +123,9 @@ static __always_inline void arch_change_bit(long nr, volatile unsigned long *addr) { if (__builtin_constant_p(nr)) { - asm volatile(LOCK_PREFIX "xorb %1,%0" + asm volatile(LOCK_PREFIX "xorb %b1,%0" : CONST_MASK_ADDR(nr, addr) - : "iq" ((u8)CONST_MASK(nr))); + : "iq" (CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); -- cgit v1.2.3 From 700d3a5a664df267f01ec8887fd2d8ff98f67e7f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 8 May 2020 17:25:32 -0700 Subject: x86/syscalls: Revert "x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long" Revert 45e29d119e99 ("x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long") and add a comment to discourage someone else from making the same mistake again. It turns out that some user code fails to compile if __X32_SYSCALL_BIT is unsigned long. See, for example [1] below. [ bp: Massage and do the same thing in the respective tools/ header. 
] Fixes: 45e29d119e99 ("x86/syscalls: Make __X32_SYSCALL_BIT be unsigned long") Reported-by: Thorsten Glaser Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: stable@kernel.org Link: [1] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=954294 Link: https://lkml.kernel.org/r/92e55442b744a5951fdc9cfee10badd0a5f7f828.1588983892.git.luto@kernel.org --- arch/x86/include/uapi/asm/unistd.h | 11 +++++++++-- tools/arch/x86/include/uapi/asm/unistd.h | 2 +- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/unistd.h b/arch/x86/include/uapi/asm/unistd.h index 196fdd02b8b1..be5e2e747f50 100644 --- a/arch/x86/include/uapi/asm/unistd.h +++ b/arch/x86/include/uapi/asm/unistd.h @@ -2,8 +2,15 @@ #ifndef _UAPI_ASM_X86_UNISTD_H #define _UAPI_ASM_X86_UNISTD_H -/* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000UL +/* + * x32 syscall flag bit. Some user programs expect syscall NR macros + * and __X32_SYSCALL_BIT to have type int, even though syscall numbers + * are, for practical purposes, unsigned long. + * + * Fortunately, expressions like (nr & ~__X32_SYSCALL_BIT) do the right + * thing regardless. + */ +#define __X32_SYSCALL_BIT 0x40000000 #ifndef __KERNEL__ # ifdef __i386__ diff --git a/tools/arch/x86/include/uapi/asm/unistd.h b/tools/arch/x86/include/uapi/asm/unistd.h index 196fdd02b8b1..30d7d04d72d6 100644 --- a/tools/arch/x86/include/uapi/asm/unistd.h +++ b/tools/arch/x86/include/uapi/asm/unistd.h @@ -3,7 +3,7 @@ #define _UAPI_ASM_X86_UNISTD_H /* x32 syscall flag bit */ -#define __X32_SYSCALL_BIT 0x40000000UL +#define __X32_SYSCALL_BIT 0x40000000 #ifndef __KERNEL__ # ifdef __i386__ -- cgit v1.2.3 From 003d80535180f74f262c40462b9fccd7f004901a Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 May 2020 12:09:43 +0200 Subject: x86/apb_timer: Drop unused TSC calibration Drop the APB-timer TSC calibration, which hasn't been used since the removal of Moorestown support by commit 1a8359e411eb ("x86/mid: Remove Intel Moorestown"). 
Signed-off-by: Johan Hovold Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200513100944.9171-1-johan@kernel.org --- arch/x86/include/asm/apb_timer.h | 2 -- arch/x86/kernel/apb_timer.c | 53 ---------------------------------------- 2 files changed, 55 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index 99bb207fc04c..0a9bf8b77a28 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -28,7 +28,6 @@ #define APBT_DEV_USED 1 extern void apbt_time_init(void); -extern unsigned long apbt_quick_calibrate(void); extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); @@ -38,7 +37,6 @@ extern int sfi_mtimer_num; #else /* CONFIG_APB_TIMER */ -static inline unsigned long apbt_quick_calibrate(void) {return 0; } static inline void apbt_time_init(void) { } #endif diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index fe698f96617c..263eeaddb0aa 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -345,56 +345,3 @@ out_noapbt: apb_timer_block_enabled = 0; panic("failed to enable APB timer\n"); } - -/* called before apb_timer_enable, use early map */ -unsigned long apbt_quick_calibrate(void) -{ - int i, scale; - u64 old, new; - u64 t1, t2; - unsigned long khz = 0; - u32 loop, shift; - - apbt_set_mapping(); - dw_apb_clocksource_start(clocksource_apbt); - - /* check if the timer can count down, otherwise return */ - old = dw_apb_clocksource_read(clocksource_apbt); - i = 10000; - while (--i) { - if (old != dw_apb_clocksource_read(clocksource_apbt)) - break; - } - if (!i) - goto failed; - - /* count 16 ms */ - loop = (apbt_freq / 1000) << 4; - - /* restart the timer to ensure it won't get to 0 in the calibration */ - dw_apb_clocksource_start(clocksource_apbt); - - old = dw_apb_clocksource_read(clocksource_apbt); - old += loop; - - t1 = rdtsc(); - - do { - new = dw_apb_clocksource_read(clocksource_apbt); - } while (new < old); - - t2 = rdtsc(); - - shift = 5; - if (unlikely(loop >> shift == 0)) { - printk(KERN_INFO - "APBT TSC calibration failed, not enough resolution\n"); - return 0; - } - scale = (int)div_u64((t2 - t1), loop >> shift); - khz = (scale * (apbt_freq / 1000)) >> shift; - printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz); - return khz; -failed: - return 0; -} -- cgit v1.2.3 From e027a2bc934fd05d52ec5b77d159efdfc485b5b3 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 13 May 2020 12:09:44 +0200 Subject: x86/apb_timer: Drop unused declaration and macro Drop an extern declaration that has never been used and a no longer needed macro. 
Signed-off-by: Johan Hovold Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20200513100944.9171-2-johan@kernel.org --- arch/x86/include/asm/apb_timer.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h index 0a9bf8b77a28..87ce8e963215 100644 --- a/arch/x86/include/asm/apb_timer.h +++ b/arch/x86/include/asm/apb_timer.h @@ -25,10 +25,7 @@ #define APBT_MIN_FREQ 1000000 #define APBT_MMAP_SIZE 1024 -#define APBT_DEV_USED 1 - extern void apbt_time_init(void); -extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); extern void apbt_setup_secondary_clock(void); extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); -- cgit v1.2.3 From ed3119e45519880a246b085ba6a96aa1074da532 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 18 May 2020 14:08:55 +0200 Subject: x86: Hide the archdata.iommu field behind generic IOMMU_API There is a generic, kernel wide configuration symbol for enabling the IOMMU specific bits: CONFIG_IOMMU_API. Implementations (including INTEL_IOMMU and AMD_IOMMU driver) select it so use it here as well. This makes the conditional archdata.iommu field consistent with other platforms and also fixes any compile test builds of other IOMMU drivers, when INTEL_IOMMU or AMD_IOMMU are not selected). For the case when INTEL_IOMMU/AMD_IOMMU and COMPILE_TEST are not selected, this should create functionally equivalent code/choice. With COMPILE_TEST this field could appear if other IOMMU drivers are chosen but neither INTEL_IOMMU nor AMD_IOMMU are not. Reported-by: kbuild test robot Fixes: e93a1695d7fb ("iommu: Enable compile testing for some of drivers") Signed-off-by: Krzysztof Kozlowski Acked-by: Borislav Petkov Link: https://lore.kernel.org/r/20200518120855.27822-2-krzk@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/include/asm/device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h index 7e31f7f1bb06..49bd6cf3eec9 100644 --- a/arch/x86/include/asm/device.h +++ b/arch/x86/include/asm/device.h @@ -3,7 +3,7 @@ #define _ASM_X86_DEVICE_H struct dev_archdata { -#if defined(CONFIG_INTEL_IOMMU) || defined(CONFIG_AMD_IOMMU) +#ifdef CONFIG_IOMMU_API void *iommu; /* hook for IOMMU specific extension */ #endif }; -- cgit v1.2.3 From cb97c2d680dd9cefd2b36cafd2426f52d85b27e7 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 18 Feb 2020 15:40:11 -0800 Subject: KVM: x86: Take an unsigned 32-bit int for has_emulated_msr()'s index Take a u32 for the index in has_emulated_msr() to match hardware, which treats MSR indices as unsigned 32-bit values. Functionally, taking a signed int doesn't cause problems with the current code base, but could theoretically cause problems with 32-bit KVM, e.g. if the index were checked via a less-than statement, which would evaluate incorrectly for MSR indices with bit 31 set. 
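A contrived user-space sketch of that pitfall (illustrative only, not kernel code; the 0x2000 bound stands in for an "is this in the low MSR range" style check):

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          int      s = (int)0xc0000080;   /* e.g. MSR_EFER; negative as a signed int */
          uint32_t u = 0xc0000080;

          printf("signed:   %d\n", s < 0x2000);   /* prints 1: bogus "in range" result */
          printf("unsigned: %d\n", u < 0x2000);   /* prints 0: correct */
          return 0;
  }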
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Sean Christopherson Message-Id: <20200218234012.7110-3-sean.j.christopherson@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/vmx/vmx.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index fd78bd44b2d6..db261da578f3 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1064,7 +1064,7 @@ struct kvm_x86_ops { void (*hardware_disable)(void); void (*hardware_unsetup)(void); bool (*cpu_has_accelerated_tpr)(void); - bool (*has_emulated_msr)(int index); + bool (*has_emulated_msr)(u32 index); void (*cpuid_update)(struct kvm_vcpu *vcpu); unsigned int vm_size; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index e9c0fb68387d..d877a0f70cac 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3525,7 +3525,7 @@ static bool svm_cpu_has_accelerated_tpr(void) return false; } -static bool svm_has_emulated_msr(int index) +static bool svm_has_emulated_msr(u32 index) { switch (index) { case MSR_IA32_MCG_EXT_CTL: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 0ea5a225a579..ab31033121f9 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6434,7 +6434,7 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) handle_exception_nmi_irqoff(vmx); } -static bool vmx_has_emulated_msr(int index) +static bool vmx_has_emulated_msr(u32 index) { switch (index) { case MSR_IA32_SMBASE: -- cgit v1.2.3 From 5cde265384cad739b162cf08afba6da8857778bd Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 27 May 2020 15:46:59 -0700 Subject: perf/x86/rapl: Add AMD Fam17h RAPL support This patch enables AMD Fam17h RAPL support for the Package level metric. The support is as per AMD Fam17h Model31h (Zen2) and model 00-ffh (Zen1) PPR. 
The same output is available via the energy-pkg pseudo event: $ perf stat -a -I 1000 --per-socket -e power/energy-pkg/ Signed-off-by: Stephane Eranian Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20200527224659.206129-6-eranian@google.com --- arch/x86/events/rapl.c | 18 ++++++++++++++++++ arch/x86/include/asm/msr-index.h | 3 +++ 2 files changed, 21 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c index 8d17af4b1ca9..0f2bf59f4354 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -537,6 +537,16 @@ static struct perf_msr intel_rapl_msrs[] = { [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group, test_msr }, }; +/* + * Force to PERF_RAPL_MAX size due to: + * - perf_msr_probe(PERF_RAPL_MAX) + * - want to use same event codes across both architectures + */ +static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = { + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr }, +}; + + static int rapl_cpu_offline(unsigned int cpu) { struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); @@ -740,6 +750,13 @@ static struct rapl_model model_skl = { .rapl_msrs = intel_rapl_msrs, }; +static struct rapl_model model_amd_fam17h = { + .events = BIT(PERF_RAPL_PKG), + .apply_quirk = false, + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, +}; + static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE, &model_snb), X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X, &model_snbep), @@ -770,6 +787,7 @@ static const struct x86_cpu_id rapl_model_match[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &model_hsx), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L, &model_skl), X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE, &model_skl), + X86_MATCH_VENDOR_FAM(AMD, 0x17, &model_amd_fam17h), {}, }; MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 12c9684d59ba..ef452b817f44 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -301,6 +301,9 @@ #define MSR_PP1_ENERGY_STATUS 0x00000641 #define MSR_PP1_POLICY 0x00000642 +#define MSR_AMD_PKG_ENERGY_STATUS 0xc001029b +#define MSR_AMD_RAPL_POWER_UNIT 0xc0010299 + /* Config TDP MSRs */ #define MSR_CONFIG_TDP_NOMINAL 0x00000648 #define MSR_CONFIG_TDP_LEVEL_1 0x00000649 -- cgit v1.2.3 From c9d40913ac5a21eb2b976bb221a4677540e84eba Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 22 May 2020 11:21:49 -0400 Subject: KVM: x86: enable event window in inject_pending_event In case an interrupt arrives after nested.check_events but before the call to kvm_cpu_has_injectable_intr, we could end up enabling the interrupt window even if the interrupt is actually going to be a vmexit. This is useless rather than harmful, but it really complicates reasoning about SVM's handling of the VINTR intercept. We'd like to never bother with the VINTR intercept if V_INTR_MASKING=1 && INTERCEPT_INTR=1, because in that case there is no interrupt window and we can just exit the nested guest whenever we want. This patch moves the opening of the interrupt window inside inject_pending_event. This consolidates the check for pending interrupt/NMI/SMI in one place, and makes KVM's usage of immediate exits more consistent, extending it beyond just nested virtualization. There are two functional changes here. They only affect corner cases, but overall they simplify the inject_pending_event. 
- re-injection of still-pending events will also use req_immediate_exit instead of using interrupt-window intercepts. This should have no impact on performance on Intel since it simply replaces an interrupt-window or NMI-window exit for a preemption-timer exit. On AMD, which has no equivalent of the preemption time, it may incur some overhead but an actual effect on performance should only be visible in pathological cases. - kvm_arch_interrupt_allowed and kvm_vcpu_has_events will return true if an interrupt, NMI or SMI is blocked by nested_run_pending. This makes sense because entering the VM will allow it to make progress and deliver the event. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +-- arch/x86/kvm/svm/svm.c | 24 ++++----- arch/x86/kvm/vmx/vmx.c | 20 +++---- arch/x86/kvm/x86.c | 117 ++++++++++++++++++++++------------------ 4 files changed, 92 insertions(+), 77 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index db261da578f3..7707bd4b0593 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1136,8 +1136,8 @@ struct kvm_x86_ops { void (*set_nmi)(struct kvm_vcpu *vcpu); void (*queue_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); - bool (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); - bool (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); @@ -1234,10 +1234,10 @@ struct kvm_x86_ops { void (*setup_mce)(struct kvm_vcpu *vcpu); - bool (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); + int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); - int (*enable_smi_window)(struct kvm_vcpu *vcpu); + void (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index d877a0f70cac..5da494847a2f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3053,15 +3053,15 @@ bool svm_nmi_blocked(struct kvm_vcpu *vcpu) return ret; } -static bool svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) - return false; + return -EBUSY; /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) - return false; + return -EBUSY; return !svm_nmi_blocked(vcpu); } @@ -3112,18 +3112,18 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); } -static bool svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) - return false; + return -EBUSY; /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. 
if the IRQ arrived asynchronously after checking nested events. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) - return false; + return -EBUSY; return !svm_interrupt_blocked(vcpu); } @@ -3793,15 +3793,15 @@ bool svm_smi_blocked(struct kvm_vcpu *vcpu) return is_smm(vcpu); } -static bool svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { struct vcpu_svm *svm = to_svm(vcpu); if (svm->nested.nested_run_pending) - return false; + return -EBUSY; /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) - return false; + return -EBUSY; return !svm_smi_blocked(vcpu); } @@ -3848,7 +3848,7 @@ static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 0; } -static int enable_smi_window(struct kvm_vcpu *vcpu) +static void enable_smi_window(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3856,9 +3856,9 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) if (vgif_enabled(svm)) set_intercept(svm, INTERCEPT_STGI); /* STGI will cause a vm exit */ - return 1; + } else { + /* We must be in SMM; RSM will cause a vmexit anyway. */ } - return 0; } static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 4e76e30b661c..b3a41645e157 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4551,14 +4551,14 @@ bool vmx_nmi_blocked(struct kvm_vcpu *vcpu) GUEST_INTR_STATE_NMI)); } -static bool vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) - return false; + return -EBUSY; /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu)) - return false; + return -EBUSY; return !vmx_nmi_blocked(vcpu); } @@ -4573,17 +4573,17 @@ bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu) (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)); } -static bool vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) { if (to_vmx(vcpu)->nested.nested_run_pending) - return false; + return -EBUSY; /* * An IRQ must not be injected into L2 if it's supposed to VM-Exit, * e.g. if the IRQ arrived asynchronously after checking nested events. */ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) - return false; + return -EBUSY; return !vmx_interrupt_blocked(vcpu); } @@ -7757,11 +7757,11 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEAT_CTL_LMCE_ENABLED; } -static bool vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) +static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) { /* we need a nested vmexit to enter SMM, postpone if run is pending */ if (to_vmx(vcpu)->nested.nested_run_pending) - return false; + return -EBUSY; return !is_smm(vcpu); } @@ -7799,9 +7799,9 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) return 0; } -static int enable_smi_window(struct kvm_vcpu *vcpu) +static void enable_smi_window(struct kvm_vcpu *vcpu) { - return 0; + /* RSM will cause a vmexit anyway. 
*/ } static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 77b9b4e66673..0ee828f60d05 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7714,7 +7714,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu) kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr); } -static int inject_pending_event(struct kvm_vcpu *vcpu) +static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) { int r; bool can_inject = true; @@ -7760,8 +7760,8 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) */ if (is_guest_mode(vcpu)) { r = kvm_x86_ops.nested_ops->check_events(vcpu); - if (r != 0) - return r; + if (r < 0) + goto busy; } /* try to inject new event if pending */ @@ -7799,27 +7799,69 @@ static int inject_pending_event(struct kvm_vcpu *vcpu) can_inject = false; } - /* Finish re-injection before considering new events */ - if (!can_inject) - return 0; + /* + * Finally, inject interrupt events. If an event cannot be injected + * due to architectural conditions (e.g. IF=0) a window-open exit + * will re-request KVM_REQ_EVENT. Sometimes however an event is pending + * and can architecturally be injected, but we cannot do it right now: + * an interrupt could have arrived just now and we have to inject it + * as a vmexit, or there could already an event in the queue, which is + * indicated by can_inject. In that case we request an immediate exit + * in order to make progress and get back here for another iteration. + * The kvm_x86_ops hooks communicate this by returning -EBUSY. + */ + if (vcpu->arch.smi_pending) { + r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + vcpu->arch.smi_pending = false; + ++vcpu->arch.smi_count; + enter_smm(vcpu); + can_inject = false; + } else + kvm_x86_ops.enable_smi_window(vcpu); + } + + if (vcpu->arch.nmi_pending) { + r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + --vcpu->arch.nmi_pending; + vcpu->arch.nmi_injected = true; + kvm_x86_ops.set_nmi(vcpu); + can_inject = false; + WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0); + } + if (vcpu->arch.nmi_pending) + kvm_x86_ops.enable_nmi_window(vcpu); + } - if (vcpu->arch.smi_pending && - kvm_x86_ops.smi_allowed(vcpu, true)) { - vcpu->arch.smi_pending = false; - ++vcpu->arch.smi_count; - enter_smm(vcpu); - } else if (vcpu->arch.nmi_pending && - kvm_x86_ops.nmi_allowed(vcpu, true)) { - --vcpu->arch.nmi_pending; - vcpu->arch.nmi_injected = true; - kvm_x86_ops.set_nmi(vcpu); - } else if (kvm_cpu_has_injectable_intr(vcpu) && - kvm_x86_ops.interrupt_allowed(vcpu, true)) { - kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); - kvm_x86_ops.set_irq(vcpu); + if (kvm_cpu_has_injectable_intr(vcpu)) { + r = can_inject ? 
kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY; + if (r < 0) + goto busy; + if (r) { + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false); + kvm_x86_ops.set_irq(vcpu); + WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0); + } + if (kvm_cpu_has_injectable_intr(vcpu)) + kvm_x86_ops.enable_irq_window(vcpu); } - return 0; + if (is_guest_mode(vcpu) && + kvm_x86_ops.nested_ops->hv_timer_pending && + kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) + *req_immediate_exit = true; + + WARN_ON(vcpu->arch.exception.pending); + return; + +busy: + *req_immediate_exit = true; + return; } static void process_nmi(struct kvm_vcpu *vcpu) @@ -8357,36 +8399,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto out; } - if (inject_pending_event(vcpu) != 0) - req_immediate_exit = true; - else { - /* Enable SMI/NMI/IRQ window open exits if needed. - * - * SMIs have three cases: - * 1) They can be nested, and then there is nothing to - * do here because RSM will cause a vmexit anyway. - * 2) There is an ISA-specific reason why SMI cannot be - * injected, and the moment when this changes can be - * intercepted. - * 3) Or the SMI can be pending because - * inject_pending_event has completed the injection - * of an IRQ or NMI from the previous vmexit, and - * then we request an immediate exit to inject the - * SMI. - */ - if (vcpu->arch.smi_pending && !is_smm(vcpu)) - if (!kvm_x86_ops.enable_smi_window(vcpu)) - req_immediate_exit = true; - if (vcpu->arch.nmi_pending) - kvm_x86_ops.enable_nmi_window(vcpu); - if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win) - kvm_x86_ops.enable_irq_window(vcpu); - if (is_guest_mode(vcpu) && - kvm_x86_ops.nested_ops->hv_timer_pending && - kvm_x86_ops.nested_ops->hv_timer_pending(vcpu)) - req_immediate_exit = true; - WARN_ON(vcpu->arch.exception.pending); - } + inject_pending_event(vcpu, &req_immediate_exit); + if (req_int_win) + kvm_x86_ops.enable_irq_window(vcpu); if (kvm_lapic_enabled(vcpu)) { update_cr8_intercept(vcpu); -- cgit v1.2.3 From 7c86663b68bab393633d8312a0d25a3d004de182 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Sat, 16 May 2020 08:42:28 -0400 Subject: KVM: nSVM: inject exceptions via svm_check_nested_events This allows exceptions injected by the emulator to be properly delivered as vmexits. The code also becomes simpler, because we can just let all L0-intercepted exceptions go through the usual path. In particular, our emulation of the VMX #DB exit qualification is very much simplified, because the vmexit injection path can use kvm_deliver_exception_payload to update DR6. 
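As a stand-alone illustration (made-up types, not the kernel code), the intercept decision that svm_check_nested_events now performs reduces to testing the pending vector against L1's exception intercept bitmap and, on a hit, synthesizing an exit code of SVM_EXIT_EXCP_BASE plus the vector; SVM_EXIT_EXCP_BASE is 0x40 as in the kernel's SVM definitions:

#include <stdint.h>
#include <stdio.h>

#define SVM_EXIT_EXCP_BASE 0x40

/* Stand-in for the cached L1 controls; only the exception bitmap matters here. */
struct fake_l1_ctl {
        uint32_t intercept_exceptions;  /* bit n set => L1 intercepts vector n */
};

/* Return the synthesized exit code if L1 wants a vmexit, or -1 if the
 * exception should simply be injected into L2. */
static int exception_disposition(const struct fake_l1_ctl *ctl, unsigned int nr)
{
        if (ctl->intercept_exceptions & (1u << nr))
                return SVM_EXIT_EXCP_BASE + nr; /* reflect to L1 as a vmexit */
        return -1;                              /* deliver to L2 */
}

int main(void)
{
        struct fake_l1_ctl ctl = { .intercept_exceptions = (1u << 1) | (1u << 14) };
        unsigned int vectors[] = { 1 /* #DB */, 3 /* #BP */, 14 /* #PF */ };
        unsigned int i;

        for (i = 0; i < 3; i++) {
                int exit_code = exception_disposition(&ctl, vectors[i]);

                if (exit_code >= 0)
                        printf("vector %u -> vmexit, exit_code %#x\n",
                               vectors[i], (unsigned int)exit_code);
                else
                        printf("vector %u -> inject into L2\n", vectors[i]);
        }
        return 0;
}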
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/svm/nested.c | 136 ++++++++++++++++------------------------ arch/x86/kvm/svm/svm.c | 9 --- arch/x86/kvm/svm/svm.h | 1 + arch/x86/kvm/x86.c | 13 +--- 5 files changed, 58 insertions(+), 103 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7707bd4b0593..e6f2e1a2dab6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1495,6 +1495,8 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); +void kvm_update_dr7(struct kvm_vcpu *vcpu); + int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index f39e0d578b9b..1ae13fd17028 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -111,6 +111,8 @@ void recalc_intercepts(struct vcpu_svm *svm) h = &svm->nested.hsave->control; g = &svm->nested; + svm->nested.host_intercept_exceptions = h->intercept_exceptions; + c->intercept_cr = h->intercept_cr; c->intercept_dr = h->intercept_dr; c->intercept_exceptions = h->intercept_exceptions; @@ -616,50 +618,6 @@ static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) return (value & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } -/* DB exceptions for our internal use must not cause vmexit */ -static int nested_svm_intercept_db(struct vcpu_svm *svm) -{ - unsigned long dr6 = svm->vmcb->save.dr6; - - /* Always catch it and pass it to userspace if debugging. */ - if (svm->vcpu.guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - return NESTED_EXIT_HOST; - - /* if we're not singlestepping, it's not ours */ - if (!svm->nmi_singlestep) - goto reflected_db; - - /* if it's not a singlestep exception, it's not ours */ - if (!(dr6 & DR6_BS)) - goto reflected_db; - - /* if the guest is singlestepping, it should get the vmexit */ - if (svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF) { - disable_nmi_singlestep(svm); - goto reflected_db; - } - - /* it's ours, the nested hypervisor must not see this one */ - return NESTED_EXIT_HOST; - -reflected_db: - /* - * Synchronize guest DR6 here just like in kvm_deliver_exception_payload; - * it will be moved into the nested VMCB by nested_svm_vmexit. Once - * exceptions will be moved to svm_check_nested_events, all this stuff - * will just go away and we could just return NESTED_EXIT_HOST - * unconditionally. db_interception will queue the exception, which - * will be processed by svm_check_nested_events if a nested vmexit is - * required, and we will just use kvm_deliver_exception_payload to copy - * the payload to DR6 before vmexit. - */ - WARN_ON(svm->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT); - svm->vcpu.arch.dr6 &= ~(DR_TRAP_BITS | DR6_RTM); - svm->vcpu.arch.dr6 |= dr6 & ~DR6_FIXED_1; - return NESTED_EXIT_DONE; -} - static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { unsigned port, size, iopm_len; @@ -710,20 +668,12 @@ static int nested_svm_intercept(struct vcpu_svm *svm) break; } case SVM_EXIT_EXCP_BASE ... 
SVM_EXIT_EXCP_BASE + 0x1f: { - u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); - if (svm->nested.intercept_exceptions & excp_bits) { - if (exit_code == SVM_EXIT_EXCP_BASE + DB_VECTOR) - vmexit = nested_svm_intercept_db(svm); - else if (exit_code == SVM_EXIT_EXCP_BASE + BP_VECTOR && - svm->vcpu.guest_debug & KVM_GUESTDBG_USE_SW_BP) - vmexit = NESTED_EXIT_HOST; - else - vmexit = NESTED_EXIT_DONE; - } - /* async page fault always cause vmexit */ - else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) && - svm->vcpu.arch.exception.nested_apf != 0) - vmexit = NESTED_EXIT_DONE; + /* + * Host-intercepted exceptions have been checked already in + * nested_svm_exit_special. There is nothing to do here, + * the vmexit is injected by svm_check_nested_events. + */ + vmexit = NESTED_EXIT_DONE; break; } case SVM_EXIT_ERR: { @@ -768,35 +718,45 @@ int nested_svm_check_permissions(struct vcpu_svm *svm) return 0; } -int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, - bool has_error_code, u32 error_code) +static bool nested_exit_on_exception(struct vcpu_svm *svm) { - int vmexit; + unsigned int nr = svm->vcpu.arch.exception.nr; - if (!is_guest_mode(&svm->vcpu)) - return 0; + return (svm->nested.intercept_exceptions & (1 << nr)); +} - vmexit = nested_svm_intercept(svm); - if (vmexit != NESTED_EXIT_DONE) - return 0; +static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm) +{ + unsigned int nr = svm->vcpu.arch.exception.nr; svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; svm->vmcb->control.exit_code_hi = 0; - svm->vmcb->control.exit_info_1 = error_code; + + if (svm->vcpu.arch.exception.has_error_code) + svm->vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code; /* * EXITINFO2 is undefined for all exception intercepts other * than #PF. */ - if (svm->vcpu.arch.exception.nested_apf) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; - else if (svm->vcpu.arch.exception.has_payload) - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; - else - svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + if (nr == PF_VECTOR) { + if (svm->vcpu.arch.exception.nested_apf) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token; + else if (svm->vcpu.arch.exception.has_payload) + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload; + else + svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2; + } else if (nr == DB_VECTOR) { + /* See inject_pending_event. 
*/ + kvm_deliver_exception_payload(&svm->vcpu); + if (svm->vcpu.arch.dr7 & DR7_GD) { + svm->vcpu.arch.dr7 &= ~DR7_GD; + kvm_update_dr7(&svm->vcpu); + } + } else + WARN_ON(svm->vcpu.arch.exception.has_payload); - svm->nested.exit_required = true; - return vmexit; + nested_svm_vmexit(svm); } static void nested_svm_smi(struct vcpu_svm *svm) @@ -835,6 +795,15 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu) kvm_event_needs_reinjection(vcpu) || svm->nested.exit_required || svm->nested.nested_run_pending; + if (vcpu->arch.exception.pending) { + if (block_nested_events) + return -EBUSY; + if (!nested_exit_on_exception(svm)) + return 0; + nested_svm_inject_exception_vmexit(svm); + return 0; + } + if (vcpu->arch.smi_pending && !svm_smi_blocked(vcpu)) { if (block_nested_events) return -EBUSY; @@ -872,18 +841,19 @@ int nested_svm_exit_special(struct vcpu_svm *svm) switch (exit_code) { case SVM_EXIT_INTR: case SVM_EXIT_NMI: - case SVM_EXIT_EXCP_BASE + MC_VECTOR: - return NESTED_EXIT_HOST; case SVM_EXIT_NPF: - /* For now we are always handling NPFs when using them */ - if (npt_enabled) + return NESTED_EXIT_HOST; + case SVM_EXIT_EXCP_BASE ... SVM_EXIT_EXCP_BASE + 0x1f: { + u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); + + if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) return NESTED_EXIT_HOST; - break; - case SVM_EXIT_EXCP_BASE + PF_VECTOR: - /* Trap async PF even if not shadowing */ - if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason) + else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && + svm->vcpu.arch.apf.host_apf_reason) + /* Trap async PF even if not shadowing */ return NESTED_EXIT_HOST; break; + } default: break; } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 5da494847a2f..89b6fb1e0abc 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -331,17 +331,8 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); unsigned nr = vcpu->arch.exception.nr; bool has_error_code = vcpu->arch.exception.has_error_code; - bool reinject = vcpu->arch.exception.injected; u32 error_code = vcpu->arch.exception.error_code; - /* - * If we are within a nested VM we'd better #VMEXIT and let the guest - * handle the exception - */ - if (!reinject && - nested_svm_check_exception(svm, nr, has_error_code, error_code)) - return; - kvm_deliver_exception_payload(&svm->vcpu); if (nr == BP_VECTOR && !nrips) { diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5cc559ab862d..8342032291fc 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -86,6 +86,7 @@ struct nested_state { u64 hsave_msr; u64 vm_cr_msr; u64 vmcb; + u32 host_intercept_exceptions; /* These are the merged vectors */ u32 *msrpm; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0ee828f60d05..f0fa610bed91 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1072,7 +1072,7 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) } } -static void kvm_update_dr7(struct kvm_vcpu *vcpu) +void kvm_update_dr7(struct kvm_vcpu *vcpu) { unsigned long dr7; @@ -1085,6 +1085,7 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) if (dr7 & DR7_BP_EN_MASK) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +EXPORT_SYMBOL_GPL(kvm_update_dr7); static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) { @@ -7778,16 +7779,6 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit X86_EFLAGS_RF); if (vcpu->arch.exception.nr == DB_VECTOR) { - /* - * This code assumes that nSVM doesn't use - * 
check_nested_events(). If it does, the - * DR6/DR7 changes should happen before L1 - * gets a #VMEXIT for an intercepted #DB in - * L2. (Under VMX, on the other hand, the - * DR6/DR7 changes should not happen in the - * event of a VM-exit to L1 for an intercepted - * #DB in L2.) - */ kvm_deliver_exception_payload(vcpu); if (vcpu->arch.dr7 & DR7_GD) { vcpu->arch.dr7 &= ~DR7_GD; -- cgit v1.2.3 From 431732651cc16caebcd334b7b7476bfe0c4a2903 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Sun, 23 Feb 2020 11:43:22 +0200 Subject: x86/mm: Drop deprecated DISCONTIGMEM support for 32-bit The DISCONTIGMEM support was marked as deprecated in v5.2 and since there were no complaints about it for almost 5 releases it can be completely removed. Signed-off-by: Mike Rapoport Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: https://lkml.kernel.org/r/20200223094322.15206-1-rppt@kernel.org --- arch/x86/Kconfig | 9 --------- arch/x86/include/asm/mmzone_32.h | 39 --------------------------------------- arch/x86/include/asm/pgtable_32.h | 3 +-- arch/x86/mm/numa_32.c | 34 ---------------------------------- 4 files changed, 1 insertion(+), 84 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d6104ea8af0..f0aa19469d4b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1610,19 +1610,10 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config ARCH_HAVE_MEMORY_PRESENT - def_bool y - depends on X86_32 && DISCONTIGMEM - config ARCH_FLATMEM_ENABLE def_bool y depends on X86_32 && !NUMA -config ARCH_DISCONTIGMEM_ENABLE - def_bool n - depends on NUMA && X86_32 - depends on BROKEN - config ARCH_SPARSEMEM_ENABLE def_bool y depends on X86_64 || NUMA || X86_32 || X86_32_NON_STANDARD diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index 73d8dd14dda2..2d4515e8b7df 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -14,43 +14,4 @@ extern struct pglist_data *node_data[]; #define NODE_DATA(nid) (node_data[nid]) #endif /* CONFIG_NUMA */ -#ifdef CONFIG_DISCONTIGMEM - -/* - * generic node memory support, the following assumptions apply: - * - * 1) memory comes in 64Mb contiguous chunks which are either present or not - * 2) we will not have more than 64Gb in total - * - * for now assume that 64Gb is max amount of RAM for whole system - * 64Gb / 4096bytes/page = 16777216 pages - */ -#define MAX_NR_PAGES 16777216 -#define MAX_SECTIONS 1024 -#define PAGES_PER_SECTION (MAX_NR_PAGES/MAX_SECTIONS) - -extern s8 physnode_map[]; - -static inline int pfn_to_nid(unsigned long pfn) -{ -#ifdef CONFIG_NUMA - return((int) physnode_map[(pfn) / PAGES_PER_SECTION]); -#else - return 0; -#endif -} - -static inline int pfn_valid(int pfn) -{ - int nid = pfn_to_nid(pfn); - - if (nid >= 0) - return (pfn < node_end_pfn(nid)); - return 0; -} - -#define early_pfn_valid(pfn) pfn_valid((pfn)) - -#endif /* CONFIG_DISCONTIGMEM */ - #endif /* _ASM_X86_MMZONE_32_H */ diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0dca7f7aeff2..be7b19646897 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -66,8 +66,7 @@ do { \ #endif /* !__ASSEMBLY__ */ /* - * kern_addr_valid() is (1) for FLATMEM and (0) for - * SPARSEMEM and DISCONTIGMEM + * kern_addr_valid() is (1) for FLATMEM and (0) for SPARSEMEM */ #ifdef CONFIG_FLATMEM #define kern_addr_valid(addr) (1) diff 
--git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index f2bd3d61e16b..104544359d69 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -27,40 +27,6 @@ #include "numa_internal.h" -#ifdef CONFIG_DISCONTIGMEM -/* - * 4) physnode_map - the mapping between a pfn and owning node - * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 64Mb break (each element of the array will - * represent 64Mb of memory and will be marked by the node id. so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * - * physnode_map[0-15] = 0; - * physnode_map[16-31] = 1; - * physnode_map[32- ] = -1; - */ -s8 physnode_map[MAX_SECTIONS] __read_mostly = { [0 ... (MAX_SECTIONS - 1)] = -1}; -EXPORT_SYMBOL(physnode_map); - -void memory_present(int nid, unsigned long start, unsigned long end) -{ - unsigned long pfn; - - printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", - nid, start, end); - printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); - printk(KERN_DEBUG " "); - start = round_down(start, PAGES_PER_SECTION); - end = round_up(end, PAGES_PER_SECTION); - for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { - physnode_map[pfn / PAGES_PER_SECTION] = nid; - printk(KERN_CONT "%lx ", pfn); - } - printk(KERN_CONT "\n"); -} -#endif - extern unsigned long highend_pfn, highstart_pfn; void __init initmem_init(void) -- cgit v1.2.3 From 88743470668ef5eb6b7ba9e0f99888e5999bf172 Mon Sep 17 00:00:00 2001 From: Alexander Dahl Date: Tue, 26 May 2020 19:57:49 +0200 Subject: x86/dma: Fix max PFN arithmetic overflow on 32 bit systems MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The intermediate result of the old term (4UL * 1024 * 1024 * 1024) is 4 294 967 296 or 0x100000000 which is no problem on 64 bit systems. The patch does not change the later overall result of 0x100000 for MAX_DMA32_PFN (after it has been shifted by PAGE_SHIFT). The new calculation yields the same result, but does not require 64 bit arithmetic. On 32 bit systems the old calculation suffers from an arithmetic overflow in that intermediate term in braces: 4UL aka unsigned long int is 4 byte wide and an arithmetic overflow happens (the 0x100000000 does not fit in 4 bytes), the in braces result is truncated to zero, the following right shift does not alter that, so MAX_DMA32_PFN evaluates to 0 on 32 bit systems. That wrong value is a problem in a comparision against MAX_DMA32_PFN in the init code for swiotlb in pci_swiotlb_detect_4gb() to decide if swiotlb should be active. That comparison yields the opposite result, when compiling on 32 bit systems. This was not possible before 1b7e03ef7570 ("x86, NUMA: Enable emulation on 32bit too") when that MAX_DMA32_PFN was first made visible to x86_32 (and which landed in v3.0). In practice this wasn't a problem, unless CONFIG_SWIOTLB is active on x86-32. However if one has set CONFIG_IOMMU_INTEL, since c5a5dc4cbbf4 ("iommu/vt-d: Don't switch off swiotlb if bounce page is used") there's a dependency on CONFIG_SWIOTLB, which was not necessarily active before. That landed in v5.4, where we noticed it in the fli4l Linux distribution. We have CONFIG_IOMMU_INTEL active on both 32 and 64 bit kernel configs there (I could not find out why, so let's just say historical reasons). 
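To make the arithmetic concrete, here is a minimal stand-alone user-space demo of that overflow (not part of the patch; PAGE_SHIFT is hard-coded to 12 purely for the demonstration). Built with -m64 both expressions print 0x100000; built with -m32 the old product wraps to 0 before the shift while the shifted form still prints 0x100000:

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12

int main(void)
{
        /* old expression: the product 0x100000000 overflows a 32-bit unsigned long */
        unsigned long old_style = (4UL * 1024 * 1024 * 1024) >> DEMO_PAGE_SHIFT;
        /* new expression: same result, no 64-bit intermediate needed */
        unsigned long new_style = 1UL << (32 - DEMO_PAGE_SHIFT);

        printf("sizeof(unsigned long) = %zu\n", sizeof(unsigned long));
        printf("old_style = %#lx\n", old_style);
        printf("new_style = %#lx\n", new_style);
        return 0;
}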
The effect is at boot time 64 MiB (default size) were allocated for bounce buffers now, which is a noticeable amount of memory on small systems like pcengines ALIX 2D3 with 256 MiB memory, which are still frequently used as home routers. We noticed this effect when migrating from kernel v4.19 (LTS) to v5.4 (LTS) in fli4l and got that kernel messages for example: Linux version 5.4.22 (buildroot@buildroot) (gcc version 7.3.0 (Buildroot 2018.02.8)) #1 SMP Mon Nov 26 23:40:00 CET 2018 … Memory: 183484K/261756K available (4594K kernel code, 393K rwdata, 1660K rodata, 536K init, 456K bss , 78272K reserved, 0K cma-reserved, 0K highmem) … PCI-DMA: Using software bounce buffering for IO (SWIOTLB) software IO TLB: mapped [mem 0x0bb78000-0x0fb78000] (64MB) The initial analysis and the suggested fix was done by user 'sourcejedi' at stackoverflow and explicitly marked as GPLv2 for inclusion in the Linux kernel: https://unix.stackexchange.com/a/520525/50007 The new calculation, which does not suffer from that overflow, is the same as for arch/mips now as suggested by Robin Murphy. The fix was tested by fli4l users on round about two dozen different systems, including both 32 and 64 bit archs, bare metal and virtualized machines. [ bp: Massage commit message. ] Fixes: 1b7e03ef7570 ("x86, NUMA: Enable emulation on 32bit too") Reported-by: Alan Jenkins Suggested-by: Robin Murphy Signed-off-by: Alexander Dahl Signed-off-by: Borislav Petkov Reviewed-by: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://unix.stackexchange.com/q/520065/50007 Link: https://web.nettworks.org/bugs/browse/FFL-2560 Link: https://lkml.kernel.org/r/20200526175749.20742-1-post@lespocky.de --- arch/x86/include/asm/dma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h index 00f7cf45e699..8e95aa4b0d17 100644 --- a/arch/x86/include/asm/dma.h +++ b/arch/x86/include/asm/dma.h @@ -74,7 +74,7 @@ #define MAX_DMA_PFN ((16UL * 1024 * 1024) >> PAGE_SHIFT) /* 4GB broken PCI/AGP hardware bus master zone */ -#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) +#define MAX_DMA32_PFN (1UL << (32 - PAGE_SHIFT)) #ifdef CONFIG_X86_32 /* The maximum address that we can perform a DMA transfer to on this platform */ -- cgit v1.2.3 From 2ca41f555e857ec5beef6063bfa43a17ee76d7ec Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Tue, 26 May 2020 08:20:14 -0400 Subject: x86/spinlock: Remove obsolete ticket spinlock macros and types Even though the x86 ticket spinlock code has been removed with cfd8983f03c7 ("x86, locking/spinlocks: Remove ticket (spin)lock implementation") a while ago, there are still some ticket spinlock specific macros and types left in the asm/spinlock_types.h header file that are no longer used. Remove those as well to avoid confusion. 
Signed-off-by: Waiman Long Signed-off-by: Borislav Petkov Acked-by: Thomas Gleixner Acked-by: Peter Zijlstra Link: https://lkml.kernel.org/r/20200526122014.25241-1-longman@redhat.com --- arch/x86/include/asm/spinlock_types.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index bf3e34b25afc..323db6c5852a 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -3,29 +3,7 @@ #define _ASM_X86_SPINLOCK_TYPES_H #include - -#ifdef CONFIG_PARAVIRT_SPINLOCKS -#define __TICKET_LOCK_INC 2 -#define TICKET_SLOWPATH_FLAG ((__ticket_t)1) -#else -#define __TICKET_LOCK_INC 1 -#define TICKET_SLOWPATH_FLAG ((__ticket_t)0) -#endif - -#if (CONFIG_NR_CPUS < (256 / __TICKET_LOCK_INC)) -typedef u8 __ticket_t; -typedef u16 __ticketpair_t; -#else -typedef u16 __ticket_t; -typedef u32 __ticketpair_t; -#endif - -#define TICKET_LOCK_INC ((__ticket_t)__TICKET_LOCK_INC) - -#define TICKET_SHIFT (sizeof(__ticket_t) * 8) - #include - #include #endif /* _ASM_X86_SPINLOCK_TYPES_H */ -- cgit v1.2.3 From 4bfe6cce133cad82cea04490c308795275857782 Mon Sep 17 00:00:00 2001 From: Jay Lang Date: Sun, 24 May 2020 12:27:39 -0400 Subject: x86/ioperm: Prevent a memory leak when fork fails In the copy_process() routine called by _do_fork(), failure to allocate a PID (or further along in the function) will trigger an invocation to exit_thread(). This is done to clean up from an earlier call to copy_thread_tls(). Naturally, the child task is passed into exit_thread(), however during the process, io_bitmap_exit() nullifies the parent's io_bitmap rather than the child's. As copy_thread_tls() has been called ahead of the failure, the reference count on the calling thread's io_bitmap is incremented as we would expect. However, io_bitmap_exit() doesn't accept any arguments, and thus assumes it should trash the current thread's io_bitmap reference rather than the child's. This is pretty sneaky in practice, because in all instances but this one, exit_thread() is called with respect to the current task and everything works out. A determined attacker can issue an appropriate ioctl (i.e. KDENABIO) to get a bitmap allocated, and force a clone3() syscall to fail by passing in a zeroed clone_args structure. The kernel handles the erroneous struct and the buggy code path is followed, and even though the parent's reference to the io_bitmap is trashed, the child still holds a reference and thus the structure will never be freed. Fix this by tweaking io_bitmap_exit() and its subroutines to accept a task_struct argument which to operate on. 
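The reference-count mistake can be modelled outside the kernel. The sketch below uses stand-in structs (not the kernel's types) and shows only the corrected shape, where the cleanup helper acts on the task passed to it; the buggy variant instead dropped the parent's reference on the failure path, leaving the half-constructed child's reference, and thus the allocation, behind forever:

#include <stdio.h>
#include <stdlib.h>

struct io_bitmap_model { int refcnt; };
struct task_model { struct io_bitmap_model *io_bitmap; };

static void bitmap_put(struct io_bitmap_model *b)
{
        if (b && --b->refcnt == 0) {
                printf("bitmap freed\n");
                free(b);
        }
}

/* Fixed shape: operate on the task being torn down, not implicitly on "current". */
static void io_bitmap_exit_model(struct task_model *tsk)
{
        struct io_bitmap_model *b = tsk->io_bitmap;

        tsk->io_bitmap = NULL;
        bitmap_put(b);
}

int main(void)
{
        struct task_model parent = { .io_bitmap = malloc(sizeof(struct io_bitmap_model)) };
        struct task_model child;

        parent.io_bitmap->refcnt = 1;

        /* fork-time copy: the child shares the bitmap, refcount goes to 2 */
        child = parent;
        parent.io_bitmap->refcnt++;

        /* child creation fails after the copy: tear down the *child* ... */
        io_bitmap_exit_model(&child);   /* buggy code effectively did &parent here */
        /* ... and when the parent eventually exits, the bitmap really goes away */
        io_bitmap_exit_model(&parent);
        return 0;
}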
Fixes: ea5f1cd7ab49 ("x86/ioperm: Remove bitmap if all permissions dropped") Signed-off-by: Jay Lang Signed-off-by: Thomas Gleixner Cc: stable#@vger.kernel.org Link: https://lkml.kernel.org/r/20200524162742.253727-1-jaytlang@mit.edu --- arch/x86/include/asm/io_bitmap.h | 4 ++-- arch/x86/kernel/ioport.c | 22 +++++++++++----------- arch/x86/kernel/process.c | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/io_bitmap.h b/arch/x86/include/asm/io_bitmap.h index 07344d82e88e..ac1a99ffbd8d 100644 --- a/arch/x86/include/asm/io_bitmap.h +++ b/arch/x86/include/asm/io_bitmap.h @@ -17,7 +17,7 @@ struct task_struct; #ifdef CONFIG_X86_IOPL_IOPERM void io_bitmap_share(struct task_struct *tsk); -void io_bitmap_exit(void); +void io_bitmap_exit(struct task_struct *tsk); void native_tss_update_io_bitmap(void); @@ -29,7 +29,7 @@ void native_tss_update_io_bitmap(void); #else static inline void io_bitmap_share(struct task_struct *tsk) { } -static inline void io_bitmap_exit(void) { } +static inline void io_bitmap_exit(struct task_struct *tsk) { } static inline void tss_update_io_bitmap(void) { } #endif diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index a53e7b4a7419..e2fab3ceb09f 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -33,15 +33,15 @@ void io_bitmap_share(struct task_struct *tsk) set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } -static void task_update_io_bitmap(void) +static void task_update_io_bitmap(struct task_struct *tsk) { - struct thread_struct *t = ¤t->thread; + struct thread_struct *t = &tsk->thread; if (t->iopl_emul == 3 || t->io_bitmap) { /* TSS update is handled on exit to user space */ - set_thread_flag(TIF_IO_BITMAP); + set_tsk_thread_flag(tsk, TIF_IO_BITMAP); } else { - clear_thread_flag(TIF_IO_BITMAP); + clear_tsk_thread_flag(tsk, TIF_IO_BITMAP); /* Invalidate TSS */ preempt_disable(); tss_update_io_bitmap(); @@ -49,12 +49,12 @@ static void task_update_io_bitmap(void) } } -void io_bitmap_exit(void) +void io_bitmap_exit(struct task_struct *tsk) { - struct io_bitmap *iobm = current->thread.io_bitmap; + struct io_bitmap *iobm = tsk->thread.io_bitmap; - current->thread.io_bitmap = NULL; - task_update_io_bitmap(); + tsk->thread.io_bitmap = NULL; + task_update_io_bitmap(tsk); if (iobm && refcount_dec_and_test(&iobm->refcnt)) kfree(iobm); } @@ -102,7 +102,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) if (!iobm) return -ENOMEM; refcount_set(&iobm->refcnt, 1); - io_bitmap_exit(); + io_bitmap_exit(current); } /* @@ -134,7 +134,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on) } /* All permissions dropped? */ if (max_long == UINT_MAX) { - io_bitmap_exit(); + io_bitmap_exit(current); return 0; } @@ -192,7 +192,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level) } t->iopl_emul = level; - task_update_io_bitmap(); + task_update_io_bitmap(current); return 0; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9da70b279dad..35638f1c5791 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -96,7 +96,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) } /* - * Free current thread data structures etc.. + * Free thread data structures etc.. 
*/ void exit_thread(struct task_struct *tsk) { @@ -104,7 +104,7 @@ void exit_thread(struct task_struct *tsk) struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) - io_bitmap_exit(); + io_bitmap_exit(tsk); free_vm86(t); -- cgit v1.2.3 From 0a5ea224b2fdf9dca9291ef7b5a12fd846a5dc34 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Feb 2020 16:50:00 -0500 Subject: x86: switch both 32bit and 64bit to providing csum_and_copy_from_user() ... rather than messing with the wrapper. As a side effect, 32bit variant gets access_ok() into it and can be switched to user_access_begin()/user_access_end() Signed-off-by: Al Viro --- arch/x86/include/asm/checksum.h | 1 + arch/x86/include/asm/checksum_32.h | 15 +++++++++------ arch/x86/include/asm/checksum_64.h | 5 +---- arch/x86/lib/csum-wrappers_64.c | 6 +++--- 4 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index d79d1e622dcf..1ab9572ae4c2 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 #ifdef CONFIG_X86_32 # include #else diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index f57b94e02c57..2487b7fc2d24 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -44,18 +44,21 @@ static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL); } -static inline __wsum csum_partial_copy_from_user(const void __user *src, - void *dst, - int len, __wsum sum, - int *err_ptr) +static inline __wsum csum_and_copy_from_user(const void __user *src, + void *dst, int len, + __wsum sum, int *err_ptr) { __wsum ret; might_sleep(); - stac(); + if (!user_access_begin(src, len)) { + if (len) + *err_ptr = -EFAULT; + return sum; + } ret = csum_partial_copy_generic((__force void *)src, dst, len, sum, err_ptr, NULL); - clac(); + user_access_end(); return ret; } diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index ac9c06494827..2f8435542376 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -129,7 +129,6 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); -#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 #define HAVE_CSUM_COPY_USER 1 @@ -139,15 +138,13 @@ extern __visible __wsum csum_partial_copy_generic(const void *src, const void *d int *src_err_ptr, int *dst_err_ptr); -extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst, +extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp); extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum isum, int *errp); extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum); -#define csum_and_copy_from_user csum_partial_copy_from_user - /** * ip_compute_csum - Compute an 16bit IP checksum. * @buff: buffer address. diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 7028d1dc5c6b..ee63d7576fd2 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -10,7 +10,7 @@ #include /** - * csum_partial_copy_from_user - Copy and checksum from user space. 
+ * csum_and_copy_from_user - Copy and checksum from user space. * @src: source address (user space) * @dst: destination address * @len: number of bytes to be copied. @@ -21,7 +21,7 @@ * src and dst are best aligned to 64bits. */ __wsum -csum_partial_copy_from_user(const void __user *src, void *dst, +csum_and_copy_from_user(const void __user *src, void *dst, int len, __wsum isum, int *errp) { might_sleep(); @@ -68,7 +68,7 @@ out_err: return isum; } -EXPORT_SYMBOL(csum_partial_copy_from_user); +EXPORT_SYMBOL(csum_and_copy_from_user); /** * csum_and_copy_to_user - Copy and checksum to user space. -- cgit v1.2.3 From c281a6c1ac6b0867e4341ea801030fa9a62157f9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Feb 2020 17:11:48 -0500 Subject: x86: switch 32bit csum_and_copy_to_user() to user_access_{begin,end}() consolidate HAVE_CSUM_COPY_USER for 32bit and 64bit, while are at it Signed-off-by: Al Viro --- arch/x86/include/asm/checksum.h | 1 + arch/x86/include/asm/checksum_32.h | 6 ++---- arch/x86/include/asm/checksum_64.h | 3 --- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index 1ab9572ae4c2..0ada98d5d09f 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,5 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +#define HAVE_CSUM_COPY_USER #ifdef CONFIG_X86_32 # include #else diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 2487b7fc2d24..11624c8a9d8d 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -176,7 +176,6 @@ static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr, /* * Copy and checksum to user */ -#define HAVE_CSUM_COPY_USER static inline __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len, __wsum sum, @@ -185,11 +184,10 @@ static inline __wsum csum_and_copy_to_user(const void *src, __wsum ret; might_sleep(); - if (access_ok(dst, len)) { - stac(); + if (user_access_begin(dst, len)) { ret = csum_partial_copy_generic(src, (__force void *)dst, len, sum, NULL, err_ptr); - clac(); + user_access_end(); return ret; } diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index 2f8435542376..0a289b87e872 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -129,9 +129,6 @@ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); -#define HAVE_CSUM_COPY_USER 1 - - /* Do not call this directly. Use the wrappers below */ extern __visible __wsum csum_partial_copy_generic(const void *src, const void *dst, int len, __wsum sum, -- cgit v1.2.3 From 7923ef4f6ec4a25a902bd827446eac860b01fd1c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 18 May 2020 15:24:46 -0400 Subject: KVM: nSVM: remove trailing padding for struct vmcb_control_area Allow placing the VMCB structs on the stack or in other structs without wasting too much space. Add BUILD_BUG_ON as a quick safeguard against typos. 
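The layout idea can be sketched in user space (illustrative field names and sizes, not the real VMCB): keep the control-area struct only as large as its defined fields, pad up to the architectural offset only inside the containing struct, and let a compile-time assert (the user-space counterpart of BUILD_BUG_ON) catch accidental size changes:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct demo_control_area {
        uint32_t intercepts[5];
        uint64_t exit_code;
        /* no trailing "reserved up to 1024 bytes" member anymore */
} __attribute__((packed));

struct demo_vmcb {
        struct demo_control_area control;
        /* pad out to the fixed hardware offset of the save area */
        uint8_t reserved_control[1024 - sizeof(struct demo_control_area)];
        uint8_t save_area[0x298];
} __attribute__((packed));

static_assert(offsetof(struct demo_vmcb, save_area) == 1024,
              "save area must start at offset 0x400");

int main(void)
{
        /* stack placement and in-memory copies no longer drag a full KiB around */
        struct demo_control_area cached = { .exit_code = 0x60 };

        printf("control copy is %zu bytes, full vmcb is %zu bytes\n",
               sizeof(cached), sizeof(struct demo_vmcb));
        return 0;
}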
Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 6ece8561ba66..8a1f5382a4ea 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -96,7 +96,6 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u8 reserved_6[8]; /* Offset 0xe8 */ u64 avic_logical_id; /* Offset 0xf0 */ u64 avic_physical_id; /* Offset 0xf8 */ - u8 reserved_7[768]; }; @@ -203,8 +202,16 @@ struct __attribute__ ((__packed__)) vmcb_save_area { u64 last_excp_to; }; + +static inline void __unused_size_checks(void) +{ + BUILD_BUG_ON(sizeof(struct vmcb_save_area) != 0x298); + BUILD_BUG_ON(sizeof(struct vmcb_control_area) != 256); +} + struct __attribute__ ((__packed__)) vmcb { struct vmcb_control_area control; + u8 reserved_control[1024 - sizeof(struct vmcb_control_area)]; struct vmcb_save_area save; }; -- cgit v1.2.3 From e9fd761a46b8655aa5ff84b4adc0c8cf43952145 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 May 2020 13:28:23 -0400 Subject: KVM: nSVM: remove HF_VINTR_MASK Now that the int_ctl field is stored in svm->nested.ctl.int_ctl, we can use it instead of vcpu->arch.hflags to check whether L2 is running in V_INTR_MASKING mode. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm/nested.c | 6 +----- arch/x86/kvm/svm/svm.c | 2 +- arch/x86/kvm/svm/svm.h | 4 +++- 4 files changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e6f2e1a2dab6..0dfc522f96cc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1596,7 +1596,6 @@ enum { #define HF_GIF_MASK (1 << 0) #define HF_HIF_MASK (1 << 1) -#define HF_VINTR_MASK (1 << 2) #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c3c04fef11df..6967fe884eaf 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -118,7 +118,7 @@ void recalc_intercepts(struct vcpu_svm *svm) c->intercept_exceptions = h->intercept_exceptions; c->intercept = h->intercept; - if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { + if (g->int_ctl & V_INTR_MASKING_MASK) { /* We only want the cr8 intercept bits of L1 */ c->intercept_cr &= ~(1U << INTERCEPT_CR8_READ); c->intercept_cr &= ~(1U << INTERCEPT_CR8_WRITE); @@ -338,10 +338,6 @@ static void nested_prepare_vmcb_control(struct vcpu_svm *svm) kvm_mmu_reset_context(&svm->vcpu); svm_flush_tlb(&svm->vcpu); - if (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) - svm->vcpu.arch.hflags |= HF_VINTR_MASK; - else - svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; svm->vmcb->control.tsc_offset = svm->vcpu.arch.tsc_offset = svm->vcpu.arch.l1_tsc_offset + svm->nested.ctl.tsc_offset; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 422b1cc62a20..6867dba4f736 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3104,7 +3104,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ - if ((svm->vcpu.arch.hflags & HF_VINTR_MASK) + if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) ? 
!(svm->vcpu.arch.hflags & HF_HIF_MASK) : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) return true; diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 10b7b55720a0..be8e830f83fa 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -367,7 +367,9 @@ void svm_set_gif(struct vcpu_svm *svm, bool value); static inline bool svm_nested_virtualize_tpr(struct kvm_vcpu *vcpu) { - return is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK); + struct vcpu_svm *svm = to_svm(vcpu); + + return is_guest_mode(vcpu) && (svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK); } static inline bool nested_exit_on_smi(struct vcpu_svm *svm) -- cgit v1.2.3 From 08245e6d2e589f2b6e9e275ddb343e2ec9ce94ec Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 19 May 2020 09:21:04 -0400 Subject: KVM: nSVM: remove HF_HIF_MASK The L1 flags can be found in the save area of svm->nested.hsave, fish it from there so that there is one fewer thing to migrate. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/svm/nested.c | 5 ----- arch/x86/kvm/svm/svm.c | 2 +- 3 files changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0dfc522f96cc..3485f8454088 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1595,7 +1595,6 @@ enum { }; #define HF_GIF_MASK (1 << 0) -#define HF_HIF_MASK (1 << 1) #define HF_NMI_MASK (1 << 3) #define HF_IRET_MASK (1 << 4) #define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */ diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 6967fe884eaf..65ecc8586f75 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -371,11 +371,6 @@ void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, struct vmcb *nested_vmcb) { svm->nested.vmcb = vmcb_gpa; - if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) - svm->vcpu.arch.hflags |= HF_HIF_MASK; - else - svm->vcpu.arch.hflags &= ~HF_HIF_MASK; - load_nested_vmcb_control(svm, &nested_vmcb->control); nested_prepare_vmcb_save(svm, nested_vmcb); nested_prepare_vmcb_control(svm); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 6867dba4f736..bc08221f6743 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3105,7 +3105,7 @@ bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) if (is_guest_mode(vcpu)) { /* As long as interrupts are being delivered... */ if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) - ? !(svm->vcpu.arch.hflags & HF_HIF_MASK) + ? !(svm->nested.hsave->save.rflags & X86_EFLAGS_IF) : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) return true; -- cgit v1.2.3 From cc440cdad5b7a4c1de12dace725209eb3e0cf663 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 May 2020 13:36:32 -0400 Subject: KVM: nSVM: implement KVM_GET_NESTED_STATE and KVM_SET_NESTED_STATE Similar to VMX, the state that is captured through the currently available IOCTLs is a mix of L1 and L2 state, dependent on whether the L2 guest was running at the moment when the process was interrupted to save its state. In particular, the SVM-specific state for nested virtualization includes the L1 saved state (including the interrupt flag), the cached L2 controls, and the GIF. 
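As a rough usage sketch (not from this patch), user space would consume the new state roughly as follows; it assumes vm_fd and vcpu_fd were already obtained through the usual /dev/kvm, KVM_CREATE_VM and KVM_CREATE_VCPU sequence, and that the installed headers are new enough to carry these definitions:

#include <linux/kvm.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static struct kvm_nested_state *get_svm_nested_state(int vm_fd, int vcpu_fd)
{
        /* KVM_CAP_NESTED_STATE reports the maximum buffer size needed. */
        int size = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NESTED_STATE);
        struct kvm_nested_state *state;

        if (size <= 0)
                return NULL;

        state = calloc(1, size);
        if (!state)
                return NULL;
        state->size = size;

        if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
                free(state);
                return NULL;
        }

        if (state->format == KVM_STATE_NESTED_FORMAT_SVM)
                printf("GIF %s, %sin guest mode, vmcb_pa=%#llx\n",
                       (state->flags & KVM_STATE_NESTED_GIF_SET) ? "set" : "clear",
                       (state->flags & KVM_STATE_NESTED_GUEST_MODE) ? "" : "not ",
                       (unsigned long long)state->hdr.svm.vmcb_pa);

        /* the same buffer can later be handed back via KVM_SET_NESTED_STATE,
         * e.g. on the destination side of a migration */
        return state;
}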
Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 17 ++++- arch/x86/kvm/cpuid.h | 5 ++ arch/x86/kvm/svm/nested.c | 147 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/svm/svm.c | 1 + arch/x86/kvm/vmx/nested.c | 5 -- arch/x86/kvm/x86.c | 3 +- 6 files changed, 171 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 3f3f780c8c65..12075a9de1c1 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -385,18 +385,22 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT (1 << 4) #define KVM_STATE_NESTED_FORMAT_VMX 0 -#define KVM_STATE_NESTED_FORMAT_SVM 1 /* unused */ +#define KVM_STATE_NESTED_FORMAT_SVM 1 #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 #define KVM_STATE_NESTED_EVMCS 0x00000004 #define KVM_STATE_NESTED_MTF_PENDING 0x00000008 +#define KVM_STATE_NESTED_GIF_SET 0x00000100 #define KVM_STATE_NESTED_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_SMM_VMXON 0x00000002 #define KVM_STATE_NESTED_VMX_VMCS_SIZE 0x1000 +#define KVM_STATE_NESTED_SVM_VMCB_SIZE 0x1000 + + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -411,6 +415,15 @@ struct kvm_vmx_nested_state_hdr { } smm; }; +struct kvm_svm_nested_state_data { + /* Save area only used if KVM_STATE_NESTED_RUN_PENDING. */ + __u8 vmcb12[KVM_STATE_NESTED_SVM_VMCB_SIZE]; +}; + +struct kvm_svm_nested_state_hdr { + __u64 vmcb_pa; +}; + /* for KVM_CAP_NESTED_STATE */ struct kvm_nested_state { __u16 flags; @@ -419,6 +432,7 @@ struct kvm_nested_state { union { struct kvm_vmx_nested_state_hdr vmx; + struct kvm_svm_nested_state_hdr svm; /* Pad the header to 128 bytes. */ __u8 pad[120]; @@ -431,6 +445,7 @@ struct kvm_nested_state { */ union { struct kvm_vmx_nested_state_data vmx[0]; + struct kvm_svm_nested_state_data svm[0]; } data; }; diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 63a70f6a3df3..05434cd9342f 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -303,4 +303,9 @@ static __always_inline void kvm_cpu_cap_check_and_set(unsigned int x86_feature) kvm_cpu_cap_set(x86_feature); } +static inline bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); +} + #endif diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index c712fe577029..6b1049148c1b 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -25,6 +25,7 @@ #include "trace.h" #include "mmu.h" #include "x86.h" +#include "cpuid.h" #include "lapic.h" #include "svm.h" @@ -238,6 +239,8 @@ static void load_nested_vmcb_control(struct vcpu_svm *svm, { copy_vmcb_control_area(&svm->nested.ctl, control); + /* Copy it here because nested_svm_check_controls will check it. 
*/ + svm->nested.ctl.asid = control->asid; svm->nested.ctl.msrpm_base_pa &= ~0x0fffULL; svm->nested.ctl.iopm_base_pa &= ~0x0fffULL; } @@ -930,6 +933,150 @@ int nested_svm_exit_special(struct vcpu_svm *svm) return NESTED_EXIT_CONTINUE; } +static int svm_get_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + u32 user_data_size) +{ + struct vcpu_svm *svm; + struct kvm_nested_state kvm_state = { + .flags = 0, + .format = KVM_STATE_NESTED_FORMAT_SVM, + .size = sizeof(kvm_state), + }; + struct vmcb __user *user_vmcb = (struct vmcb __user *) + &user_kvm_nested_state->data.svm[0]; + + if (!vcpu) + return kvm_state.size + KVM_STATE_NESTED_SVM_VMCB_SIZE; + + svm = to_svm(vcpu); + + if (user_data_size < kvm_state.size) + goto out; + + /* First fill in the header and copy it out. */ + if (is_guest_mode(vcpu)) { + kvm_state.hdr.svm.vmcb_pa = svm->nested.vmcb; + kvm_state.size += KVM_STATE_NESTED_SVM_VMCB_SIZE; + kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE; + + if (svm->nested.nested_run_pending) + kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING; + } + + if (gif_set(svm)) + kvm_state.flags |= KVM_STATE_NESTED_GIF_SET; + + if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state))) + return -EFAULT; + + if (!is_guest_mode(vcpu)) + goto out; + + /* + * Copy over the full size of the VMCB rather than just the size + * of the structs. + */ + if (clear_user(user_vmcb, KVM_STATE_NESTED_SVM_VMCB_SIZE)) + return -EFAULT; + if (copy_to_user(&user_vmcb->control, &svm->nested.ctl, + sizeof(user_vmcb->control))) + return -EFAULT; + if (copy_to_user(&user_vmcb->save, &svm->nested.hsave->save, + sizeof(user_vmcb->save))) + return -EFAULT; + +out: + return kvm_state.size; +} + +static int svm_set_nested_state(struct kvm_vcpu *vcpu, + struct kvm_nested_state __user *user_kvm_nested_state, + struct kvm_nested_state *kvm_state) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *hsave = svm->nested.hsave; + struct vmcb __user *user_vmcb = (struct vmcb __user *) + &user_kvm_nested_state->data.svm[0]; + struct vmcb_control_area ctl; + struct vmcb_save_area save; + u32 cr0; + + if (kvm_state->format != KVM_STATE_NESTED_FORMAT_SVM) + return -EINVAL; + + if (kvm_state->flags & ~(KVM_STATE_NESTED_GUEST_MODE | + KVM_STATE_NESTED_RUN_PENDING | + KVM_STATE_NESTED_GIF_SET)) + return -EINVAL; + + /* + * If in guest mode, vcpu->arch.efer actually refers to the L2 guest's + * EFER.SVME, but EFER.SVME still has to be 1 for VMRUN to succeed. + */ + if (!(vcpu->arch.efer & EFER_SVME)) { + /* GIF=1 and no guest mode are required if SVME=0. */ + if (kvm_state->flags != KVM_STATE_NESTED_GIF_SET) + return -EINVAL; + } + + /* SMM temporarily disables SVM, so we cannot be in guest mode. */ + if (is_smm(vcpu) && (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) + return -EINVAL; + + if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) { + svm_leave_nested(svm); + goto out_set_gif; + } + + if (!page_address_valid(vcpu, kvm_state->hdr.svm.vmcb_pa)) + return -EINVAL; + if (kvm_state->size < sizeof(*kvm_state) + KVM_STATE_NESTED_SVM_VMCB_SIZE) + return -EINVAL; + if (copy_from_user(&ctl, &user_vmcb->control, sizeof(ctl))) + return -EFAULT; + if (copy_from_user(&save, &user_vmcb->save, sizeof(save))) + return -EFAULT; + + if (!nested_vmcb_check_controls(&ctl)) + return -EINVAL; + + /* + * Processor state contains L2 state. Check that it is + * valid for guest mode (see nested_vmcb_checks). 
+ */ + cr0 = kvm_read_cr0(vcpu); + if (((cr0 & X86_CR0_CD) == 0) && (cr0 & X86_CR0_NW)) + return -EINVAL; + + /* + * Validate host state saved from before VMRUN (see + * nested_svm_check_permissions). + * TODO: validate reserved bits for all saved state. + */ + if (!(save.cr0 & X86_CR0_PG)) + return -EINVAL; + + /* + * All checks done, we can enter guest mode. L1 control fields + * come from the nested save state. Guest state is already + * in the registers, the save area of the nested state instead + * contains saved L1 state. + */ + copy_vmcb_control_area(&hsave->control, &svm->vmcb->control); + hsave->save = save; + + svm->nested.vmcb = kvm_state->hdr.svm.vmcb_pa; + load_nested_vmcb_control(svm, &ctl); + nested_prepare_vmcb_control(svm); + +out_set_gif: + svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET)); + return 0; +} + struct kvm_x86_nested_ops svm_nested_ops = { .check_events = svm_check_nested_events, + .get_state = svm_get_nested_state, + .set_state = svm_set_nested_state, }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b4db9a980469..3871bfb40594 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1193,6 +1193,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm->avic_is_running = true; svm->nested.hsave = page_address(hsave_page); + clear_page(svm->nested.hsave); svm->msrpm = page_address(msrpm_pages); svm_vcpu_init_msrpm(svm->msrpm); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 51ebb60e1533..106fc6fceb97 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -437,11 +437,6 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, } } -static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa) -{ - return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu)); -} - static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f0fa610bed91..d4aa7dc662d5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4628,7 +4628,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (kvm_state.flags & ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE - | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING)) + | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING + | KVM_STATE_NESTED_GIF_SET)) break; /* nested_run_pending implies guest_mode. */ -- cgit v1.2.3 From 68fd66f100d196d35ab3008d4c69af3a0d7e7200 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:17 +0200 Subject: KVM: x86: extend struct kvm_vcpu_pv_apf_data with token info Currently, APF mechanism relies on the #PF abuse where the token is being passed through CR2. If we switch to using interrupts to deliver page-ready notifications we need a different way to pass the data. Extent the existing 'struct kvm_vcpu_pv_apf_data' with token information for page-ready notifications. While on it, rename 'reason' to 'flags'. This doesn't change the semantics as we only have reasons '1' and '2' and these can be treated as bit flags but KVM_PV_REASON_PAGE_READY is going away with interrupt based delivery making 'reason' name misleading. The newly introduced apf_put_user_ready() temporary puts both flags and token information, this will be changed to put token only when we switch to interrupt based notifications. 
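A stand-alone illustration of the 64-bit write that apf_put_user_ready() performs (the struct below mirrors only the two relevant uapi fields, and the split assumes a little-endian guest, which x86 is):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define DEMO_PAGE_READY 2u      /* KVM_PV_REASON_PAGE_READY */

struct demo_apf_data {
        uint32_t flags;
        uint32_t token;
};

int main(void)
{
        struct demo_apf_data apf = { 0, 0 };
        uint32_t token = 0xdeadbeef;

        /* the single guest-memory write: low 32 bits -> flags, high 32 bits -> token */
        uint64_t val = ((uint64_t)token << 32) | DEMO_PAGE_READY;

        memcpy(&apf, &val, sizeof(val));

        /* on little-endian this prints: flags = 2, token = 0xdeadbeef */
        printf("flags = %u, token = %#x\n", apf.flags, apf.token);
        return 0;
}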
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-3-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/include/asm/kvm_para.h | 4 ++-- arch/x86/include/uapi/asm/kvm_para.h | 5 +++-- arch/x86/kernel/kvm.c | 16 ++++++++-------- arch/x86/kvm/mmu/mmu.c | 6 +++--- arch/x86/kvm/svm/nested.c | 2 +- arch/x86/kvm/svm/svm.c | 3 ++- arch/x86/kvm/vmx/nested.c | 2 +- arch/x86/kvm/vmx/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 17 +++++++++++++---- 10 files changed, 36 insertions(+), 25 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3485f8454088..033f6173a857 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -770,7 +770,7 @@ struct kvm_vcpu_arch { u64 msr_val; u32 id; bool send_user_only; - u32 host_apf_reason; + u32 host_apf_flags; unsigned long nested_apf_token; bool delivery_as_pf_vmexit; } apf; diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 118e5c2379f9..57fd1966c4ea 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -90,7 +90,7 @@ unsigned int kvm_arch_para_features(void); unsigned int kvm_arch_para_hints(void); void kvm_async_pf_task_wait_schedule(u32 token); void kvm_async_pf_task_wake(u32 token); -u32 kvm_read_and_reset_pf_reason(void); +u32 kvm_read_and_reset_apf_flags(void); void kvm_disable_steal_time(void); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token); @@ -131,7 +131,7 @@ static inline unsigned int kvm_arch_para_hints(void) return 0; } -static inline u32 kvm_read_and_reset_pf_reason(void) +static inline u32 kvm_read_and_reset_apf_flags(void) { return 0; } diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 2a8e0b6b9805..d1cd5c0f431a 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -112,8 +112,9 @@ struct kvm_mmu_op_release_pt { #define KVM_PV_REASON_PAGE_READY 2 struct kvm_vcpu_pv_apf_data { - __u32 reason; - __u8 pad[60]; + __u32 flags; + __u32 token; /* Used for page ready notification only */ + __u8 pad[56]; __u32 enabled; }; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b3d9b0d7a37d..d6f22a3a1f7d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -218,23 +218,23 @@ again: } EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake); -u32 kvm_read_and_reset_pf_reason(void) +u32 kvm_read_and_reset_apf_flags(void) { - u32 reason = 0; + u32 flags = 0; if (__this_cpu_read(apf_reason.enabled)) { - reason = __this_cpu_read(apf_reason.reason); - __this_cpu_write(apf_reason.reason, 0); + flags = __this_cpu_read(apf_reason.flags); + __this_cpu_write(apf_reason.flags, 0); } - return reason; + return flags; } -EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); -NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason); +EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags); +NOKPROBE_SYMBOL(kvm_read_and_reset_apf_flags); bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token) { - u32 reason = kvm_read_and_reset_pf_reason(); + u32 reason = kvm_read_and_reset_apf_flags(); switch (reason) { case KVM_PV_REASON_PAGE_NOT_PRESENT: diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2e62a03410c7..5de1929cfc55 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4164,7 +4164,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, #endif vcpu->arch.l1tf_flush_l1d = true; - switch (vcpu->arch.apf.host_apf_reason) { 
+ switch (vcpu->arch.apf.host_apf_flags) { default: trace_kvm_page_fault(fault_address, error_code); @@ -4174,13 +4174,13 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, insn_len); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: - vcpu->arch.apf.host_apf_reason = 0; + vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wait_schedule(fault_address); local_irq_enable(); break; case KVM_PV_REASON_PAGE_READY: - vcpu->arch.apf.host_apf_reason = 0; + vcpu->arch.apf.host_apf_flags = 0; local_irq_disable(); kvm_async_pf_task_wake(fault_address); local_irq_enable(); diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 6b1049148c1b..8a6db11dcb43 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -921,7 +921,7 @@ int nested_svm_exit_special(struct vcpu_svm *svm) if (get_host_vmcb(svm)->control.intercept_exceptions & excp_bits) return NESTED_EXIT_HOST; else if (exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR && - svm->vcpu.arch.apf.host_apf_reason) + svm->vcpu.arch.apf.host_apf_flags) /* Trap async PF even if not shadowing */ return NESTED_EXIT_HOST; break; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 3871bfb40594..9e333b91ff78 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3459,7 +3459,8 @@ static fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) /* if exit due to PF check for async PF */ if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) - svm->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + svm->vcpu.arch.apf.host_apf_flags = + kvm_read_and_reset_apf_flags(); if (npt_enabled) { vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 106fc6fceb97..119a2f7395d6 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -5652,7 +5652,7 @@ static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu, u32 exit_reason) if (is_nmi(intr_info)) return true; else if (is_page_fault(intr_info)) - return vcpu->arch.apf.host_apf_reason || !enable_ept; + return vcpu->arch.apf.host_apf_flags || !enable_ept; else if (is_debug(intr_info) && vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 7b55dc6230a9..5a43af0061ef 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -4765,7 +4765,7 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu) if (is_page_fault(intr_info)) { cr2 = vmx_get_exit_qual(vcpu); /* EPT won't cause page fault directly */ - WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept); + WARN_ON_ONCE(!vcpu->arch.apf.host_apf_flags && enable_ept); return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0); } @@ -6360,7 +6360,7 @@ static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx) /* if exit due to PF check for async PF */ if (is_page_fault(intr_info)) { - vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason(); + vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags(); /* Handle machine checks before interrupts are enabled */ } else if (is_machine_check(intr_info)) { kvm_machine_check(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b9a5ff7b922c..84aa3c1519ed 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2690,7 +2690,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) } if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, - sizeof(u32))) + sizeof(u64))) return 1; 
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); @@ -10420,8 +10420,17 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) } } -static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) +static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu) { + u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT; + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason, + sizeof(reason)); +} + +static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token) +{ + u64 val = (u64)token << 32 | KVM_PV_REASON_PAGE_READY; return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, sizeof(val)); @@ -10466,7 +10475,7 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, kvm_add_async_pf_gfn(vcpu, work->arch.gfn); if (kvm_can_deliver_async_pf(vcpu) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) { + !apf_put_user_notpresent(vcpu)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; @@ -10499,7 +10508,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + !apf_put_user_ready(vcpu, work->arch.token)) { fault.vector = PF_VECTOR; fault.error_code_valid = true; fault.error_code = 0; -- cgit v1.2.3 From 7c0ade6c9023b2b90b757e2927b306bec1cc4ca6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:18 +0200 Subject: KVM: rename kvm_arch_can_inject_async_page_present() to kvm_arch_can_dequeue_async_page_present() An innocent reader of the following x86 KVM code: bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) return true; ... may get very confused: if APF mechanism is not enabled, why do we report that we 'can inject async page present'? In reality, upon injection kvm_arch_async_page_present() will check the same condition again and, in case APF is disabled, will just drop the item. This is fine as the guest which deliberately disabled APF doesn't expect to get any APF notifications. Rename kvm_arch_can_inject_async_page_present() to kvm_arch_can_dequeue_async_page_present() to make it clear what we are checking: if the item can be dequeued (meaning either injected or just dropped). On s390 kvm_arch_can_inject_async_page_present() always returns 'true' so the rename doesn't matter much. 
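To make the "dequeue vs. inject" distinction above concrete, here is a minimal standalone sketch in the spirit of the generic completion loop; all type, field and function names are invented for illustration and do not match the kernel code verbatim:

    #include <stdbool.h>

    /* Toy model: a completed async page fault waiting to be reported. */
    struct apf_done_item {
            struct apf_done_item *next;
            unsigned int token;
    };

    struct vcpu_model {
            bool apf_enabled;               /* guest set KVM_ASYNC_PF_ENABLED */
            bool can_deliver_now;           /* e.g. the guest-visible slot is free */
            struct apf_done_item *done;     /* completed, not yet reported */
    };

    /*
     * "Can dequeue": may the next completed item be taken off the list at all?
     * If the guest disabled APF the answer is yes, because the item will
     * simply be dropped by the "present" step below.
     */
    static bool can_dequeue_async_page_present(const struct vcpu_model *v)
    {
            if (!v->apf_enabled)
                    return true;
            return v->can_deliver_now;
    }

    static void async_page_present(struct vcpu_model *v, struct apf_done_item *it)
    {
            if (!v->apf_enabled)
                    return;                 /* dequeued and silently dropped */
            /* ...otherwise report the token and notify the guest... */
            (void)it;
    }

    static void check_async_pf_completion(struct vcpu_model *v)
    {
            while (v->done && can_dequeue_async_page_present(v)) {
                    struct apf_done_item *it = v->done;

                    v->done = it->next;
                    async_page_present(v, it);      /* deliver or drop */
            }
    }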
Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-4-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/s390/include/asm/kvm_host.h | 2 +- arch/s390/kvm/kvm-s390.c | 2 +- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/x86.c | 2 +- virt/kvm/async_pf.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 176f74cc2257..9d23bde1363b 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -973,7 +973,7 @@ struct kvm_arch_async_pf { unsigned long pfault_token; }; -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index a560a368f92c..06bde4bad205 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3943,7 +3943,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, /* s390 will always inject the page directly */ } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { /* * s390 will always inject the page directly, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 033f6173a857..f3897e417b69 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1660,7 +1660,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84aa3c1519ed..0e79b37b2b7e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10521,7 +10521,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu) +bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) return true; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 10b533f641a6..82e53f180a1a 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -134,7 +134,7 @@ void kvm_check_async_pf_completion(struct kvm_vcpu *vcpu) struct kvm_async_pf *work; while (!list_empty_careful(&vcpu->async_pf.done) && - kvm_arch_can_inject_async_page_present(vcpu)) { + kvm_arch_can_dequeue_async_page_present(vcpu)) { spin_lock(&vcpu->async_pf.lock); work = list_first_entry(&vcpu->async_pf.done, typeof(*work), link); -- cgit v1.2.3 From 2635b5c4a0e407b84f68e188c719f28ba0e9ae1b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:20 +0200 Subject: KVM: x86: interrupt based APF 'page ready' event delivery Concerns were expressed around APF delivery via synthetic #PF exception as in some cases such delivery may collide with real page fault. For 'page ready' notifications we can easily switch to using an interrupt instead. Introduce new MSR_KVM_ASYNC_PF_INT mechanism and deprecate the legacy one. 
One notable difference between the two mechanisms is that interrupt may not get handled immediately so whenever we would like to deliver next event (regardless of its type) we must be sure the guest had read and cleared previous event in the slot. While on it, get rid on 'type 1/type 2' names for APF events in the documentation as they are causing confusion. Use 'page not present' and 'page ready' everywhere instead. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-6-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/msr.rst | 102 ++++++++++++++++++++++++----------- arch/x86/include/asm/kvm_host.h | 4 +- arch/x86/include/uapi/asm/kvm_para.h | 12 ++++- arch/x86/kvm/x86.c | 93 +++++++++++++++++++++++--------- 4 files changed, 152 insertions(+), 59 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst index 33892036672d..be08df12f31a 100644 --- a/Documentation/virt/kvm/msr.rst +++ b/Documentation/virt/kvm/msr.rst @@ -190,41 +190,68 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02 data: - Bits 63-6 hold 64-byte aligned physical address of a - 64 byte memory area which must be in guest RAM and must be - zeroed. Bits 5-3 are reserved and should be zero. Bit 0 is 1 - when asynchronous page faults are enabled on the vcpu 0 when - disabled. Bit 1 is 1 if asynchronous page faults can be injected - when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults - are delivered to L1 as #PF vmexits. Bit 2 can be set only if - KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. - - First 4 byte of 64 byte memory location will be written to by - the hypervisor at the time of asynchronous page fault (APF) - injection to indicate type of asynchronous page fault. Value - of 1 means that the page referred to by the page fault is not - present. Value 2 means that the page is now available. Disabling - interrupt inhibits APFs. Guest must not enable interrupt - before the reason is read, or it may be overwritten by another - APF. Since APF uses the same exception vector as regular page - fault guest must reset the reason to 0 before it does - something that can generate normal page fault. If during page - fault APF reason is 0 it means that this is regular page - fault. - - During delivery of type 1 APF cr2 contains a token that will - be used to notify a guest when missing page becomes - available. When page becomes available type 2 APF is sent with - cr2 set to the token associated with the page. There is special - kind of token 0xffffffff which tells vcpu that it should wake - up all processes waiting for APFs and no individual type 2 APFs - will be sent. + Asynchronous page fault (APF) control MSR. + + Bits 63-6 hold 64-byte aligned physical address of a 64 byte memory area + which must be in guest RAM and must be zeroed. This memory is expected + to hold a copy of the following structure:: + + struct kvm_vcpu_pv_apf_data { + /* Used for 'page not present' events delivered via #PF */ + __u32 flags; + + /* Used for 'page ready' events delivered via interrupt notification */ + __u32 token; + + __u8 pad[56]; + __u32 enabled; + }; + + Bits 5-4 of the MSR are reserved and should be zero. Bit 0 is set to 1 + when asynchronous page faults are enabled on the vcpu, 0 when disabled. + Bit 1 is 1 if asynchronous page faults can be injected when vcpu is in + cpl == 0. Bit 2 is 1 if asynchronous page faults are delivered to L1 as + #PF vmexits. Bit 2 can be set only if KVM_FEATURE_ASYNC_PF_VMEXIT is + present in CPUID. 
Bit 3 enables interrupt based delivery of 'page ready' + events. + + 'Page not present' events are currently always delivered as synthetic + #PF exception. During delivery of these events APF CR2 register contains + a token that will be used to notify the guest when missing page becomes + available. Also, to make it possible to distinguish between real #PF and + APF, first 4 bytes of 64 byte memory location ('flags') will be written + to by the hypervisor at the time of injection. Only first bit of 'flags' + is currently supported, when set, it indicates that the guest is dealing + with asynchronous 'page not present' event. If during a page fault APF + 'flags' is '0' it means that this is regular page fault. Guest is + supposed to clear 'flags' when it is done handling #PF exception so the + next event can be delivered. + + Note, since APF 'page not present' events use the same exception vector + as regular page fault, guest must reset 'flags' to '0' before it does + something that can generate normal page fault. + + Bytes 5-7 of 64 byte memory location ('token') will be written to by the + hypervisor at the time of APF 'page ready' event injection. The content + of these bytes is a token which was previously delivered as 'page not + present' event. The event indicates the page in now available. Guest is + supposed to write '0' to 'token' when it is done handling 'page ready' + event so the next one can be delivered. + + Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page + ready' APF delivery needs to be written to before enabling APF mechanism + in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. + + Note, previously, 'page ready' events were delivered via the same #PF + exception as 'page not present' events but this is now deprecated. If + bit 3 (interrupt based delivery) is not set APF events are not delivered. If APF is disabled while there are outstanding APFs, they will not be delivered. - Currently type 2 APF will be always delivered on the same vcpu as - type 1 was, but guest should not rely on that. + Currently 'page ready' APF events will be always delivered on the + same vcpu as 'page not present' event was, but guest should not rely on + that. MSR_KVM_STEAL_TIME: 0x4b564d03 @@ -319,3 +346,16 @@ data: KVM guests can request the host not to poll on HLT, for example if they are performing polling themselves. + +MSR_KVM_ASYNC_PF_INT: + 0x4b564d06 + +data: + Second asynchronous page fault (APF) control MSR. + + Bits 0-7: APIC vector for delivery of 'page ready' APF events. + Bits 8-63: Reserved + + Interrupt vector for asynchnonous 'page ready' notifications delivery. + The vector has to be set up before asynchronous page fault mechanism + is enabled in MSR_KVM_ASYNC_PF_EN. 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f3897e417b69..2d39571451a0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -767,7 +767,9 @@ struct kvm_vcpu_arch { bool halted; gfn_t gfns[ASYNC_PF_PER_VCPU]; struct gfn_to_hva_cache data; - u64 msr_val; + u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */ + u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */ + u16 vec; u32 id; bool send_user_only; u32 host_apf_flags; diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index d1cd5c0f431a..1d37d616b1fc 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -50,6 +50,7 @@ #define MSR_KVM_STEAL_TIME 0x4b564d03 #define MSR_KVM_PV_EOI_EN 0x4b564d04 #define MSR_KVM_POLL_CONTROL 0x4b564d05 +#define MSR_KVM_ASYNC_PF_INT 0x4b564d06 struct kvm_steal_time { __u64 steal; @@ -81,6 +82,11 @@ struct kvm_clock_pairing { #define KVM_ASYNC_PF_ENABLED (1 << 0) #define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1) #define KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT (1 << 2) +#define KVM_ASYNC_PF_DELIVERY_AS_INT (1 << 3) + +/* MSR_KVM_ASYNC_PF_INT */ +#define KVM_ASYNC_PF_VEC_MASK GENMASK(7, 0) + /* Operations for KVM_HC_MMU_OP */ #define KVM_MMU_OP_WRITE_PTE 1 @@ -112,8 +118,12 @@ struct kvm_mmu_op_release_pt { #define KVM_PV_REASON_PAGE_READY 2 struct kvm_vcpu_pv_apf_data { + /* Used for 'page not present' events delivered via #PF */ __u32 flags; - __u32 token; /* Used for page ready notification only */ + + /* Used for 'page ready' events delivered via interrupt notification */ + __u32 token; + __u8 pad[56]; __u32 enabled; }; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e79b37b2b7e..e6f3ec5193b2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1248,7 +1248,7 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_TSC_EMULATION_STATUS, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, @@ -2673,17 +2673,24 @@ out: return r; } +static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) +{ + u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; + + return (vcpu->arch.apf.msr_en_val & mask) == mask; +} + static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) { gpa_t gpa = data & ~0x3f; - /* Bits 3:5 are reserved, Should be zero */ - if (data & 0x38) + /* Bits 4:5 are reserved, Should be zero */ + if (data & 0x30) return 1; - vcpu->arch.apf.msr_val = data; + vcpu->arch.apf.msr_en_val = data; - if (!(data & KVM_ASYNC_PF_ENABLED)) { + if (!kvm_pv_async_pf_enabled(vcpu)) { kvm_clear_async_pf_completion_queue(vcpu); kvm_async_pf_hash_reset(vcpu); return 0; @@ -2695,7 +2702,25 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + kvm_async_pf_wakeup_all(vcpu); + + return 0; +} + +static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) +{ + /* Bits 8-63 are reserved */ + if (data >> 8) + return 1; + + if (!lapic_in_kernel(vcpu)) + return 1; + + vcpu->arch.apf.msr_int_val = data; + + vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; + return 0; } @@ -2917,6 +2942,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pv_enable_async_pf(vcpu, data)) return 1; break; + case MSR_KVM_ASYNC_PF_INT: + if (kvm_pv_enable_async_pf_int(vcpu, data)) + return 
1; + break; case MSR_KVM_STEAL_TIME: if (unlikely(!sched_info_on())) @@ -3191,7 +3220,10 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vcpu->arch.time; break; case MSR_KVM_ASYNC_PF_EN: - msr_info->data = vcpu->arch.apf.msr_val; + msr_info->data = vcpu->arch.apf.msr_en_val; + break; + case MSR_KVM_ASYNC_PF_INT: + msr_info->data = vcpu->arch.apf.msr_int_val; break; case MSR_KVM_STEAL_TIME: msr_info->data = vcpu->arch.st.msr_val; @@ -9553,7 +9585,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.cr2 = 0; kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.apf.msr_val = 0; + vcpu->arch.apf.msr_en_val = 0; + vcpu->arch.apf.msr_int_val = 0; vcpu->arch.st.msr_val = 0; kvmclock_reset(vcpu); @@ -10430,10 +10463,22 @@ static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu) static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token) { - u64 val = (u64)token << 32 | KVM_PV_REASON_PAGE_READY; + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); - return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, - sizeof(val)); + return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &token, offset, sizeof(token)); +} + +static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu) +{ + unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token); + u32 val; + + if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data, + &val, offset, sizeof(val))) + return false; + + return !val; } static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) @@ -10441,9 +10486,8 @@ static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu) if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) return false; - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) || - (vcpu->arch.apf.send_user_only && - kvm_x86_ops.get_cpl(vcpu) == 0)) + if (!kvm_pv_async_pf_enabled(vcpu) || + (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0)) return false; return true; @@ -10499,7 +10543,10 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { - struct x86_exception fault; + struct kvm_lapic_irq irq = { + .delivery_mode = APIC_DM_FIXED, + .vector = vcpu->arch.apf.vec + }; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -10507,26 +10554,20 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); - if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && - !apf_put_user_ready(vcpu, work->arch.token)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - fault.async_page_fault = true; - kvm_inject_page_fault(vcpu, &fault); - } + if (kvm_pv_async_pf_enabled(vcpu) && + !apf_put_user_ready(vcpu, work->arch.token)) + kvm_apic_set_irq(vcpu, &irq, NULL); + vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { - if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED)) + if (!kvm_pv_async_pf_enabled(vcpu)) return true; else - return kvm_can_do_async_pf(vcpu); + return apf_pageready_slot_free(vcpu); } void kvm_arch_start_assignment(struct kvm *kvm) -- cgit v1.2.3 From 557a961abbe06ed9dfd3b55ef7bd6e68295cda3d Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 
25 May 2020 16:41:21 +0200 Subject: KVM: x86: acknowledgment mechanism for async pf page ready notifications If two page ready notifications happen back to back the second one is not delivered and the only mechanism we currently have is kvm_check_async_pf_completion() check in vcpu_run() loop. The check will only be performed with the next vmexit when it happens and in some cases it may take a while. With interrupt based page ready notification delivery the situation is even worse: unlike exceptions, interrupts are not handled immediately so we must check if the slot is empty. This is slow and unnecessary. Introduce dedicated MSR_KVM_ASYNC_PF_ACK MSR to communicate the fact that the slot is free and host should check its notification queue. Mandate using it for interrupt based 'page ready' APF event delivery. As kvm_check_async_pf_completion() is going away from vcpu_run() we need a way to communicate the fact that vcpu->async_pf.done queue has transitioned from empty to non-empty state. Introduce kvm_arch_async_page_present_queued() and KVM_REQ_APF_READY to do the job. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-7-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/msr.rst | 15 ++++++++++++++- arch/s390/include/asm/kvm_host.h | 2 ++ arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kvm/x86.c | 26 ++++++++++++++++++++++---- virt/kvm/async_pf.c | 10 ++++++++++ 6 files changed, 52 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst index be08df12f31a..9b107889b033 100644 --- a/Documentation/virt/kvm/msr.rst +++ b/Documentation/virt/kvm/msr.rst @@ -236,7 +236,9 @@ data: of these bytes is a token which was previously delivered as 'page not present' event. The event indicates the page in now available. Guest is supposed to write '0' to 'token' when it is done handling 'page ready' - event so the next one can be delivered. + event and to write 1' to MSR_KVM_ASYNC_PF_ACK after clearing the location; + writing to the MSR forces KVM to re-scan its queue and deliver the next + pending notification. Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page ready' APF delivery needs to be written to before enabling APF mechanism @@ -359,3 +361,14 @@ data: Interrupt vector for asynchnonous 'page ready' notifications delivery. The vector has to be set up before asynchronous page fault mechanism is enabled in MSR_KVM_ASYNC_PF_EN. + +MSR_KVM_ASYNC_PF_ACK: + 0x4b564d07 + +data: + Asynchronous page fault (APF) acknowledgment. + + When the guest is done processing 'page ready' APF event and 'token' + field in 'struct kvm_vcpu_pv_apf_data' is cleared it is supposed to + write '1' to bit 0 of the MSR, this causes the host to re-scan its queue + and check if there are more notifications pending. 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9d23bde1363b..3d554887794e 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -984,6 +984,8 @@ void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +static inline void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) {} + void kvm_arch_crypto_clear_masks(struct kvm *kvm); void kvm_arch_crypto_set_masks(struct kvm *kvm, unsigned long *apm, unsigned long *aqm, unsigned long *adm); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2d39571451a0..f345bcf38cfb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -86,6 +86,7 @@ #define KVM_REQ_TLB_FLUSH_CURRENT KVM_ARCH_REQ(26) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(27, KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ @@ -775,6 +776,7 @@ struct kvm_vcpu_arch { u32 host_apf_flags; unsigned long nested_apf_token; bool delivery_as_pf_vmexit; + bool pageready_pending; } apf; /* OSVW MSRs (AMD only) */ @@ -1662,6 +1664,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu); bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 1d37d616b1fc..7ac20df80ba8 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -51,6 +51,7 @@ #define MSR_KVM_PV_EOI_EN 0x4b564d04 #define MSR_KVM_POLL_CONTROL 0x4b564d05 #define MSR_KVM_ASYNC_PF_INT 0x4b564d06 +#define MSR_KVM_ASYNC_PF_ACK 0x4b564d07 struct kvm_steal_time { __u64 steal; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e6f3ec5193b2..c9d709a672f3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1248,7 +1248,7 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_TSC_EMULATION_STATUS, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, - MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, + MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, @@ -2946,6 +2946,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pv_enable_async_pf_int(vcpu, data)) return 1; break; + case MSR_KVM_ASYNC_PF_ACK: + if (data & 0x1) { + vcpu->arch.apf.pageready_pending = false; + kvm_check_async_pf_completion(vcpu); + } + break; case MSR_KVM_STEAL_TIME: if (unlikely(!sched_info_on())) @@ -3225,6 +3231,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_KVM_ASYNC_PF_INT: msr_info->data = vcpu->arch.apf.msr_int_val; break; + case MSR_KVM_ASYNC_PF_ACK: + msr_info->data = 0; + break; case MSR_KVM_STEAL_TIME: msr_info->data = vcpu->arch.st.msr_val; break; @@ -8413,6 +8422,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) kvm_hv_process_stimers(vcpu); if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) kvm_vcpu_update_apicv(vcpu); + if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) + kvm_check_async_pf_completion(vcpu); } if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { @@ -8664,8 +8675,6 @@ static int 
vcpu_run(struct kvm_vcpu *vcpu) break; } - kvm_check_async_pf_completion(vcpu); - if (signal_pending(current)) { r = -EINTR; vcpu->run->exit_reason = KVM_EXIT_INTR; @@ -10555,13 +10564,22 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa); if (kvm_pv_async_pf_enabled(vcpu) && - !apf_put_user_ready(vcpu, work->arch.token)) + !apf_put_user_ready(vcpu, work->arch.token)) { + vcpu->arch.apf.pageready_pending = true; kvm_apic_set_irq(vcpu, &irq, NULL); + } vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } +void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu) +{ + kvm_make_request(KVM_REQ_APF_READY, vcpu); + if (!vcpu->arch.apf.pageready_pending) + kvm_vcpu_kick(vcpu); +} + bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu) { if (!kvm_pv_async_pf_enabled(vcpu)) diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index 82e53f180a1a..f1e07fae84e9 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -51,6 +51,7 @@ static void async_pf_execute(struct work_struct *work) unsigned long addr = apf->addr; gpa_t cr2_or_gpa = apf->cr2_or_gpa; int locked = 1; + bool first; might_sleep(); @@ -69,10 +70,14 @@ static void async_pf_execute(struct work_struct *work) kvm_arch_async_page_present(vcpu, apf); spin_lock(&vcpu->async_pf.lock); + first = list_empty(&vcpu->async_pf.done); list_add_tail(&apf->link, &vcpu->async_pf.done); apf->vcpu = NULL; spin_unlock(&vcpu->async_pf.lock); + if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first) + kvm_arch_async_page_present_queued(vcpu); + /* * apf may be freed by kvm_check_async_pf_completion() after * this point @@ -201,6 +206,7 @@ retry_sync: int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) { struct kvm_async_pf *work; + bool first; if (!list_empty_careful(&vcpu->async_pf.done)) return 0; @@ -213,9 +219,13 @@ int kvm_async_pf_wakeup_all(struct kvm_vcpu *vcpu) INIT_LIST_HEAD(&work->queue); /* for list_del to work */ spin_lock(&vcpu->async_pf.lock); + first = list_empty(&vcpu->async_pf.done); list_add_tail(&work->link, &vcpu->async_pf.done); spin_unlock(&vcpu->async_pf.lock); + if (!IS_ENABLED(CONFIG_KVM_ASYNC_PF_SYNC) && first) + kvm_arch_async_page_present_queued(vcpu); + vcpu->async_pf.queued++; return 0; } -- cgit v1.2.3 From 72de5fa4c16195827252b961ba44028a39dfeaff Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Mon, 25 May 2020 16:41:22 +0200 Subject: KVM: x86: announce KVM_FEATURE_ASYNC_PF_INT Introduce new capability to indicate that KVM supports interrupt based delivery of 'page ready' APF events. This includes support for both MSR_KVM_ASYNC_PF_INT and MSR_KVM_ASYNC_PF_ACK. Signed-off-by: Vitaly Kuznetsov Message-Id: <20200525144125.143875-8-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/cpuid.rst | 6 ++++++ Documentation/virt/kvm/msr.rst | 12 ++++++++---- arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 1 + include/uapi/linux/kvm.h | 1 + 6 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/virt/kvm/cpuid.rst b/Documentation/virt/kvm/cpuid.rst index f721c89327ec..a7dff9186bed 100644 --- a/Documentation/virt/kvm/cpuid.rst +++ b/Documentation/virt/kvm/cpuid.rst @@ -86,6 +86,12 @@ KVM_FEATURE_PV_SCHED_YIELD 13 guest checks this feature bit before using paravirtualized sched yield. 
+KVM_FEATURE_ASYNC_PF_INT 14 guest checks this feature bit + before using the second async + pf control msr 0x4b564d06 and + async pf acknowledgment msr + 0x4b564d07. + KVM_FEATURE_CLOCSOURCE_STABLE_BIT 24 host will warn if no guest-side per-cpu warps are expeced in kvmclock diff --git a/Documentation/virt/kvm/msr.rst b/Documentation/virt/kvm/msr.rst index 9b107889b033..e37a14c323d2 100644 --- a/Documentation/virt/kvm/msr.rst +++ b/Documentation/virt/kvm/msr.rst @@ -213,7 +213,8 @@ data: cpl == 0. Bit 2 is 1 if asynchronous page faults are delivered to L1 as #PF vmexits. Bit 2 can be set only if KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. Bit 3 enables interrupt based delivery of 'page ready' - events. + events. Bit 3 can only be set if KVM_FEATURE_ASYNC_PF_INT is present in + CPUID. 'Page not present' events are currently always delivered as synthetic #PF exception. During delivery of these events APF CR2 register contains @@ -242,7 +243,8 @@ data: Note, MSR_KVM_ASYNC_PF_INT MSR specifying the interrupt vector for 'page ready' APF delivery needs to be written to before enabling APF mechanism - in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. + in MSR_KVM_ASYNC_PF_EN or interrupt #0 can get injected. The MSR is + available if KVM_FEATURE_ASYNC_PF_INT is present in CPUID. Note, previously, 'page ready' events were delivered via the same #PF exception as 'page not present' events but this is now deprecated. If @@ -360,7 +362,8 @@ data: Interrupt vector for asynchnonous 'page ready' notifications delivery. The vector has to be set up before asynchronous page fault mechanism - is enabled in MSR_KVM_ASYNC_PF_EN. + is enabled in MSR_KVM_ASYNC_PF_EN. The MSR is only available if + KVM_FEATURE_ASYNC_PF_INT is present in CPUID. MSR_KVM_ASYNC_PF_ACK: 0x4b564d07 @@ -371,4 +374,5 @@ data: When the guest is done processing 'page ready' APF event and 'token' field in 'struct kvm_vcpu_pv_apf_data' is cleared it is supposed to write '1' to bit 0 of the MSR, this causes the host to re-scan its queue - and check if there are more notifications pending. + and check if there are more notifications pending. The MSR is available + if KVM_FEATURE_ASYNC_PF_INT is present in CPUID. 
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 7ac20df80ba8..812e9b4c1114 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -31,6 +31,7 @@ #define KVM_FEATURE_PV_SEND_IPI 11 #define KVM_FEATURE_POLL_CONTROL 12 #define KVM_FEATURE_PV_SCHED_YIELD 13 +#define KVM_FEATURE_ASYNC_PF_INT 14 #define KVM_HINTS_REALTIME 0 diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index cd708b0b460a..a9f1905896fb 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -711,7 +711,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) | (1 << KVM_FEATURE_PV_SEND_IPI) | (1 << KVM_FEATURE_POLL_CONTROL) | - (1 << KVM_FEATURE_PV_SCHED_YIELD); + (1 << KVM_FEATURE_PV_SCHED_YIELD) | + (1 << KVM_FEATURE_ASYNC_PF_INT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c9d709a672f3..ae3a7f2fbda2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3446,6 +3446,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_ASYNC_PF_INT: case KVM_CAP_GET_TSC_KHZ: case KVM_CAP_KVMCLOCK_CTRL: case KVM_CAP_READONLY_MEM: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index ac9eba0289d1..fc0075988b80 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1018,6 +1018,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_S390_PROTECTED 180 #define KVM_CAP_PPC_SECURE_GUEST 181 #define KVM_CAP_HALT_POLL 182 +#define KVM_CAP_ASYNC_PF_INT 183 #ifdef KVM_CAP_IRQ_ROUTING -- cgit v1.2.3 From 27461da31089ace6966e4f1695cba7eb87ffbe4f Mon Sep 17 00:00:00 2001 From: Like Xu Date: Fri, 29 May 2020 15:43:45 +0800 Subject: KVM: x86/pmu: Support full width counting Intel CPUs have a new alternative MSR range (starting from MSR_IA32_PMC0) for GP counters that allows writing the full counter width. Enable this range from a new capability bit (IA32_PERF_CAPABILITIES.FW_WRITE[bit 13]). The guest would query CPUID to get the counter width, and sign-extend the counter values as needed. The traditional MSRs always limit writes to 32 bits, even though the counter internally is larger (48 or 57 bits). When the new capability is set, use the alternative range, which does not have these restrictions. This lowers the overhead of perf stat slightly because it needs fewer interrupts to accumulate the counter value.
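As a rough, self-contained sketch of the two write behaviours described above (the helper names are invented; only the sign-extension and the FW_WRITE bit come from the patch below):

    #include <stdint.h>
    #include <stdbool.h>

    #define PMU_CAP_FW_WRITES   (1ULL << 13)    /* IA32_PERF_CAPABILITIES.FW_WRITE */

    static bool guest_has_full_width_writes(uint64_t perf_capabilities)
    {
            return perf_capabilities & PMU_CAP_FW_WRITES;
    }

    /*
     * Legacy counter MSRs take only 32 bits and the value is sign-extended;
     * the alternative MSR_IA32_PMC0 range accepts the full counter width
     * advertised via CPUID (e.g. 48 bits), so the value is used as-is.
     */
    static uint64_t pmc_effective_write(uint64_t data, bool full_width_msr,
                                        unsigned int counter_width)
    {
            uint64_t mask = (counter_width < 64) ?
                            (1ULL << counter_width) - 1 : ~0ULL;

            if (!full_width_msr)
                    data = (uint64_t)(int64_t)(int32_t)data; /* legacy behaviour */

            return data & mask;
    }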
Signed-off-by: Like Xu Message-Id: <20200529074347.124619-3-like.xu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/vmx/capabilities.h | 11 +++++++++ arch/x86/kvm/vmx/pmu_intel.c | 52 +++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/vmx/vmx.c | 3 +++ arch/x86/kvm/x86.c | 2 ++ 6 files changed, 66 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index f345bcf38cfb..b878fcd164ce 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -601,6 +601,7 @@ struct kvm_vcpu_arch { u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; + u64 perf_capabilities; /* * Paging state of the vcpu diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a9f1905896fb..253b8e875ccd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -296,7 +296,7 @@ void kvm_set_cpu_caps(void) F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ | 0 /* DS-CPL, VMX, SMX, EST */ | 0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ | - F(FMA) | F(CX16) | 0 /* xTPR Update, PDCM */ | + F(FMA) | F(CX16) | 0 /* xTPR Update */ | F(PDCM) | F(PCID) | 0 /* Reserved, DCA */ | F(XMM4_1) | F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) | diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h index 8903475f751e..4bbd8b448d22 100644 --- a/arch/x86/kvm/vmx/capabilities.h +++ b/arch/x86/kvm/vmx/capabilities.h @@ -18,6 +18,8 @@ extern int __read_mostly pt_mode; #define PT_MODE_SYSTEM 0 #define PT_MODE_HOST_GUEST 1 +#define PMU_CAP_FW_WRITES (1ULL << 13) + struct nested_vmx_msrs { /* * We only store the "true" versions of the VMX capability MSRs. We @@ -367,4 +369,13 @@ static inline bool vmx_pt_mode_is_host_guest(void) return pt_mode == PT_MODE_HOST_GUEST; } +static inline u64 vmx_get_perf_capabilities(void) +{ + /* + * Since counters are virtualized, KVM would support full + * width counting unconditionally, even if the host lacks it. 
+ */ + return PMU_CAP_FW_WRITES; +} + #endif /* __KVM_X86_VMX_CAPS_H */ diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index e1a303fefc16..d33d890b605f 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -18,6 +18,8 @@ #include "nested.h" #include "pmu.h" +#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0) + static struct kvm_event_hw_type_mapping intel_arch_events[] = { /* Index must match CPUID 0x0A.EBX bit vector */ [0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES }, @@ -150,6 +152,22 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, return &counters[array_index_nospec(idx, num_counters)]; } +static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu) +{ + if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return false; + + return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES; +} + +static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr) +{ + if (!fw_writes_is_enabled(pmu_to_vcpu(pmu))) + return NULL; + + return get_gp_pmc(pmu, msr, MSR_IA32_PMC0); +} + static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); @@ -162,10 +180,13 @@ static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: ret = pmu->version > 1; break; + case MSR_IA32_PERF_CAPABILITIES: + ret = guest_cpuid_has(vcpu, X86_FEATURE_PDCM); + break; default: ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) || get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) || - get_fixed_pmc(pmu, msr); + get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr); break; } @@ -203,8 +224,15 @@ static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_CORE_PERF_GLOBAL_OVF_CTRL: msr_info->data = pmu->global_ovf_ctrl; return 0; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + return 1; + msr_info->data = vcpu->arch.perf_capabilities; + return 0; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { u64 val = pmc_read_counter(pmc); msr_info->data = val & pmu->counter_bitmask[KVM_PMC_GP]; @@ -261,9 +289,22 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 0; } break; + case MSR_IA32_PERF_CAPABILITIES: + if (!msr_info->host_initiated) + return 1; + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM) ? 
+ (data & ~vmx_get_perf_capabilities()) : data) + return 1; + vcpu->arch.perf_capabilities = data; + return 0; default: - if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) { - if (!msr_info->host_initiated) + if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) || + (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) { + if ((msr & MSR_PMC_FULL_WIDTH_BIT) && + (data & ~pmu->counter_bitmask[KVM_PMC_GP])) + return 1; + if (!msr_info->host_initiated && + !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); if (pmc->perf_event) @@ -303,6 +344,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) pmu->counter_bitmask[KVM_PMC_FIXED] = 0; pmu->version = 0; pmu->reserved_bits = 0xffffffff00200000ull; + vcpu->arch.perf_capabilities = 0; entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); if (!entry) @@ -315,6 +357,8 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu) return; perf_get_x86_pmu_capability(&x86_pmu); + if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) + vcpu->arch.perf_capabilities = vmx_get_perf_capabilities(); pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters, x86_pmu.num_counters_gp); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 5a43af0061ef..170cc76a581f 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1788,6 +1788,9 @@ static int vmx_get_msr_feature(struct kvm_msr_entry *msr) if (!nested) return 1; return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data); + case MSR_IA32_PERF_CAPABILITIES: + msr->data = vmx_get_perf_capabilities(); + return 0; default: return 1; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7e34ff784695..ca8a57312291 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1253,6 +1253,7 @@ static const u32 emulated_msrs_all[] = { MSR_IA32_TSC_ADJUST, MSR_IA32_TSCDEADLINE, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1319,6 +1320,7 @@ static const u32 msr_based_features_all[] = { MSR_F10H_DECFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, + MSR_IA32_PERF_CAPABILITIES, }; static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; -- cgit v1.2.3 From 850448f35aaf45215276468d63c91ab1e230cf06 Mon Sep 17 00:00:00 2001 From: Peter Shier Date: Tue, 26 May 2020 14:51:06 -0700 Subject: KVM: nVMX: Fix VMX preemption timer migration Add new field to hold preemption timer expiration deadline appended to struct kvm_vmx_nested_state_hdr. This is to prevent the first VM-Enter after migration from incorrectly restarting the timer with the full timer value instead of partially decayed timer value. KVM_SET_NESTED_STATE restarts timer using migrated state regardless of whether L1 sets VM_EXIT_SAVE_VMX_PREEMPTION_TIMER. 
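In short, the fix records an absolute deadline (in L1-scaled TSC units) and re-arms the timer with only the remaining time after migration; a standalone sketch with invented names, following the logic of the patch below:

    #include <stdint.h>
    #include <stdbool.h>

    struct nested_timer_model {
            bool has_deadline;      /* carried across migration in the new header field */
            uint64_t deadline;      /* absolute, in L1-scaled TSC units */
    };

    /*
     * First VM-Enter: arm with the full vmcs12 value and remember when it
     * expires.  After migration: re-arm only the not-yet-elapsed part, or 0
     * if the deadline has already passed.
     */
    static uint64_t calc_preemption_timer_value(struct nested_timer_model *t,
                                                uint64_t l1_scaled_tsc,
                                                uint64_t vmcs12_timer_value)
    {
            if (!t->has_deadline) {
                    t->deadline = l1_scaled_tsc + vmcs12_timer_value;
                    t->has_deadline = true;
                    return vmcs12_timer_value;
            }

            return (l1_scaled_tsc < t->deadline) ?
                   t->deadline - l1_scaled_tsc : 0;
    }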
Fixes: cf8b84f48a593 ("kvm: nVMX: Prepare for checkpointing L2 state") Signed-off-by: Peter Shier Signed-off-by: Makarand Sonare Message-Id: <20200526215107.205814-2-makarandsonare@google.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 4 +++ arch/x86/include/uapi/asm/kvm.h | 3 +++ arch/x86/kvm/vmx/nested.c | 55 +++++++++++++++++++++++++++++++++++------ arch/x86/kvm/vmx/vmx.h | 2 ++ 4 files changed, 56 insertions(+), 8 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index d0569db2b1e2..db22e834ce2a 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -4334,9 +4334,13 @@ Errors: #define KVM_STATE_NESTED_VMX_SMM_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_VMX_SMM_VMXON 0x00000002 +#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 + struct kvm_vmx_nested_state_hdr { + __u32 flags; __u64 vmxon_pa; __u64 vmcs12_pa; + __u64 preemption_timer_deadline; struct { __u16 flags; diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 12075a9de1c1..17c5a038f42d 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -400,6 +400,7 @@ struct kvm_sync_regs { #define KVM_STATE_NESTED_SVM_VMCB_SIZE 0x1000 +#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; @@ -407,8 +408,10 @@ struct kvm_vmx_nested_state_data { }; struct kvm_vmx_nested_state_hdr { + __u32 flags; __u64 vmxon_pa; __u64 vmcs12_pa; + __u64 preemption_timer_deadline; struct { __u16 flags; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 119a2f7395d6..da87bb8670bb 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2087,9 +2087,29 @@ static enum hrtimer_restart vmx_preemption_timer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; } -static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu) +static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u64 timer_value = 0; + + u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >> + VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE; + + if (!vmx->nested.has_preemption_timer_deadline) { + timer_value = vmcs12->vmx_preemption_timer_value; + vmx->nested.preemption_timer_deadline = timer_value + + l1_scaled_tsc; + vmx->nested.has_preemption_timer_deadline = true; + } else if (l1_scaled_tsc < vmx->nested.preemption_timer_deadline) + timer_value = vmx->nested.preemption_timer_deadline - + l1_scaled_tsc; + return timer_value; +} + +static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu, + u64 preemption_timeout) { - u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value; struct vcpu_vmx *vmx = to_vmx(vcpu); /* @@ -3348,8 +3368,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, * the timer. */ vmx->nested.preemption_timer_expired = false; - if (nested_cpu_has_preemption_timer(vmcs12)) - vmx_start_preemption_timer(vcpu); + if (nested_cpu_has_preemption_timer(vmcs12)) { + u64 timer_value = vmx_calc_preemption_timer_value(vcpu); + vmx_start_preemption_timer(vcpu, timer_value); + } /* * Note no nested_vmx_succeed or nested_vmx_fail here. At this point @@ -3457,6 +3479,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the nested entry. 
*/ vmx->nested.nested_run_pending = 1; + vmx->nested.has_preemption_timer_deadline = false; status = nested_vmx_enter_non_root_mode(vcpu, true); if (unlikely(status != NVMX_VMENTRY_SUCCESS)) goto vmentry_failed; @@ -3957,9 +3980,10 @@ static void sync_vmcs02_to_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE; if (nested_cpu_has_preemption_timer(vmcs12) && - vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER) - vmcs12->vmx_preemption_timer_value = - vmx_get_preemption_timer_value(vcpu); + vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER && + !vmx->nested.nested_run_pending) + vmcs12->vmx_preemption_timer_value = + vmx_get_preemption_timer_value(vcpu); /* * In some cases (usually, nested EPT), L2 is allowed to change its @@ -5891,8 +5915,10 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, .flags = 0, .format = KVM_STATE_NESTED_FORMAT_VMX, .size = sizeof(kvm_state), + .hdr.vmx.flags = 0, .hdr.vmx.vmxon_pa = -1ull, .hdr.vmx.vmcs12_pa = -1ull, + .hdr.vmx.preemption_timer_deadline = 0, }; struct kvm_vmx_nested_state_data __user *user_vmx_nested_state = &user_kvm_nested_state->data.vmx[0]; @@ -5934,6 +5960,14 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, if (vmx->nested.mtf_pending) kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING; + + if (nested_cpu_has_preemption_timer(vmcs12) && + vmx->nested.has_preemption_timer_deadline) { + kvm_state.hdr.vmx.flags |= + KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE; + kvm_state.hdr.vmx.preemption_timer_deadline = + vmx->nested.preemption_timer_deadline; + } } } @@ -5979,7 +6013,6 @@ static int vmx_get_nested_state(struct kvm_vcpu *vcpu, get_shadow_vmcs12(vcpu), VMCS12_SIZE)) return -EFAULT; } - out: return kvm_state.size; } @@ -6141,6 +6174,12 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, goto error_guest_mode; } + if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) { + vmx->nested.has_preemption_timer_deadline = true; + vmx->nested.preemption_timer_deadline = + kvm_state->hdr.vmx.preemption_timer_deadline; + } + if (nested_vmx_check_controls(vcpu, vmcs12) || nested_vmx_check_host_state(vcpu, vmcs12) || nested_vmx_check_guest_state(vcpu, vmcs12, &ignored)) diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 298ddef79d00..672c28f17e49 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -169,6 +169,8 @@ struct nested_vmx { u16 posted_intr_nv; struct hrtimer preemption_timer; + u64 preemption_timer_deadline; + bool has_preemption_timer_deadline; bool preemption_timer_expired; /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ -- cgit v1.2.3 From 22ad0026d0978d4211eb07a2be77f49fb40d86eb Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Fri, 29 May 2020 16:45:39 +0300 Subject: x86/hyper-v: Add synthetic debugger definitions Hyper-V synthetic debugger has two modes, one that uses MSRs and the other that use Hypercalls. Add all the required definitions to both types of synthetic debugger interface. Some of the required new CPUIDs and MSRs are not documented in the TLFS so they are in hyperv.h instead. The reason they are not documented is because they are subjected to be removed in future versions of Windows. 
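A purely illustrative guess at how a guest-side debug transport might choose between the two modes using the bits defined in this patch; the actual negotiation is up to kdvm/kdnet and is not specified here:

    #include <stdint.h>

    #define HV_FEATURE_DEBUG_MSRS_AVAILABLE (1u << 11)  /* CPUID features, EDX */
    #define HV_X64_SYNDBG_OPTION_USE_HCALLS (1u << 2)   /* MSR_SYNDBG_OPTIONS */

    enum syndbg_mode {
            SYNDBG_UNAVAILABLE,
            SYNDBG_MSR_BASED,    /* send/recv via the HV_X64_MSR_SYNDBG_* MSRs */
            SYNDBG_HCALL_BASED,  /* send/recv via POST/RETRIEVE_DEBUG_DATA hypercalls */
    };

    /* Illustrative only: pick a transport from the advertised bits. */
    static enum syndbg_mode pick_syndbg_mode(uint32_t hv_features_edx,
                                             uint64_t syndbg_options_msr)
    {
            if (!(hv_features_edx & HV_FEATURE_DEBUG_MSRS_AVAILABLE))
                    return SYNDBG_UNAVAILABLE;

            if (syndbg_options_msr & HV_X64_SYNDBG_OPTION_USE_HCALLS)
                    return SYNDBG_HCALL_BASED;

            return SYNDBG_MSR_BASED;
    }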
Reviewed-by: Michael Kelley Signed-off-by: Jon Doron Message-Id: <20200529134543.1127440-3-arilou@gmail.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/hyperv-tlfs.h | 6 ++++++ arch/x86/kvm/hyperv.h | 27 +++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h index 29336574d0bc..53ef6b7bd380 100644 --- a/arch/x86/include/asm/hyperv-tlfs.h +++ b/arch/x86/include/asm/hyperv-tlfs.h @@ -131,6 +131,8 @@ #define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE BIT(8) /* Crash MSR available */ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE BIT(10) +/* Support for debug MSRs available */ +#define HV_FEATURE_DEBUG_MSRS_AVAILABLE BIT(11) /* stimer Direct Mode is available */ #define HV_STIMER_DIRECT_MODE_AVAILABLE BIT(19) @@ -376,6 +378,9 @@ struct hv_tsc_emulation_status { #define HVCALL_SEND_IPI_EX 0x0015 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d +#define HVCALL_POST_DEBUG_DATA 0x0069 +#define HVCALL_RETRIEVE_DEBUG_DATA 0x006a +#define HVCALL_RESET_DEBUG_SESSION 0x006b #define HVCALL_RETARGET_INTERRUPT 0x007e #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af #define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_LIST 0x00b0 @@ -422,6 +427,7 @@ enum HV_GENERIC_SET_FORMAT { #define HV_STATUS_INVALID_HYPERCALL_INPUT 3 #define HV_STATUS_INVALID_ALIGNMENT 4 #define HV_STATUS_INVALID_PARAMETER 5 +#define HV_STATUS_OPERATION_DENIED 8 #define HV_STATUS_INSUFFICIENT_MEMORY 11 #define HV_STATUS_INVALID_PORT_ID 17 #define HV_STATUS_INVALID_CONNECTION_ID 18 diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 757cb578101c..7f50ff0bad07 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -23,6 +23,33 @@ #include +/* + * The #defines related to the synthetic debugger are required by KDNet, but + * they are not documented in the Hyper-V TLFS because the synthetic debugger + * functionality has been deprecated and is subject to removal in future + * versions of Windows. + */ +#define HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS 0x40000080 +#define HYPERV_CPUID_SYNDBG_INTERFACE 0x40000081 +#define HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES 0x40000082 + +/* + * Hyper-V synthetic debugger platform capabilities + * These are HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX bits. + */ +#define HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING BIT(1) + +/* Hyper-V Synthetic debug options MSR */ +#define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1 +#define HV_X64_MSR_SYNDBG_STATUS 0x400000F2 +#define HV_X64_MSR_SYNDBG_SEND_BUFFER 0x400000F3 +#define HV_X64_MSR_SYNDBG_RECV_BUFFER 0x400000F4 +#define HV_X64_MSR_SYNDBG_PENDING_BUFFER 0x400000F5 +#define HV_X64_MSR_SYNDBG_OPTIONS 0x400000FF + +/* Hyper-V HV_X64_MSR_SYNDBG_OPTIONS bits */ +#define HV_X64_SYNDBG_OPTION_USE_HCALLS BIT(2) + static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu) { return &vcpu->arch.hyperv; -- cgit v1.2.3 From f97f5a56f5977311f3833056a73cdbb0ee56cb1e Mon Sep 17 00:00:00 2001 From: Jon Doron Date: Fri, 29 May 2020 16:45:40 +0300 Subject: x86/kvm/hyper-v: Add support for synthetic debugger interface Add support for Hyper-V synthetic debugger (syndbg) interface. The syndbg interface is using MSRs to emulate a way to send/recv packets data. The debug transport dll (kdvm/kdnet) will identify if Hyper-V is enabled and if it supports the synthetic debugger interface it will attempt to use it, instead of trying to initialize a network adapter. 
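A minimal sketch of what the VMM side of the new exit could look like, modelled on the exit layout added below; the struct and handler names are illustrative, not a real userspace API:

    #include <stdint.h>

    #define HV_X64_MSR_SYNDBG_CONTROL 0x400000F1

    /* Mirrors the shape of kvm_run->hyperv.u.syndbg from the patch below. */
    struct syndbg_exit_model {
            uint32_t msr;           /* MSR access that triggered the exit */
            uint64_t control;
            uint64_t status;        /* written back for SYNDBG_CONTROL exits */
            uint64_t send_page;     /* guest physical addresses of the buffers */
            uint64_t recv_page;
            uint64_t pending_page;
    };

    /*
     * Illustrative handler: on a CONTROL exit the VMM consumes the send
     * buffer (or fills the receive buffer) and leaves a result in ->status,
     * which KVM copies into the guest-visible status MSR on resume.
     */
    static void handle_syndbg_exit(struct syndbg_exit_model *e)
    {
            if (e->msr == HV_X64_MSR_SYNDBG_CONTROL) {
                    /* ...process the buffer at e->send_page / e->recv_page... */
                    e->status = 0;  /* e.g. report success */
            }
            /* Pending-page updates only tell the VMM the new location. */
    }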
Reviewed-by: Vitaly Kuznetsov Signed-off-by: Jon Doron Message-Id: <20200529134543.1127440-4-arilou@gmail.com> Signed-off-by: Paolo Bonzini --- Documentation/virt/kvm/api.rst | 16 ++++ arch/x86/include/asm/kvm_host.h | 13 ++++ arch/x86/kvm/hyperv.c | 158 +++++++++++++++++++++++++++++++++++++++- arch/x86/kvm/hyperv.h | 5 ++ arch/x86/kvm/trace.h | 51 +++++++++++++ arch/x86/kvm/x86.c | 8 ++ include/uapi/linux/kvm.h | 10 +++ 7 files changed, 258 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index db22e834ce2a..aad60be4884e 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -5070,6 +5070,7 @@ EOI was received. struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 + #define KVM_EXIT_HYPERV_SYNDBG 3 __u32 type; __u32 pad1; union { @@ -5085,6 +5086,15 @@ EOI was received. __u64 result; __u64 params[2]; } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; } u; }; /* KVM_EXIT_HYPERV */ @@ -5101,6 +5111,12 @@ Hyper-V SynIC state change. Notification is used to remap SynIC event/message pages and to enable/disable SynIC messages/events processing in userspace. + - KVM_EXIT_HYPERV_SYNDBG -- synchronously notify user-space about + +Hyper-V Synthetic debugger state change. Notification is used to either update +the pending_page location or to send a control command (send the buffer located +in send_page or recv a buffer to recv_page). + :: /* KVM_EXIT_ARM_NISV */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b878fcd164ce..58337a25396a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -863,6 +863,18 @@ struct kvm_apic_map { struct kvm_lapic *phys_map[]; }; +/* Hyper-V synthetic debugger (SynDbg)*/ +struct kvm_hv_syndbg { + struct { + u64 control; + u64 status; + u64 send_page; + u64 recv_page; + u64 pending_page; + } control; + u64 options; +}; + /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; @@ -886,6 +898,7 @@ struct kvm_hv { atomic_t num_mismatched_vp_indexes; struct hv_partition_assist_pg *hv_pa_pg; + struct kvm_hv_syndbg hv_syndbg; }; enum kvm_irqchip_mode { diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index f9d3b919823c..c21f99357ad5 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -21,6 +21,7 @@ #include "x86.h" #include "lapic.h" #include "ioapic.h" +#include "cpuid.h" #include "hyperv.h" #include @@ -266,6 +267,123 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic, return ret; } +static bool kvm_hv_is_syndbg_enabled(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + + entry = kvm_find_cpuid_entry(vcpu, + HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES, + 0); + if (!entry) + return false; + + return entry->eax & HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; +} + +static int kvm_hv_syndbg_complete_userspace(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + struct kvm_hv *hv = &kvm->arch.hyperv; + + if (vcpu->run->hyperv.u.syndbg.msr == HV_X64_MSR_SYNDBG_CONTROL) + hv->hv_syndbg.control.status = + vcpu->run->hyperv.u.syndbg.status; + return 1; +} + +static void syndbg_exit(struct kvm_vcpu *vcpu, u32 msr) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv; + + hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNDBG; + hv_vcpu->exit.u.syndbg.msr = msr; 
+ hv_vcpu->exit.u.syndbg.control = syndbg->control.control; + hv_vcpu->exit.u.syndbg.send_page = syndbg->control.send_page; + hv_vcpu->exit.u.syndbg.recv_page = syndbg->control.recv_page; + hv_vcpu->exit.u.syndbg.pending_page = syndbg->control.pending_page; + vcpu->arch.complete_userspace_io = + kvm_hv_syndbg_complete_userspace; + + kvm_make_request(KVM_REQ_HV_EXIT, vcpu); +} + +static int syndbg_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + trace_kvm_hv_syndbg_set_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, data); + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + syndbg->control.control = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_STATUS: + syndbg->control.status = data; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + syndbg->control.send_page = data; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + syndbg->control.recv_page = data; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + syndbg->control.pending_page = data; + if (!host) + syndbg_exit(vcpu, msr); + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + syndbg->options = data; + break; + default: + break; + } + + return 0; +} + +static int syndbg_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) +{ + struct kvm_hv_syndbg *syndbg = vcpu_to_hv_syndbg(vcpu); + + if (!kvm_hv_is_syndbg_enabled(vcpu) && !host) + return 1; + + switch (msr) { + case HV_X64_MSR_SYNDBG_CONTROL: + *pdata = syndbg->control.control; + break; + case HV_X64_MSR_SYNDBG_STATUS: + *pdata = syndbg->control.status; + break; + case HV_X64_MSR_SYNDBG_SEND_BUFFER: + *pdata = syndbg->control.send_page; + break; + case HV_X64_MSR_SYNDBG_RECV_BUFFER: + *pdata = syndbg->control.recv_page; + break; + case HV_X64_MSR_SYNDBG_PENDING_BUFFER: + *pdata = syndbg->control.pending_page; + break; + case HV_X64_MSR_SYNDBG_OPTIONS: + *pdata = syndbg->options; + break; + default: + break; + } + + trace_kvm_hv_syndbg_get_msr(vcpu->vcpu_id, + vcpu_to_hv_vcpu(vcpu)->vp_index, msr, + *pdata); + + return 0; +} + static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata, bool host) { @@ -800,6 +918,8 @@ static bool kvm_hv_msr_partition_wide(u32 msr) case HV_X64_MSR_REENLIGHTENMENT_CONTROL: case HV_X64_MSR_TSC_EMULATION_CONTROL: case HV_X64_MSR_TSC_EMULATION_STATUS: + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: r = true; break; } @@ -1061,6 +1181,9 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data, if (!host) return 1; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_set_msr(vcpu, msr, data, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); @@ -1190,7 +1313,8 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 0; } -static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, + bool host) { u64 data = 0; struct kvm *kvm = vcpu->kvm; @@ -1227,6 +1351,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case HV_X64_MSR_TSC_EMULATION_STATUS: data = hv->hv_tsc_emulation_status; break; + case HV_X64_MSR_SYNDBG_OPTIONS: + case HV_X64_MSR_SYNDBG_CONTROL ... 
HV_X64_MSR_SYNDBG_PENDING_BUFFER: + return syndbg_get_msr(vcpu, msr, pdata, host); default: vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1316,7 +1443,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) int r; mutex_lock(&vcpu->kvm->arch.hyperv.hv_lock); - r = kvm_hv_get_msr_pw(vcpu, msr, pdata); + r = kvm_hv_get_msr_pw(vcpu, msr, pdata, host); mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock); return r; } else @@ -1795,6 +1922,9 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, { .function = HYPERV_CPUID_FEATURES }, { .function = HYPERV_CPUID_ENLIGHTMENT_INFO }, { .function = HYPERV_CPUID_IMPLEMENT_LIMITS }, + { .function = HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS }, + { .function = HYPERV_CPUID_SYNDBG_INTERFACE }, + { .function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES }, { .function = HYPERV_CPUID_NESTED_FEATURES }, }; int i, nent = ARRAY_SIZE(cpuid_entries); @@ -1820,7 +1950,7 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, case HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS: memcpy(signature, "Linux KVM Hv", 12); - ent->eax = HYPERV_CPUID_NESTED_FEATURES; + ent->eax = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES; ent->ebx = signature[0]; ent->ecx = signature[1]; ent->edx = signature[2]; @@ -1859,6 +1989,10 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->edx |= HV_FEATURE_FREQUENCY_MSRS_AVAILABLE; ent->edx |= HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE; + ent->ebx |= HV_X64_DEBUGGING; + ent->edx |= HV_X64_GUEST_DEBUGGING_AVAILABLE; + ent->edx |= HV_FEATURE_DEBUG_MSRS_AVAILABLE; + /* * Direct Synthetic timers only make sense with in-kernel * LAPIC @@ -1902,6 +2036,24 @@ int kvm_vcpu_ioctl_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, break; + case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS: + memcpy(signature, "Linux KVM Hv", 12); + + ent->eax = 0; + ent->ebx = signature[0]; + ent->ecx = signature[1]; + ent->edx = signature[2]; + break; + + case HYPERV_CPUID_SYNDBG_INTERFACE: + memcpy(signature, "VS#1\0\0\0\0\0\0\0\0", 12); + ent->eax = signature[0]; + break; + + case HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES: + ent->eax |= HV_X64_SYNDBG_CAP_ALLOW_KERNEL_DEBUGGING; + break; + default: break; } diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h index 7f50ff0bad07..e68c6c2e9649 100644 --- a/arch/x86/kvm/hyperv.h +++ b/arch/x86/kvm/hyperv.h @@ -73,6 +73,11 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic) return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic)); } +static inline struct kvm_hv_syndbg *vcpu_to_hv_syndbg(struct kvm_vcpu *vcpu) +{ + return &vcpu->kvm->arch.hyperv.hv_syndbg; +} + int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host); int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 54a10c98d746..b66432b015d2 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1541,6 +1541,57 @@ TRACE_EVENT(kvm_nested_vmenter_failed, __print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS)) ); +/* + * Tracepoint for syndbg_set_msr. 
+ */ +TRACE_EVENT(kvm_hv_syndbg_set_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); + +/* + * Tracepoint for syndbg_get_msr. + */ +TRACE_EVENT(kvm_hv_syndbg_get_msr, + TP_PROTO(int vcpu_id, u32 vp_index, u32 msr, u64 data), + TP_ARGS(vcpu_id, vp_index, msr, data), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(u32, vp_index) + __field(u32, msr) + __field(u64, data) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->vp_index = vp_index; + __entry->msr = msr; + __entry->data = data; + ), + + TP_printk("vcpu_id %d vp_index %u msr 0x%x data 0x%llx", + __entry->vcpu_id, __entry->vp_index, __entry->msr, + __entry->data) +); #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca8a57312291..9e41b5135340 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1246,6 +1246,10 @@ static const u32 emulated_msrs_all[] = { HV_X64_MSR_VP_ASSIST_PAGE, HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL, HV_X64_MSR_TSC_EMULATION_STATUS, + HV_X64_MSR_SYNDBG_OPTIONS, + HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS, + HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER, + HV_X64_MSR_SYNDBG_PENDING_BUFFER, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME, MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK, @@ -3011,6 +3015,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) */ break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: @@ -3272,6 +3278,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x20000000; break; case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: + case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: + case HV_X64_MSR_SYNDBG_OPTIONS: case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: case HV_X64_MSR_CRASH_CTL: case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cc04aceb8793..6721eb563eda 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -188,6 +188,7 @@ struct kvm_s390_cmma_log { struct kvm_hyperv_exit { #define KVM_EXIT_HYPERV_SYNIC 1 #define KVM_EXIT_HYPERV_HCALL 2 +#define KVM_EXIT_HYPERV_SYNDBG 3 __u32 type; __u32 pad1; union { @@ -203,6 +204,15 @@ struct kvm_hyperv_exit { __u64 result; __u64 params[2]; } hcall; + struct { + __u32 msr; + __u32 pad2; + __u64 control; + __u64 status; + __u64 send_page; + __u64 recv_page; + __u64 pending_page; + } syndbg; } u; }; -- cgit v1.2.3 From 78bb17f76edc3959152334947deec4dcb56e3764 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:50:15 -0700 Subject: x86/hyperv: use vmalloc_exec for the hypercall page Patch series "decruft the vmalloc API", v2. 
Peter noticed that with some dumb luck you can toast the kernel address space with exported vmalloc symbols. I used this as an opportunity to decruft the vmalloc.c API and make it much more systematic. This also removes any chance to create vmalloc mappings outside the designated areas or using executable permissions from modules. Besides that it removes more than 300 lines of code. This patch (of 29): Use the designated helper for allocating executable kernel memory, and remove the now unused PAGE_KERNEL_RX define. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Michael Kelley Acked-by: Wei Liu Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Catalin Marinas Cc: Will Deacon Cc: Heiko Carstens Cc: Vasily Gorbik Link: http://lkml.kernel.org/r/20200414131348.444715-1-hch@lst.de Link: http://lkml.kernel.org/r/20200414131348.444715-2-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/hyperv/hv_init.c | 2 +- arch/x86/include/asm/pgtable_types.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index acf76b466db6..697ddd2afef9 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -379,7 +379,7 @@ void __init hyperv_init(void) guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + hv_hypercall_pg = vmalloc_exec(PAGE_SIZE); if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); goto remove_cpuhp_state; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b6606fe6cfdf..947867f112ea 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -194,7 +194,6 @@ enum page_cache_mode { #define _PAGE_TABLE_NOENC (__PP|__RW|_USR|___A| 0|___D| 0| 0) #define _PAGE_TABLE (__PP|__RW|_USR|___A| 0|___D| 0| 0| _ENC) #define __PAGE_KERNEL_RO (__PP| 0| 0|___A|__NX|___D| 0|___G) -#define __PAGE_KERNEL_RX (__PP| 0| 0|___A| 0|___D| 0|___G) #define __PAGE_KERNEL_NOCACHE (__PP|__RW| 0|___A|__NX|___D| 0|___G| __NC) #define __PAGE_KERNEL_VVAR (__PP| 0|_USR|___A|__NX|___D| 0|___G) #define __PAGE_KERNEL_LARGE (__PP|__RW| 0|___A|__NX|___D|_PSE|___G) @@ -220,7 +219,6 @@ enum page_cache_mode { #define PAGE_KERNEL_RO __pgprot_mask(__PAGE_KERNEL_RO | _ENC) #define PAGE_KERNEL_EXEC __pgprot_mask(__PAGE_KERNEL_EXEC | _ENC) #define PAGE_KERNEL_EXEC_NOENC __pgprot_mask(__PAGE_KERNEL_EXEC | 0) -#define PAGE_KERNEL_RX __pgprot_mask(__PAGE_KERNEL_RX | _ENC) #define PAGE_KERNEL_NOCACHE __pgprot_mask(__PAGE_KERNEL_NOCACHE | _ENC) #define PAGE_KERNEL_LARGE __pgprot_mask(__PAGE_KERNEL_LARGE | _ENC) #define PAGE_KERNEL_LARGE_EXEC __pgprot_mask(__PAGE_KERNEL_LARGE_EXEC | _ENC) -- cgit v1.2.3 From cca98e9f8b5ebcd9640846a675172578249b11a0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:32 -0700 Subject: mm: enforce that vmap can't map pages executable To help enforcing the W^X protection don't allow remapping existing pages as executable. x86 bits from Peter Zijlstra, arm64 bits from Mark Rutland. 
Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Mark Rutland . Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-20-hch@lst.de Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/pgtable.h | 3 +++ arch/x86/include/asm/pgtable_types.h | 6 ++++++ include/asm-generic/pgtable.h | 4 ++++ mm/vmalloc.c | 2 +- 4 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 538c85e62f86..47095216d6a8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -407,6 +407,9 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) +#define pgprot_nx(prot) \ + __pgprot_modify(prot, 0, PTE_PXN) + /* * Mark the prot value as uncacheable and unbufferable. */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 947867f112ea..2e7c442cc618 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -282,6 +282,12 @@ typedef struct pgprot { pgprotval_t pgprot; } pgprot_t; typedef struct { pgdval_t pgd; } pgd_t; +static inline pgprot_t pgprot_nx(pgprot_t prot) +{ + return __pgprot(pgprot_val(prot) | _PAGE_NX); +} +#define pgprot_nx pgprot_nx + #ifdef CONFIG_X86_PAE /* diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 329b8c8ca703..8c5f9c29698b 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -491,6 +491,10 @@ static inline int arch_unmap_one(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif +#ifndef pgprot_nx +#define pgprot_nx(prot) (prot) +#endif + #ifndef pgprot_noncached #define pgprot_noncached(prot) (prot) #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9848156a1c6a..4d7c7108181a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2391,7 +2391,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_kernel_range((unsigned long)area->addr, size, prot, + if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot), pages) < 0) { vunmap(area->addr); return NULL; -- cgit v1.2.3 From 88dca4ca5a93d2c09e5bbc6a62fbfc3af83c4fca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:40 -0700 Subject: mm: remove the pgprot argument to __vmalloc The pgprot argument to __vmalloc is always PAGE_KERNEL now, so remove it. Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Reviewed-by: Michael Kelley [hyperv] Acked-by: Gao Xiang [erofs] Acked-by: Peter Zijlstra (Intel) Acked-by: Wei Liu Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. 
Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-22-hch@lst.de Signed-off-by: Linus Torvalds --- arch/x86/hyperv/hv_init.c | 3 +-- arch/x86/include/asm/kvm_host.h | 3 +-- arch/x86/kvm/svm/sev.c | 3 +-- drivers/block/drbd/drbd_bitmap.c | 4 +--- drivers/gpu/drm/etnaviv/etnaviv_dump.c | 4 ++-- drivers/lightnvm/pblk-init.c | 5 ++--- drivers/md/dm-bufio.c | 4 ++-- drivers/mtd/ubi/io.c | 4 ++-- drivers/scsi/sd_zbc.c | 3 +-- fs/gfs2/dir.c | 9 ++++----- fs/gfs2/quota.c | 2 +- fs/nfs/blocklayout/extent_tree.c | 2 +- fs/ntfs/malloc.h | 2 +- fs/ubifs/debug.c | 2 +- fs/ubifs/lprops.c | 2 +- fs/ubifs/lpt_commit.c | 4 ++-- fs/ubifs/orphan.c | 2 +- fs/xfs/kmem.c | 2 +- include/linux/vmalloc.h | 2 +- kernel/bpf/core.c | 6 +++--- kernel/groups.c | 2 +- kernel/module.c | 3 +-- mm/nommu.c | 15 +++++++-------- mm/page_alloc.c | 2 +- mm/percpu.c | 2 +- mm/vmalloc.c | 4 ++-- net/bridge/netfilter/ebtables.c | 6 ++---- sound/core/memalloc.c | 2 +- sound/core/pcm_memory.c | 2 +- 29 files changed, 47 insertions(+), 59 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 697ddd2afef9..e2137070386a 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -97,8 +97,7 @@ static int hv_cpu_init(unsigned int cpu) * not be stopped in the case of CPU offlining and the VM will hang. */ if (!*hvp) { - *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + *hvp = __vmalloc(PAGE_SIZE, GFP_KERNEL | __GFP_ZERO); } if (*hvp) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0a6b35353fc7..e94b3de564d6 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1279,8 +1279,7 @@ extern struct kmem_cache *x86_fpu_cache; #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { - return __vmalloc(kvm_x86_ops.vm_size, - GFP_KERNEL_ACCOUNT | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } void kvm_arch_free_vm(struct kvm *kvm); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 89f7f3aebd31..5573a97f1520 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -336,8 +336,7 @@ static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, /* Avoid using vmalloc for smaller buffers. 
*/ size = npages * sizeof(struct page *); if (size > PAGE_SIZE) - pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO, - PAGE_KERNEL); + pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); else pages = kmalloc(size, GFP_KERNEL_ACCOUNT); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 15e99697234a..df53dca5d02c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -396,9 +396,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) bytes = sizeof(struct page *)*want; new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN); if (!new_pages) { - new_pages = __vmalloc(bytes, - GFP_NOIO | __GFP_ZERO, - PAGE_KERNEL); + new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO); if (!new_pages) return NULL; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_dump.c b/drivers/gpu/drm/etnaviv/etnaviv_dump.c index 648cf0207309..706af0304ca4 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_dump.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_dump.c @@ -154,8 +154,8 @@ void etnaviv_core_dump(struct etnaviv_gem_submit *submit) file_size += sizeof(*iter.hdr) * n_obj; /* Allocate the file in vmalloc memory, it's likely to be big */ - iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, - PAGE_KERNEL); + iter.start = __vmalloc(file_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_NORETRY); if (!iter.start) { mutex_unlock(&gpu->mmu_context->lock); dev_warn(gpu->dev, "failed to allocate devcoredump file\n"); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 9a967a2e83dd..6e677ff62cc9 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -145,9 +145,8 @@ static int pblk_l2p_init(struct pblk *pblk, bool factory_init) int ret = 0; map_size = pblk_trans_map_size(pblk); - pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN - | __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM, - PAGE_KERNEL); + pblk->trans_map = __vmalloc(map_size, GFP_KERNEL | __GFP_NOWARN | + __GFP_RETRY_MAYFAIL | __GFP_HIGHMEM); if (!pblk->trans_map) { pblk_err(pblk, "failed to allocate L2P (need %zu of memory)\n", map_size); diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 2d519c223562..d1786cfd7f22 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -400,13 +400,13 @@ static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask, */ if (gfp_mask & __GFP_NORETRY) { unsigned noio_flag = memalloc_noio_save(); - void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + void *ptr = __vmalloc(c->block_size, gfp_mask); memalloc_noio_restore(noio_flag); return ptr; } - return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL); + return __vmalloc(c->block_size, gfp_mask); } /* diff --git a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c index b57b84fb97d0..14d890b00d2c 100644 --- a/drivers/mtd/ubi/io.c +++ b/drivers/mtd/ubi/io.c @@ -1297,7 +1297,7 @@ static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum, if (!ubi_dbg_chk_io(ubi)) return 0; - buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf1 = __vmalloc(len, GFP_NOFS); if (!buf1) { ubi_err(ubi, "cannot allocate memory to check writes"); return 0; @@ -1361,7 +1361,7 @@ int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len) if (!ubi_dbg_chk_io(ubi)) return 0; - buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(len, GFP_NOFS); if (!buf) { ubi_err(ubi, "cannot allocate memory to check for 0xFFs"); return 0; diff --git a/drivers/scsi/sd_zbc.c 
b/drivers/scsi/sd_zbc.c index f45c22b09726..8be27426aa66 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -136,8 +136,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, while (bufsize >= SECTOR_SIZE) { buf = __vmalloc(bufsize, - GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY, - PAGE_KERNEL); + GFP_KERNEL | __GFP_ZERO | __GFP_NORETRY); if (buf) { *buflen = bufsize; return buf; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3f7732415be..c0f2875c946c 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,7 +354,7 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); if (hc == NULL) - hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + hc = __vmalloc(hsize, GFP_NOFS); if (hc == NULL) return ERR_PTR(-ENOMEM); @@ -1166,7 +1166,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) hc2 = kmalloc_array(hsize_bytes, 2, GFP_NOFS | __GFP_NOWARN); if (hc2 == NULL) - hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS); if (!hc2) return -ENOMEM; @@ -1327,7 +1327,7 @@ static void *gfs2_alloc_sort_buffer(unsigned size) if (size < KMALLOC_MAX_SIZE) ptr = kmalloc(size, GFP_NOFS | __GFP_NOWARN); if (!ptr) - ptr = __vmalloc(size, GFP_NOFS, PAGE_KERNEL); + ptr = __vmalloc(size, GFP_NOFS); return ptr; } @@ -1987,8 +1987,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); if (ht == NULL) - ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO, - PAGE_KERNEL); + ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO); if (!ht) return -ENOMEM; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8259fef3f986..4b67d47a7e00 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1365,7 +1365,7 @@ int gfs2_quota_init(struct gfs2_sbd *sdp) sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | - __GFP_ZERO, PAGE_KERNEL); + __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index 7a57ff2528af..8f7cff7a4293 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -582,7 +582,7 @@ retry: if (!arg->layoutupdate_pages) return -ENOMEM; - start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + start_p = __vmalloc(buffer_size, GFP_NOFS); if (!start_p) { kfree(arg->layoutupdate_pages); return -ENOMEM; diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index 842b0bfc3ac9..7068425735f1 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -34,7 +34,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) /* return (void *)__get_free_page(gfp_mask); */ } if (likely((size >> PAGE_SHIFT) < totalram_pages())) - return __vmalloc(size, gfp_mask, PAGE_KERNEL); + return __vmalloc(size, gfp_mask); return NULL; } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 0f5a480fe264..31288d8fa2ce 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -815,7 +815,7 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index 29826c51883a..22bfda158f7f 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ 
-1095,7 +1095,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; } - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index ff5e0411cf2d..d76a19e460cd 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -1596,7 +1596,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) if (!dbg_is_chk_lprops(c)) return 0; - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; @@ -1845,7 +1845,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) void *buf, *p; pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); - buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = p = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to dump LPT"); return; diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 283f9eb48410..2c294085ffed 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -977,7 +977,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) if (c->no_orphs) return 0; - buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); + buf = __vmalloc(c->leb_size, GFP_NOFS); if (!buf) { ubifs_err(c, "cannot allocate memory to check orphans"); return 0; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 1da94237a8cf..f1366475c389 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -48,7 +48,7 @@ __kmem_vmalloc(size_t size, xfs_km_flags_t flags) if (flags & KM_NOFS) nofs_flag = memalloc_nofs_save(); - ptr = __vmalloc(size, lflags, PAGE_KERNEL); + ptr = __vmalloc(size, lflags); if (flags & KM_NOFS) memalloc_nofs_restore(nofs_flag); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 5488cea5ef11..1c278e030599 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -110,7 +110,7 @@ extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long size); extern void *vmalloc_32_user(unsigned long size); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); +extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 14aa1f74dd10..cf6fe9107f5c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -82,7 +82,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *fp; size = round_up(size, PAGE_SIZE); - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) return NULL; @@ -232,7 +232,7 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (ret) return NULL; - fp = __vmalloc(size, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(size, gfp_flags); if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { @@ -1089,7 +1089,7 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); + fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags); if (fp != 
NULL) { /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, diff --git a/kernel/groups.c b/kernel/groups.c index daae2f2dc6d4..6ee6691f6839 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -20,7 +20,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT); if (!gi) return NULL; diff --git a/kernel/module.c b/kernel/module.c index 646f1e2330d2..086618a0058f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2946,8 +2946,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return err; /* Suck in entire file: we'll want most of it. */ - info->hdr = __vmalloc(info->len, - GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL); + info->hdr = __vmalloc(info->len, GFP_KERNEL | __GFP_NOWARN); if (!info->hdr) return -ENOMEM; diff --git a/mm/nommu.c b/mm/nommu.c index 4f07b7ef0297..2df549adb22b 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -140,7 +140,7 @@ void vfree(const void *addr) } EXPORT_SYMBOL(vfree); -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { /* * You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc() @@ -152,14 +152,14 @@ EXPORT_SYMBOL(__vmalloc); void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags) { - return __vmalloc(size, flags, PAGE_KERNEL); + return __vmalloc(size, flags); } static void *__vmalloc_user_flags(unsigned long size, gfp_t flags) { void *ret; - ret = __vmalloc(size, flags, PAGE_KERNEL); + ret = __vmalloc(size, flags); if (ret) { struct vm_area_struct *vma; @@ -230,7 +230,7 @@ long vwrite(char *buf, char *addr, unsigned long count) */ void *vmalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); @@ -248,8 +248,7 @@ EXPORT_SYMBOL(vmalloc); */ void *vzalloc(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -302,7 +301,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM); } /** @@ -314,7 +313,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL); } EXPORT_SYMBOL(vmalloc_32); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cc406ee17ad9..45ad73122e82 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -8244,7 +8244,7 @@ void *__init alloc_large_system_hash(const char *tablename, table = memblock_alloc_raw(size, SMP_CACHE_BYTES); } else if (get_order(size) >= MAX_ORDER || hashdist) { - table = __vmalloc(size, gfp_flags, PAGE_KERNEL); + table = __vmalloc(size, gfp_flags); virt = true; } else { /* diff --git a/mm/percpu.c b/mm/percpu.c index 7da7d7737dab..696367b18222 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -482,7 +482,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) if (size <= PAGE_SIZE) return kzalloc(size, gfp); else - return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO); } /** diff --git 
a/mm/vmalloc.c b/mm/vmalloc.c index 4d7c7108181a..11194ae18f23 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2564,9 +2564,9 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_mask, prot, 0, node, caller); } -void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) +void *__vmalloc(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE, + return __vmalloc_node(size, 1, gfp_mask, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 78db58c7aec2..7e869284e052 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1095,16 +1095,14 @@ static int do_replace(struct net *net, const void __user *user, tmp.name[sizeof(tmp.name) - 1] = 0; countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; - newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo = __vmalloc(sizeof(*newinfo) + countersize, GFP_KERNEL_ACCOUNT); if (!newinfo) return -ENOMEM; if (countersize) memset(newinfo->counters, 0, countersize); - newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT, - PAGE_KERNEL); + newinfo->entries = __vmalloc(tmp.entries_size, GFP_KERNEL_ACCOUNT); if (!newinfo->entries) { ret = -ENOMEM; goto free_newinfo; diff --git a/sound/core/memalloc.c b/sound/core/memalloc.c index a83553fbedf0..bea46ed157a6 100644 --- a/sound/core/memalloc.c +++ b/sound/core/memalloc.c @@ -143,7 +143,7 @@ int snd_dma_alloc_pages(int type, struct device *device, size_t size, break; case SNDRV_DMA_TYPE_VMALLOC: gfp = snd_mem_get_gfp_flags(device, GFP_KERNEL | __GFP_HIGHMEM); - dmab->area = __vmalloc(size, gfp, PAGE_KERNEL); + dmab->area = __vmalloc(size, gfp); dmab->addr = 0; break; #ifdef CONFIG_HAS_DMA diff --git a/sound/core/pcm_memory.c b/sound/core/pcm_memory.c index fcab37ea6641..860935e3aea4 100644 --- a/sound/core/pcm_memory.c +++ b/sound/core/pcm_memory.c @@ -460,7 +460,7 @@ int _snd_pcm_lib_alloc_vmalloc_buffer(struct snd_pcm_substream *substream, return 0; /* already large enough */ vfree(runtime->dma_area); } - runtime->dma_area = __vmalloc(size, gfp_flags, PAGE_KERNEL); + runtime->dma_area = __vmalloc(size, gfp_flags); if (!runtime->dma_area) return -ENOMEM; runtime->dma_bytes = size; -- cgit v1.2.3 From 8e19843c36abae08e1e541a65ce53fd2e88499fc Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:29 -0700 Subject: x86/mm/64: implement arch_sync_kernel_mappings() Implement the function to sync changes in vmalloc and ioremap ranges to all page-tables. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. 
Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-5-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64_types.h | 2 ++ arch/x86/mm/init_64.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 52e5f5f2240d..8f63efb2a2cc 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -159,4 +159,6 @@ extern unsigned int ptrs_per_p4d; #define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t)) +#define ARCH_PAGE_TABLE_SYNC_MASK (pgtable_l5_enabled() ? PGTBL_PGD_MODIFIED : PGTBL_P4D_MODIFIED) + #endif /* _ASM_X86_PGTABLE_64_DEFS_H */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8b5f73f5e207..96274a90c5ff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -218,6 +218,11 @@ void sync_global_pgds(unsigned long start, unsigned long end) sync_global_pgds_l4(start, end); } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + sync_global_pgds(start, end); +} + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. -- cgit v1.2.3 From 86cf69f1d893d48fdb0382a940f2523409406423 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:33 -0700 Subject: x86/mm/32: implement arch_sync_kernel_mappings() Implement the function to sync changes in vmalloc and ioremap ranges to all page-tables. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-6-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level_types.h | 2 ++ arch/x86/include/asm/pgtable-3level_types.h | 2 ++ arch/x86/mm/fault.c | 25 ++++++++++++++++--------- 3 files changed, 20 insertions(+), 9 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 6deb6cd236e3..7f6ccff0ba72 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -20,6 +20,8 @@ typedef union { #define SHARED_KERNEL_PMD 0 +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED + /* * traditional i386 two-level paging structure: */ diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index 33845d36897c..80fbb4a9ed87 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -27,6 +27,8 @@ typedef union { #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) #endif +#define ARCH_PAGE_TABLE_SYNC_MASK (SHARED_KERNEL_PMD ? 
0 : PGTBL_PMD_MODIFIED) + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a51df516b87b..edeb2adaf31f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -190,16 +190,13 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) return pmd_k; } -static void vmalloc_sync(void) +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) { - unsigned long address; - - if (SHARED_KERNEL_PMD) - return; + unsigned long addr; - for (address = VMALLOC_START & PMD_MASK; - address >= TASK_SIZE_MAX && address < VMALLOC_END; - address += PMD_SIZE) { + for (addr = start & PMD_MASK; + addr >= TASK_SIZE_MAX && addr < VMALLOC_END; + addr += PMD_SIZE) { struct page *page; spin_lock(&pgd_lock); @@ -210,13 +207,23 @@ static void vmalloc_sync(void) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - vmalloc_sync_one(page_address(page), address); + vmalloc_sync_one(page_address(page), addr); spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); } } +static void vmalloc_sync(void) +{ + unsigned long address; + + if (SHARED_KERNEL_PMD) + return; + + arch_sync_kernel_mappings(VMALLOC_START, VMALLOC_END); +} + void vmalloc_sync_mappings(void) { vmalloc_sync(); -- cgit v1.2.3 From 7f0a002b5a21302d9f4b29ba83c96cd433ff3769 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 1 Jun 2020 21:52:40 -0700 Subject: x86/mm: remove vmalloc faulting Remove fault handling on vmalloc areas, as the vmalloc code now takes care of synchronizing changes to all page-tables in the system. Signed-off-by: Joerg Roedel Signed-off-by: Andrew Morton Acked-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: Christoph Hellwig Cc: Dave Hansen Cc: "H . Peter Anvin" Cc: Ingo Molnar Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: "Rafael J. Wysocki" Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Cc: Vlastimil Babka Link: http://lkml.kernel.org/r/20200515140023.25469-8-joro@8bytes.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/switch_to.h | 23 ------- arch/x86/kernel/setup_percpu.c | 6 +- arch/x86/mm/fault.c | 134 --------------------------------------- arch/x86/mm/pti.c | 8 +-- arch/x86/mm/tlb.c | 37 ----------- 5 files changed, 4 insertions(+), 204 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 0e059b73437b..9f69cc497f4b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,27 +12,6 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -/* This runs runs on the previous thread's stack. */ -static inline void prepare_switch_to(struct task_struct *next) -{ -#ifdef CONFIG_VMAP_STACK - /* - * If we switch to a stack that has a top-level paging entry - * that is not present in the current mm, the resulting #PF will - * will be promoted to a double-fault and we'll panic. Probe - * the new stack now so that vmalloc_fault can fix up the page - * tables if needed. This can only happen if we use a stack - * in vmap space. - * - * We assume that the stack is aligned so that it never spans - * more than one top-level paging entry. - * - * To minimize cache pollution, just follow the stack pointer. 
- */ - READ_ONCE(*(unsigned char *)next->thread.sp); -#endif -} - asmlinkage void ret_from_fork(void); /* @@ -67,8 +46,6 @@ struct fork_frame { #define switch_to(prev, next, last) \ do { \ - prepare_switch_to(next); \ - \ ((last) = __switch_to_asm((prev), (next))); \ } while (0) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index e6d7894ad127..fd945ce78554 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void) /* * Sync back kernel address range again. We already did this in * setup_arch(), but percpu data also needs to be available in - * the smpboot asm. We can't reliably pick up percpu mappings - * using vmalloc_fault(), because exception dispatch needs - * percpu data. + * the smpboot asm and arch_sync_kernel_mappings() doesn't sync to + * swapper_pg_dir on 32-bit. The per-cpu mappings need to be available + * there too. * * FIXME: Can the later sync in setup_cpu_entry_areas() replace * this call? diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 255fc631b042..dffe8e4d3140 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -214,44 +214,6 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } -/* - * 32-bit: - * - * Handle a fault on the vmalloc or module mapping area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - unsigned long pgd_paddr; - pmd_t *pmd_k; - pte_t *pte_k; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Synchronize this task's top level page-table - * with the 'reference' page table. - * - * Do _not_ use "current" here. We might be inside - * an interrupt in the middle of a task switch.. - */ - pgd_paddr = read_cr3_pa(); - pmd_k = vmalloc_sync_one(__va(pgd_paddr), address); - if (!pmd_k) - return -1; - - if (pmd_large(*pmd_k)) - return 0; - - pte_k = pte_offset_kernel(pmd_k, address); - if (!pte_present(*pte_k)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - /* * Did it hit the DOS screen memory VA from vm86 mode? */ @@ -316,79 +278,6 @@ out: #else /* CONFIG_X86_64: */ -/* - * 64-bit: - * - * Handle a fault on the vmalloc area - */ -static noinline int vmalloc_fault(unsigned long address) -{ - pgd_t *pgd, *pgd_k; - p4d_t *p4d, *p4d_k; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* Make sure we are in vmalloc area: */ - if (!(address >= VMALLOC_START && address < VMALLOC_END)) - return -1; - - /* - * Copy kernel mappings over when needed. This can also - * happen within a race in page table update. In the later - * case just flush: - */ - pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address); - pgd_k = pgd_offset_k(address); - if (pgd_none(*pgd_k)) - return -1; - - if (pgtable_l5_enabled()) { - if (pgd_none(*pgd)) { - set_pgd(pgd, *pgd_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_k)); - } - } - - /* With 4-level paging, copying happens on the p4d level. 
*/ - p4d = p4d_offset(pgd, address); - p4d_k = p4d_offset(pgd_k, address); - if (p4d_none(*p4d_k)) - return -1; - - if (p4d_none(*p4d) && !pgtable_l5_enabled()) { - set_p4d(p4d, *p4d_k); - arch_flush_lazy_mmu_mode(); - } else { - BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_k)); - } - - BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4); - - pud = pud_offset(p4d, address); - if (pud_none(*pud)) - return -1; - - if (pud_large(*pud)) - return 0; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - return -1; - - if (pmd_large(*pmd)) - return 0; - - pte = pte_offset_kernel(pmd, address); - if (!pte_present(*pte)) - return -1; - - return 0; -} -NOKPROBE_SYMBOL(vmalloc_fault); - #ifdef CONFIG_CPU_SUP_AMD static const char errata93_warning[] = KERN_ERR @@ -1227,29 +1116,6 @@ do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, */ WARN_ON_ONCE(hw_error_code & X86_PF_PK); - /* - * We can fault-in kernel-space virtual memory on-demand. The - * 'reference' page table is init_mm.pgd. - * - * NOTE! We MUST NOT take any locks for this case. We may - * be in an interrupt or a critical region, and should - * only copy the information from the master page table, - * nothing more. - * - * Before doing this on-demand faulting, ensure that the - * fault is not any of the following: - * 1. A fault on a PTE with a reserved bit set. - * 2. A fault caused by a user-mode access. (Do not demand- - * fault kernel memory due to user-mode accesses). - * 3. A fault caused by a page-level protection violation. - * (A demand fault would be on a non-present page which - * would have X86_PF_PROT==0). - */ - if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { - if (vmalloc_fault(address) >= 0) - return; - } - /* Was the fault spurious, caused by lazy TLB invalidation? */ if (spurious_kernel_fault(hw_error_code, address)) return; diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 843aa10a4cb6..da0fb17a1a36 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -448,13 +448,7 @@ static void __init pti_clone_user_shared(void) * the sp1 and sp2 slots. * * This is done for all possible CPUs during boot to ensure - * that it's propagated to all mms. If we were to add one of - * these mappings during CPU hotplug, we would need to take - * some measure to make sure that every mm that subsequently - * ran on that CPU would have the relevant PGD entry in its - * pagetables. The usual vmalloc_fault() mechanism would not - * work for page faults taken in entry_SYSCALL_64 before RSP - * is set up. + * that it's propagated to all mms. */ unsigned long va = (unsigned long)&per_cpu(cpu_tss_rw, cpu); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 66f96f21a7b6..f3fe261e5936 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -161,34 +161,6 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next, local_irq_restore(flags); } -static void sync_current_stack_to_mm(struct mm_struct *mm) -{ - unsigned long sp = current_stack_pointer; - pgd_t *pgd = pgd_offset(mm, sp); - - if (pgtable_l5_enabled()) { - if (unlikely(pgd_none(*pgd))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - - set_pgd(pgd, *pgd_ref); - } - } else { - /* - * "pgd" is faked. The top level entries are "p4d"s, so sync - * the p4d. This compiles to approximately the same code as - * the 5-level case. 
- */ - p4d_t *p4d = p4d_offset(pgd, sp); - - if (unlikely(p4d_none(*p4d))) { - pgd_t *pgd_ref = pgd_offset_k(sp); - p4d_t *p4d_ref = p4d_offset(pgd_ref, sp); - - set_p4d(p4d, *p4d_ref); - } - } -} - static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next) { unsigned long next_tif = task_thread_info(next)->flags; @@ -377,15 +349,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, */ cond_ibpb(tsk); - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - /* - * If our current stack is in vmalloc space and isn't - * mapped in the new pgd, we'll double-fault. Forcibly - * map it. - */ - sync_current_stack_to_mm(next); - } - /* * Stop remote flushes for the previous mm. * Skip kernel threads; we never send init_mm TLB flushing IPIs, -- cgit v1.2.3 From 86977da9cb71c8293263c630ac920dd1537de9e5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Feb 2020 19:39:36 -0500 Subject: TEST_ACCESS_OK _never_ had been checked anywhere Once upon a time the predecessor of that thing (TEST_VERIFY_AREA) used to be. However, that had been gone for years now (and the patch that introduced TEST_ACCESS_OK has not touched any ifdefs - they got gradually removed later). Just bury it... Signed-off-by: Al Viro --- arch/x86/include/asm/pgtable_32.h | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index 0dca7f7aeff2..fb10f2f8f4f0 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -32,13 +32,6 @@ extern pmd_t initial_pg_pmd[]; void paging_init(void); void sync_initial_page_table(void); -/* - * Define this if things work differently on an i386 and an i486: - * it will (on an i486) warn about kernel memory accesses that are - * done without a 'access_ok( ..)' - */ -#undef TEST_ACCESS_OK - #ifdef CONFIG_X86_PAE # include #else -- cgit v1.2.3 From b0eae98c66fe4ccac3a5a79a1479c057f2c7cdd7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Jun 2020 16:01:01 -0700 Subject: mm/hugetlb: define a generic fallback for is_hugepage_only_range() There are multiple similar definitions for is_hugepage_only_range() on various platforms. Lets just add it's generic fallback definition for platforms that do not override. This help reduce code duplication. Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Acked-by: Mike Kravetz Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Link: http://lkml.kernel.org/r/1588907271-11920-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 6 ------ arch/arm64/include/asm/hugetlb.h | 6 ------ arch/ia64/include/asm/hugetlb.h | 1 + arch/mips/include/asm/hugetlb.h | 7 ------- arch/parisc/include/asm/hugetlb.h | 6 ------ arch/powerpc/include/asm/hugetlb.h | 1 + arch/riscv/include/asm/hugetlb.h | 6 ------ arch/s390/include/asm/hugetlb.h | 7 ------- arch/sh/include/asm/hugetlb.h | 6 ------ arch/sparc/include/asm/hugetlb.h | 6 ------ arch/x86/include/asm/hugetlb.h | 6 ------ include/linux/hugetlb.h | 9 +++++++++ 12 files changed, 11 insertions(+), 56 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 318dcf5921ab..9ecd516d1ff7 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -14,12 +14,6 @@ #include #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index b88878ddc88b..8f58e052697a 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -17,12 +17,6 @@ extern bool arch_hugetlb_migration_supported(struct hstate *h); #endif -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, unsigned long len) -{ - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 36cc0396b214..6ef50b9a4bdf 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -20,6 +20,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return (REGION_NUMBER(addr) == RGN_HPAGE || REGION_NUMBER((addr)+(len)-1) == RGN_HPAGE); } +#define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 425bb6fc3bda..8b201e281f67 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -11,13 +11,6 @@ #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return 0; -} - #define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 7cb595dcb7d7..411d9d867baa 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -12,12 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index bd6504c28c2f..b167c869d72d 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -30,6 +30,7 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, return slice_is_hugepage_only_range(mm, addr, len); return 0; } +#define is_hugepage_only_range is_hugepage_only_range #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 728a5db66597..866f6ae6467c 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -5,12 +5,6 @@ #include #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index de8f0bf5f238..7d27ea96ec2f 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -21,13 +21,6 @@ pte_t huge_ptep_get(pte_t *ptep); pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline bool is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) -{ - return false; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 6f025fe18146..536ad2cb8aa4 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -5,12 +5,6 @@ #include #include -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. 
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index 3963f80d1cb3..a056fe1119f5 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -20,12 +20,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index f65cfb48cfdd..cc98f79074d0 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -7,12 +7,6 @@ #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) -static inline int is_hugepage_only_range(struct mm_struct *mm, - unsigned long addr, - unsigned long len) { - return 0; -} - static inline void arch_clear_hugepage_flags(struct page *page) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0c13706054ef..bf97b17ab206 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -591,6 +591,15 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h) #include +#ifndef is_hugepage_only_range +static inline int is_hugepage_only_range(struct mm_struct *mm, + unsigned long addr, unsigned long len) +{ + return 0; +} +#define is_hugepage_only_range is_hugepage_only_range +#endif + #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) -- cgit v1.2.3 From 5be993432821750f382809df5e20bf4c129b24f7 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Jun 2020 16:01:05 -0700 Subject: mm/hugetlb: define a generic fallback for arch_clear_hugepage_flags() There are multiple similar definitions for arch_clear_hugepage_flags() on various platforms. Let's just add its generic fallback definition for platforms that do not override it. This helps reduce code duplication. Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Acked-by: Mike Kravetz Cc: Russell King Cc: Catalin Marinas Cc: Will Deacon Cc: Tony Luck Cc: Fenghua Yu Cc: Thomas Bogendoerfer Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H.
Peter Anvin" Link: http://lkml.kernel.org/r/1588907271-11920-4-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/arm/include/asm/hugetlb.h | 1 + arch/arm64/include/asm/hugetlb.h | 1 + arch/ia64/include/asm/hugetlb.h | 4 ---- arch/mips/include/asm/hugetlb.h | 4 ---- arch/parisc/include/asm/hugetlb.h | 4 ---- arch/powerpc/include/asm/hugetlb.h | 4 ---- arch/riscv/include/asm/hugetlb.h | 4 ---- arch/s390/include/asm/hugetlb.h | 1 + arch/sh/include/asm/hugetlb.h | 1 + arch/sparc/include/asm/hugetlb.h | 4 ---- arch/x86/include/asm/hugetlb.h | 4 ---- include/linux/hugetlb.h | 5 +++++ 12 files changed, 9 insertions(+), 28 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h index 9ecd516d1ff7..d02d6ca88e92 100644 --- a/arch/arm/include/asm/hugetlb.h +++ b/arch/arm/include/asm/hugetlb.h @@ -18,5 +18,6 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags #endif /* _ASM_ARM_HUGETLB_H */ diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 8f58e052697a..94ba0c5bced2 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -21,6 +21,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags extern pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable); diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h index 6ef50b9a4bdf..7e46ebde8c0c 100644 --- a/arch/ia64/include/asm/hugetlb.h +++ b/arch/ia64/include/asm/hugetlb.h @@ -28,10 +28,6 @@ static inline void huge_ptep_clear_flush(struct vm_area_struct *vma, { } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #endif /* _ASM_IA64_HUGETLB_H */ diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index 8b201e281f67..10e3be870df7 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -75,10 +75,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #endif /* __ASM_HUGETLB_H */ diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 411d9d867baa..a69cf9efb0c1 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -42,10 +42,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #endif /* _ASM_PARISC64_HUGETLB_H */ diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h index b167c869d72d..e6dfa63da552 100644 --- a/arch/powerpc/include/asm/hugetlb.h +++ b/arch/powerpc/include/asm/hugetlb.h @@ -61,10 +61,6 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty); -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #include #else /* ! 
CONFIG_HUGETLB_PAGE */ diff --git a/arch/riscv/include/asm/hugetlb.h b/arch/riscv/include/asm/hugetlb.h index 866f6ae6467c..a5c2ca1d1cd8 100644 --- a/arch/riscv/include/asm/hugetlb.h +++ b/arch/riscv/include/asm/hugetlb.h @@ -5,8 +5,4 @@ #include #include -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #endif /* _ASM_RISCV_HUGETLB_H */ diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 7d27ea96ec2f..9ddf4a43a590 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -39,6 +39,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_arch_1, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 536ad2cb8aa4..ae4de7b89210 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -30,6 +30,7 @@ static inline void arch_clear_hugepage_flags(struct page *page) { clear_bit(PG_dcache_clean, &page->flags); } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags #include diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h index a056fe1119f5..53838a173f62 100644 --- a/arch/sparc/include/asm/hugetlb.h +++ b/arch/sparc/include/asm/hugetlb.h @@ -47,10 +47,6 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #define __HAVE_ARCH_HUGETLB_FREE_PGD_RANGE void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h index cc98f79074d0..1721b1aadeb1 100644 --- a/arch/x86/include/asm/hugetlb.h +++ b/arch/x86/include/asm/hugetlb.h @@ -7,8 +7,4 @@ #define hugepages_supported() boot_cpu_has(X86_FEATURE_PSE) -static inline void arch_clear_hugepage_flags(struct page *page) -{ -} - #endif /* _ASM_X86_HUGETLB_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index bf97b17ab206..2e66b71c1236 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -600,6 +600,11 @@ static inline int is_hugepage_only_range(struct mm_struct *mm, #define is_hugepage_only_range is_hugepage_only_range #endif +#ifndef arch_clear_hugepage_flags +static inline void arch_clear_hugepage_flags(struct page *page) { } +#define arch_clear_hugepage_flags arch_clear_hugepage_flags +#endif + #ifndef arch_make_huge_pte static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) -- cgit v1.2.3 From 86ec2da037b85436b63afe3df43ed48fa0e52b0e Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 3 Jun 2020 16:03:45 -0700 Subject: mm/thp: rename pmd_mknotpresent() as pmd_mkinvalid() pmd_present() is expected to test positive after pmdp_mknotpresent() as the PMD entry still points to a valid huge page in memory. pmdp_mknotpresent() implies that given PMD entry is just invalidated from MMU perspective while still holding on to pmd_page() referred valid huge page thus also clearing pmd_present() test. This creates the following situation which is counter intuitive. 
[pmd_present(pmd_mknotpresent(pmd)) = true] This renames pmd_mknotpresent() as pmd_mkinvalid() reflecting the helper's functionality more accurately while changing the above mentioned situation as follows. This does not create any functional change. [pmd_present(pmd_mkinvalid(pmd)) = true] This is not applicable for platforms that define own pmdp_invalidate() via __HAVE_ARCH_PMDP_INVALIDATE. Suggestion for renaming came during a previous discussion here. https://patchwork.kernel.org/patch/11019637/ [anshuman.khandual@arm.com: change pmd_mknotvalid() to pmd_mkinvalid() per Will] Link: http://lkml.kernel.org/r/1587520326-10099-3-git-send-email-anshuman.khandual@arm.com Suggested-by: Catalin Marinas Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Acked-by: Will Deacon Cc: Vineet Gupta Cc: Russell King Cc: Catalin Marinas Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Steven Rostedt Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Paul Mackerras Link: http://lkml.kernel.org/r/1584680057-13753-3-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/hugepage.h | 2 +- arch/arm/include/asm/pgtable-3level.h | 2 +- arch/arm64/include/asm/pgtable.h | 2 +- arch/mips/include/asm/pgtable.h | 2 +- arch/x86/include/asm/pgtable.h | 2 +- arch/x86/mm/kmmio.c | 2 +- mm/pgtable-generic.c | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 30ac40fed2c5..4eef17c5c1da 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -26,7 +26,7 @@ static inline pmd_t pte_pmd(pte_t pte) #define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) #define pmd_mkhuge(pmd) pte_pmd(pte_mkhuge(pmd_pte(pmd))) -#define pmd_mknotpresent(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd))) +#define pmd_mkinvalid(pmd) pte_pmd(pte_mknotpresent(pmd_pte(pmd))) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_write(pmd) pte_write(pmd_pte(pmd)) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 36805f94939e..1933aed9f68d 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -221,7 +221,7 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pmdp_establish generic_pmdp_establish /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */ -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return __pmd(pmd_val(pmd) & ~L_PMD_SECT_VALID); } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index dae0466d19d6..9ce000f22d9e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -366,7 +366,7 @@ static inline int pmd_protnone(pmd_t pmd) #define pmd_mkclean(pmd) pte_pmd(pte_mkclean(pmd_pte(pmd))) #define pmd_mkdirty(pmd) pte_pmd(pte_mkdirty(pmd_pte(pmd))) #define pmd_mkyoung(pmd) pte_pmd(pte_mkyoung(pmd_pte(pmd))) -#define pmd_mknotpresent(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) +#define pmd_mkinvalid(pmd) (__pmd(pmd_val(pmd) & ~PMD_SECT_VALID)) #define pmd_thp_or_huge(pmd) (pmd_huge(pmd) || pmd_trans_huge(pmd)) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index f1801e7a4b15..bfbab6652e20 100644 --- 
a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -631,7 +631,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pmd; } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t pmd_mkinvalid(pmd_t pmd) { pmd_val(pmd) &= ~(_PAGE_PRESENT | _PAGE_VALID | _PAGE_DIRTY); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4d02e64af1b3..f51d8997ed00 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -624,7 +624,7 @@ static inline pud_t pfn_pud(unsigned long page_nr, pgprot_t pgprot) return __pud(pfn | check_pgprot(pgprot)); } -static inline pmd_t pmd_mknotpresent(pmd_t pmd) +static inline pmd_t pmd_mkinvalid(pmd_t pmd) { return pfn_pmd(pmd_pfn(pmd), __pgprot(pmd_flags(pmd) & ~(_PAGE_PRESENT|_PAGE_PROTNONE))); diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 9994353fb75d..22bae5828c3d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -130,7 +130,7 @@ static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) pmdval_t v = pmd_val(*pmd); if (clear) { *old = v; - new_pmd = pmd_mknotpresent(*pmd); + new_pmd = pmd_mkinvalid(*pmd); } else { /* Presume this has been called with clear==true previously */ new_pmd = __pmd(*old); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3d7c01e76efc..d18f0e1b6792 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return old; } -- cgit v1.2.3 From 5bfea2d9b17f1034a68147a8b03b9789af5700f9 Mon Sep 17 00:00:00 2001 From: Fan Yang Date: Thu, 4 Jun 2020 18:22:07 +0800 Subject: mm: Fix mremap not considering huge pmd devmap The original code in mm/mremap.c checks huge pmd by: if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { However, a DAX mapped nvdimm is mapped as a huge page (by default) but it is not a transparent huge page (_PAGE_PSE | _PAGE_DEVMAP). This commit changes the condition to include that case. This addresses CVE-2020-10757. Fixes: 5c7fb56e5e3f ("mm, dax: dax-pmd vs thp-pmd vs hugetlbfs-pmd") Cc: Reported-by: Fan Yang Signed-off-by: Fan Yang Tested-by: Fan Yang Tested-by: Dan Williams Reviewed-by: Dan Williams Acked-by: Kirill A.
Shutemov Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 1 + mm/mremap.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f51d8997ed00..b8f46bbe69f4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -257,6 +257,7 @@ static inline int pmd_large(pmd_t pte) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* NOTE: when predicate huge page, consider also pmd_devmap, or use pmd_large */ static inline int pmd_trans_huge(pmd_t pmd) { return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; diff --git a/mm/mremap.c b/mm/mremap.c index 6aa6ea605068..57b1f999f789 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -266,7 +266,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; - if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) { + if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { if (extent == HPAGE_PMD_SIZE) { bool moved; /* See comment in move_ptes() */ -- cgit v1.2.3 From 8898ad58a0195e2d3f5ffb770fbe9c9dc978a171 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 4 Jun 2020 16:47:12 -0700 Subject: x86/mm: define mm_p4d_folded() Patch series "mm/debug: Add tests validating architecture page table helpers", v18. This adds a test validation for architecture exported page table helpers. Patch adds basic transformation tests at various levels of the page table. This test was originally suggested by Catalin during arm64 THP migration RFC discussion earlier. Going forward it can include more specific tests with respect to various generic MM functions like THP, HugeTLB etc and platform specific tests. https://lore.kernel.org/linux-mm/20190628102003.GA56463@arrakis.emea.arm.com/ This patch (of 2): This just defines mm_p4d_folded() to check whether the P4D page table level is folded at runtime. Signed-off-by: Kirill A. Shutemov Signed-off-by: Anshuman Khandual Signed-off-by: Andrew Morton Cc: Thomas Gleixner Cc: Ingo Molnar Link: http://lkml.kernel.org/r/1587436495-22033-2-git-send-email-anshuman.khandual@arm.com Link: http://lkml.kernel.org/r/1588564865-31160-2-git-send-email-anshuman.khandual@arm.com Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index df1373415f11..8d03ffd43794 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -53,6 +53,12 @@ static inline void sync_initial_page_table(void) { } struct mm_struct; +#define mm_p4d_folded mm_p4d_folded +static inline bool mm_p4d_folded(struct mm_struct *mm) +{ + return !pgtable_l5_enabled(); +} + void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); -- cgit v1.2.3 From 525aaf9bad00e7454b9f9b3873e92795afb59f8e Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:30 -0700 Subject: arch/kmap: remove redundant arch specific kmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The kmap code for all the architectures is almost 100% identical. Lift the common code to the core. Use ARCH_HAS_KMAP_FLUSH_TLB to indicate if an arch defines kmap_flush_tlb() and call it if needed.
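For reference, the consolidated helper this patch adds to include/linux/highmem.h looks like the following (condensed from the include/linux/highmem.h hunk further down in this patch):

    #ifndef ARCH_HAS_KMAP_FLUSH_TLB
    static inline void kmap_flush_tlb(unsigned long addr) { }
    #endif

    void *kmap_high(struct page *page);
    static inline void *kmap(struct page *page)
    {
            void *addr;

            might_sleep();
            if (!PageHighMem(page))
                    addr = page_address(page);
            else
                    addr = kmap_high(page);
            kmap_flush_tlb((unsigned long)addr);
            return addr;
    }

Architectures that need a TLB flush after mapping (csky and mips in this patch) define ARCH_HAS_KMAP_FLUSH_TLB and keep only kmap_flush_tlb(); the rest rely on the empty stub.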
This also has the benefit of changing kmap() on a number of architectures to be an inline call rather than an actual function. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-4-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 2 -- arch/arc/mm/highmem.c | 10 ---------- arch/arm/include/asm/highmem.h | 2 -- arch/arm/mm/highmem.c | 9 --------- arch/csky/include/asm/highmem.h | 4 ++-- arch/csky/mm/highmem.c | 14 ++++---------- arch/microblaze/include/asm/highmem.h | 9 --------- arch/mips/include/asm/highmem.h | 4 ++-- arch/mips/mm/highmem.c | 14 +++----------- arch/nds32/include/asm/highmem.h | 2 -- arch/nds32/mm/highmem.c | 12 ------------ arch/powerpc/include/asm/highmem.h | 9 --------- arch/sparc/include/asm/highmem.h | 9 --------- arch/x86/include/asm/highmem.h | 2 -- arch/x86/mm/highmem_32.c | 9 --------- arch/xtensa/include/asm/highmem.h | 9 --------- include/linux/highmem.h | 18 ++++++++++++++++++ 17 files changed, 29 insertions(+), 109 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 042e92921c4c..96eb67c86961 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,8 +30,6 @@ #include -extern void *kmap(struct page *page); -extern void *kmap_high(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void kunmap_high(struct page *page); diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 39ef7b9a3aa9..4db13a6b9f3b 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,16 +49,6 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void *kmap_atomic(struct page *page) { int idx, cpu_idx; diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index eb4e4207cd3c..c917522541de 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -20,7 +20,6 @@ extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); /* @@ -63,7 +62,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index cc6eb79ef20c..e8ba37c36590 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,15 +31,6 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index a345a2f2c22e..9d0516e38110 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,10 +30,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap(struct page *page); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 690d678649d1..4a3c273bc8b9 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -13,18 +13,12 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } +EXPORT_SYMBOL(kmap_flush_tlb); + EXPORT_SYMBOL(kmap); void kunmap(struct page *page) diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 99ced7278b5c..8c5bfd228bd8 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,19 +51,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 9d84aafc33d0..1f741e3ecabf 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,10 +46,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * kmap_high(struct page *page); extern void kunmap_high(struct page *page); -extern void *kmap(struct page *page); +#define ARCH_HAS_KMAP_FLUSH_TLB +extern void kmap_flush_tlb(unsigned long addr); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index edd889f6cede..c72058bfead6 100644 --- 
a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -12,19 +12,11 @@ static pte_t *kmap_pte; unsigned long highstart_pfn, highend_pfn; -void *kmap(struct page *page) +void kmap_flush_tlb(unsigned long addr) { - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - addr = kmap_high(page); - flush_tlb_one((unsigned long)addr); - - return addr; + flush_tlb_one(addr); } -EXPORT_SYMBOL(kmap); +EXPORT_SYMBOL(kmap_flush_tlb); void kunmap(struct page *page) { diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b3a82c97ded3..b13654a79069 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -44,7 +44,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void kmap_init(void); @@ -54,7 +53,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap(struct page *page); extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index 4c7c28e994ea..d0cde53b84ae 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,18 +10,6 @@ #include #include -void *kmap(struct page *page) -{ - unsigned long vaddr; - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - vaddr = (unsigned long)kmap_high(page); - return (void *)vaddr; -} - -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 529512f6d65a..f14e4feef6d5 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,19 +59,10 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 7dd2d4b3f980..2ff1192047f7 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,17 +50,8 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_high(struct page *page); void kunmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index a8059930056d..c916a28a9738 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,10 +58,8 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_high(struct page *page); extern void kunmap_high(struct page *page); -void *kmap(struct page *page); void kunmap(struct page *page); void 
*kmap_atomic_prot(struct page *page, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 8af66382672b..12591a81b85c 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,15 +4,6 @@ #include /* for totalram_pages */ #include -void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} -EXPORT_SYMBOL(kmap); - void kunmap(struct page *page) { might_sleep(); diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index a9587c85be85..2546b88ddecf 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,17 +63,8 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) extern pte_t *pkmap_page_table; -void *kmap_high(struct page *page); void kunmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_high(page); -} - static inline void kunmap(struct page *page) { might_sleep(); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index ea5cdbd8c2c3..fc3adc51254a 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -34,6 +34,24 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifdef CONFIG_HIGHMEM #include +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +void *kmap_high(struct page *page); +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; -- cgit v1.2.3 From e23c45976f82ac789469c37e4d5a72ea2ce30bba Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:34 -0700 Subject: arch/kunmap: remove duplicate kunmap implementations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All architectures do exactly the same thing for kunmap(); remove all the duplicate definitions and lift the call to the core. This also has the benefit of changing kmap_unmap() on a number of architectures to be an inline call rather than an actual function. [akpm@linux-foundation.org: fix CONFIG_HIGHMEM=n build on various architectures] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-5-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 10 ---------- arch/arm/include/asm/highmem.h | 3 --- arch/arm/mm/highmem.c | 9 --------- arch/csky/include/asm/highmem.h | 3 --- arch/csky/mm/highmem.c | 9 --------- arch/microblaze/include/asm/highmem.h | 9 --------- arch/mips/include/asm/highmem.h | 3 --- arch/mips/mm/highmem.c | 9 --------- arch/nds32/include/asm/highmem.h | 3 --- arch/nds32/mm/highmem.c | 10 ---------- arch/powerpc/include/asm/highmem.h | 9 --------- arch/sparc/include/asm/highmem.h | 10 ---------- arch/x86/include/asm/highmem.h | 4 ---- arch/x86/mm/highmem_32.c | 9 --------- arch/xtensa/include/asm/highmem.h | 10 ---------- include/linux/highmem.h | 14 ++++++++++++++ 16 files changed, 14 insertions(+), 110 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 96eb67c86961..8387a5596a91 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -32,7 +32,6 @@ extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); -extern void kunmap_high(struct page *page); extern void kmap_init(void); @@ -41,15 +40,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - - #endif #endif diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index c917522541de..736f65283e7b 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -20,8 +20,6 @@ extern pte_t *pkmap_page_table; -extern void kunmap_high(struct page *page); - /* * The reason for kmap_high_get() is to ensure that the currently kmap'd * page usage count does not decrease to zero while we're using its @@ -62,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index e8ba37c36590..c700b32350ee 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,15 +31,6 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned int idx; diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 9d0516e38110..be11c5b67122 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -30,11 +30,8 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 4a3c273bc8b9..e9952211264b 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,15 +21,6 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned long vaddr; diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 8c5bfd228bd8..0c94046f2d58 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,18 +51,9 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 1f741e3ecabf..24e7e7e5cc7b 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -46,11 +46,8 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index c72058bfead6..eb8ec8493f2f 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,15 +18,6 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); 
- /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index b13654a79069..c93c7368bb3f 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -44,8 +44,6 @@ extern unsigned long highstart_pfn, highend_pfn; extern pte_t *pkmap_page_table; -extern void kunmap_high(struct page *page); - extern void kmap_init(void); /* @@ -53,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void kunmap(struct page *page); extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index d0cde53b84ae..f9348bec0ecb 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,16 +10,6 @@ #include #include -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - -EXPORT_SYMBOL(kunmap); - void *kmap_atomic(struct page *page) { unsigned int idx; diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index f14e4feef6d5..ba3371977d49 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,18 +59,9 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); extern void __kunmap_atomic(void *kvaddr); -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void *kmap_atomic(struct page *page) { return kmap_atomic_prot(page, kmap_prot); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 2ff1192047f7..4bdb79fed02c 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,16 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index c916a28a9738..90b96594d6c5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,10 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void kunmap_high(struct page *page); - -void kunmap(struct page *page); - void *kmap_atomic_prot(struct page *page, pgprot_t prot); void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 12591a81b85c..c4ebfd0ae401 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,15 +4,6 @@ #include /* for totalram_pages */ #include -void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} -EXPORT_SYMBOL(kunmap); - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is 
needed and because the kmap code must perform a global TLB diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 2546b88ddecf..5a481f7def0b 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -63,16 +63,6 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) extern pte_t *pkmap_page_table; -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} - static inline void flush_cache_kmaps(void) { flush_cache_all(); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index fc3adc51254a..216a647ed7db 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -52,6 +52,16 @@ static inline void *kmap(struct page *page) return addr; } +void kunmap_high(struct page *page); + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; @@ -102,6 +112,10 @@ static inline void *kmap(struct page *page) return page_address(page); } +static inline void kunmap_high(struct page *page) +{ +} + static inline void kunmap(struct page *page) { } -- cgit v1.2.3 From ee9bc5fdf5b6d24875fc55d43d5a0728bc2add21 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:38 -0700 Subject: {x86,powerpc,microblaze}/kmap: move preempt disable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During this kmap() conversion series we must maintain bisect-ability. To do this, kmap_atomic_prot() in x86, powerpc, and microblaze need to remain functional. Create a temporary inline version of kmap_atomic_prot within these architectures so we can rework their kmap_atomic() calls and then lift kmap_atomic_prot() to the core. Suggested-by: Al Viro Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-6-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/microblaze/include/asm/highmem.h | 11 ++++++++++- arch/microblaze/mm/highmem.c | 10 ++-------- arch/powerpc/include/asm/highmem.h | 11 ++++++++++- arch/powerpc/mm/highmem.c | 9 ++------- arch/x86/include/asm/highmem.h | 11 ++++++++++- arch/x86/mm/highmem_32.c | 10 ++-------- 6 files changed, 36 insertions(+), 26 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 0c94046f2d58..c38d920a1171 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,7 +51,16 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + return kmap_atomic_high_prot(page, prot); +} extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic(struct page *page) diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index d7569f77fa15..0e3efaa8a004 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -32,18 +32,12 @@ */ #include -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -55,7 +49,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index ba3371977d49..d049806a8354 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,7 +59,16 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + return kmap_atomic_high_prot(page, prot); +} extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic(struct page *page) diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 320c1672b2ae..f075cef6d663 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -30,16 +30,11 @@ * be used in IRQ contexts, so in some (very limited) cases we need * it. 
*/ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -49,7 +44,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 90b96594d6c5..61f47fef40e5 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,7 +58,16 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -void *kmap_atomic_prot(struct page *page, pgprot_t prot); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + + return kmap_atomic_high_prot(page, prot); +} void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index c4ebfd0ae401..48b56b1af902 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -12,17 +12,11 @@ * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only. */ -void *kmap_atomic_prot(struct page *page, pgprot_t prot) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -32,7 +26,7 @@ void *kmap_atomic_prot(struct page *page, pgprot_t prot) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_prot); +EXPORT_SYMBOL(kmap_atomic_high_prot); void *kmap_atomic(struct page *page) { -- cgit v1.2.3 From 78b6d91ec7bbfc5bcc2dd05bb2cf13c9de1dc7cd Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:42 -0700 Subject: arch/kmap_atomic: consolidate duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every arch has the same code to ensure atomic operations and a check for !HIGHMEM page. Remove the duplicate code by defining a core kmap_atomic() which only calls the arch specific kmap_atomic_high() when the page is high memory. [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-7-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 1 - arch/arc/mm/highmem.c | 9 ++------- arch/arm/include/asm/highmem.h | 1 - arch/arm/mm/highmem.c | 9 ++------- arch/csky/include/asm/highmem.h | 1 - arch/csky/mm/highmem.c | 9 ++------- arch/microblaze/include/asm/highmem.h | 4 ++-- arch/mips/include/asm/highmem.h | 1 - arch/mips/mm/cache.c | 2 +- arch/mips/mm/highmem.c | 18 ++---------------- arch/nds32/include/asm/highmem.h | 1 - arch/nds32/mm/highmem.c | 11 ++--------- arch/powerpc/include/asm/highmem.h | 4 ++-- arch/powerpc/mm/highmem.c | 6 ------ arch/sparc/include/asm/highmem.h | 1 - arch/sparc/mm/highmem.c | 9 ++------- arch/x86/include/asm/highmem.h | 5 ++++- arch/x86/mm/highmem_32.c | 14 -------------- arch/xtensa/include/asm/highmem.h | 1 - arch/xtensa/mm/highmem.c | 9 ++------- include/linux/highmem.h | 23 +++++++++++++++++++++++ 21 files changed, 46 insertions(+), 93 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 8387a5596a91..db425cd38545 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,7 +30,6 @@ #include -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void kmap_init(void); diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 4db13a6b9f3b..0964b011c29f 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,16 +49,11 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { int idx, cpu_idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - cpu_idx = kmap_atomic_idx_push(); idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); vaddr = FIXMAP_ADDR(idx); @@ -68,7 +63,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kv) { diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 736f65283e7b..8c80bfe18a34 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -60,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index c700b32350ee..075fdc235091 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,18 +31,13 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned int idx; unsigned long vaddr; void *kmap; int type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - #ifdef CONFIG_DEBUG_HIGHMEM /* * There is no cache coherency issue when non VIVT, so force the @@ -76,7 +71,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index be11c5b67122..8ceee12f9bc1 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -32,7 +32,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index e9952211264b..63d74b47eee6 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,16 +21,11 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -42,7 +37,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index c38d920a1171..f7c5467df5ad 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -63,9 +63,9 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) } extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_high(struct page *page) { - return kmap_atomic_prot(page, kmap_prot); + return kmap_atomic_high_prot(page, kmap_prot); } #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 24e7e7e5cc7b..8bdbbfc322ad 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -48,7 +48,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index ad6df1cea866..295bfda7da97 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -14,9 +14,9 @@ #include #include #include +#include #include -#include #include #include #include diff --git 
a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index eb8ec8493f2f..2bda56372995 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,25 +18,11 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap is is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ - -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; int idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -48,7 +34,7 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index c93c7368bb3f..a3970e566ede 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -51,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. */ #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic(struct page *page); extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index f9348bec0ecb..d4387d835870 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,18 +10,13 @@ #include #include -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned int idx; unsigned long vaddr, pte; int type; pte_t *ptep; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR * smp_processor_id(); @@ -36,8 +31,7 @@ void *kmap_atomic(struct page *page) __nds32__isb(); return (void *)vaddr; } - -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { @@ -53,5 +47,4 @@ void __kunmap_atomic(void *kvaddr) pagefault_enable(); preempt_enable(); } - EXPORT_SYMBOL(__kunmap_atomic); diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index d049806a8354..74fa2c726fde 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -71,9 +71,9 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) } extern void __kunmap_atomic(void *kvaddr); -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_high(struct page *page) { - return kmap_atomic_prot(page, kmap_prot); + return kmap_atomic_high_prot(page, kmap_prot); } diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index f075cef6d663..67aaa5217f7f 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -24,12 +24,6 @@ #include #include -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. 
- */ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 4bdb79fed02c..458210c5bc38 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,7 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); #define flush_cache_kmaps() flush_cache_all() diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index d4a80adea7e5..b53070ab6a31 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -53,16 +53,11 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { unsigned long vaddr; long idx, type; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - type = kmap_atomic_idx_push(); idx = type + KM_TYPE_NR*smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -87,7 +82,7 @@ void *kmap_atomic(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 61f47fef40e5..9393d55a2adb 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -68,7 +68,10 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -void *kmap_atomic(struct page *page); +static inline void *kmap_atomic_high(struct page *page) +{ + return kmap_atomic_high_prot(page, kmap_prot); +} void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 48b56b1af902..c3e272a759e0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,14 +4,6 @@ #include /* for totalram_pages */ #include -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. - * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. - */ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; @@ -28,12 +20,6 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void *kmap_atomic(struct page *page) -{ - return kmap_atomic_prot(page, kmap_prot); -} -EXPORT_SYMBOL(kmap_atomic); - /* * This is the same as kmap_atomic() but can map memory that doesn't * have a struct page associated with it. 
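For illustration, the split between the generic kmap_atomic() and the per-arch kmap_atomic_high() leaves callers untouched; a hypothetical user of the interface (names taken from the existing API, not from the diffs above) would still look like this sketch:

#include <linux/highmem.h>
#include <linux/string.h>

/* Hypothetical helper: copy one highmem page to another. */
static void copy_highmem_page_example(struct page *dst, struct page *src)
{
	void *vdst = kmap_atomic(dst);
	void *vsrc = kmap_atomic(src);

	memcpy(vdst, vsrc, PAGE_SIZE);

	kunmap_atomic(vsrc);	/* atomic kmaps nest; unmap in reverse order */
	kunmap_atomic(vdst);
}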
diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 5a481f7def0b..1e6aa15c4bdf 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -68,7 +68,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -void *kmap_atomic(struct page *page); void __kunmap_atomic(void *kvaddr); void kmap_init(void); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index da734a2ed641..90b85a897cb0 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,16 +37,11 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic(struct page *page) +void *kmap_atomic_high(struct page *page) { enum fixed_addresses idx; unsigned long vaddr; - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - idx = kmap_idx(kmap_atomic_idx_push(), DCACHE_ALIAS(page_to_phys(page))); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); @@ -57,7 +52,7 @@ void *kmap_atomic(struct page *page) return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic); +EXPORT_SYMBOL(kmap_atomic_high); void __kunmap_atomic(void *kvaddr) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 216a647ed7db..d2209ae8be99 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,6 +32,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include #ifdef CONFIG_HIGHMEM +extern void *kmap_atomic_high(struct page *page); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -62,6 +63,28 @@ static inline void kunmap(struct page *page) kunmap_high(page); } +/* + * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because + * no global lock is needed and because the kmap code must perform a global TLB + * invalidation when the kmap pool wraps. + * + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + * + * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap + * gives a more generic (and caching) interface. But kmap_atomic can + * be used in IRQ contexts, so in some (very limited) cases we need + * it. + */ +static inline void *kmap_atomic(struct page *page) +{ + preempt_disable(); + pagefault_disable(); + if (!PageHighMem(page)) + return page_address(page); + return kmap_atomic_high(page); +} + /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); extern atomic_long_t _totalhigh_pages; -- cgit v1.2.3 From abca2500c0c1b20c3e552f259da4c4a99db3b4d1 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:46 -0700 Subject: arch/kunmap_atomic: consolidate duplicate code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every single architecture (including !CONFIG_HIGHMEM) calls... pagefault_enable(); preempt_enable(); ... before returning from __kunmap_atomic(). Lift this code into the kunmap_atomic() macro. While we are at it rename __kunmap_atomic() to kunmap_atomic_high() to be consistent. [ira.weiny@intel.com: don't enable pagefault/preempt twice] Link: http://lkml.kernel.org/r/20200518184843.3029640-1-ira.weiny@intel.com [akpm@linux-foundation.org: coding style fixes] Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. 
Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Guenter Roeck Link: http://lkml.kernel.org/r/20200507150004.1423069-8-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 2 -- arch/arc/mm/highmem.c | 7 ++----- arch/arm/include/asm/highmem.h | 1 - arch/arm/mm/highmem.c | 6 ++---- arch/csky/include/asm/highmem.h | 1 - arch/csky/mm/highmem.c | 9 +++------ arch/microblaze/include/asm/highmem.h | 1 - arch/microblaze/mm/highmem.c | 11 +++-------- arch/mips/include/asm/highmem.h | 1 - arch/mips/mm/cache.c | 4 ++-- arch/mips/mm/highmem.c | 11 +++-------- arch/nds32/include/asm/highmem.h | 1 - arch/nds32/mm/highmem.c | 6 ++---- arch/parisc/include/asm/cacheflush.h | 4 +--- arch/powerpc/include/asm/highmem.h | 1 - arch/powerpc/mm/highmem.c | 11 +++-------- arch/sparc/include/asm/highmem.h | 2 -- arch/sparc/mm/highmem.c | 11 +++-------- arch/x86/include/asm/highmem.h | 1 - arch/x86/mm/highmem_32.c | 7 ++----- arch/xtensa/include/asm/highmem.h | 2 -- arch/xtensa/mm/highmem.c | 7 ++----- include/linux/highmem.h | 13 +++++++++---- 23 files changed, 37 insertions(+), 83 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index db425cd38545..70900a73bfc8 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -30,8 +30,6 @@ #include -extern void __kunmap_atomic(void *kvaddr); - extern void kmap_init(void); static inline void flush_cache_kmaps(void) diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 0964b011c29f..5d3eab4ac0b0 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -65,7 +65,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kv) +void kunmap_atomic_high(void *kv) { unsigned long kvaddr = (unsigned long)kv; @@ -87,11 +87,8 @@ void __kunmap_atomic(void *kv) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 8c80bfe18a34..b0d4bd8dc3c1 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -60,7 +60,6 @@ static inline void *kmap_high_get(struct page *page) * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #endif diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index 075fdc235091..ac8394655a6e 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -73,7 +73,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx, type; @@ -95,10 +95,8 @@ void __kunmap_atomic(void *kvaddr) /* this address was obtained through kmap_high_get() */ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void *kmap_atomic_pfn(unsigned long pfn) { diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 8ceee12f9bc1..263fbddcd0a3 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -32,7 +32,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 63d74b47eee6..0aafbbbe651c 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -39,13 +39,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int idx; if (vaddr < FIXADDR_START) - goto out; + return; #ifdef CONFIG_DEBUG_HIGHMEM idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); @@ -58,11 +58,8 @@ void __kunmap_atomic(void *kvaddr) (void) idx; /* to kill a warning */ #endif kmap_atomic_idx_pop(); -out: - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index f7c5467df5ad..c3cbda90391d 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -61,7 +61,6 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic_high(struct page *page) { diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c index 0e3efaa8a004..92e0890416c9 100644 --- a/arch/microblaze/mm/highmem.c +++ b/arch/microblaze/mm/highmem.c @@ -51,17 +51,14 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; unsigned int idx; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } type = kmap_atomic_idx(); @@ -77,7 +74,5 @@ void __kunmap_atomic(void *kvaddr) local_flush_tlb_page(NULL, vaddr); kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/mips/include/asm/highmem.h 
b/arch/mips/include/asm/highmem.h index 8bdbbfc322ad..76dec0bd4f59 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -48,7 +48,6 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 295bfda7da97..3e81ba000096 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -103,7 +103,7 @@ void __flush_dcache_page(struct page *page) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); } EXPORT_SYMBOL(__flush_dcache_page); @@ -146,7 +146,7 @@ void __update_cache(unsigned long address, pte_t pte) flush_data_cache_page(addr); if (PageHighMem(page)) - __kunmap_atomic((void *)addr); + kunmap_atomic((void *)addr); ClearPageDcacheDirty(page); } diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 2bda56372995..73ef4004fe5f 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -36,16 +36,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type __maybe_unused; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if (vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); #ifdef CONFIG_DEBUG_HIGHMEM @@ -63,10 +60,8 @@ void __kunmap_atomic(void *kvaddr) } #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); /* * This is the same as kmap_atomic() but can map memory that doesn't diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index a3970e566ede..4d21308549c9 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -51,7 +51,6 @@ extern void kmap_init(void); * when CONFIG_HIGHMEM is not set. 
*/ #ifdef CONFIG_HIGHMEM -extern void __kunmap_atomic(void *kvaddr); extern void *kmap_atomic_pfn(unsigned long pfn); extern struct page *kmap_atomic_to_page(void *ptr); #endif diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index d4387d835870..d25c815fda21 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -33,7 +33,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START) { unsigned long vaddr = (unsigned long)kvaddr; @@ -44,7 +44,5 @@ void __kunmap_atomic(void *kvaddr) ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, 0); } - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/parisc/include/asm/cacheflush.h b/arch/parisc/include/asm/cacheflush.h index 0c83644bfa5c..119c9a7681bc 100644 --- a/arch/parisc/include/asm/cacheflush.h +++ b/arch/parisc/include/asm/cacheflush.h @@ -122,11 +122,9 @@ static inline void *kmap_atomic(struct page *page) return page_address(page); } -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { flush_kernel_dcache_page_addr(addr); - pagefault_enable(); - preempt_enable(); } #define kmap_atomic_prot(page, prot) kmap_atomic(page) diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 74fa2c726fde..373a470df205 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -69,7 +69,6 @@ static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) return kmap_atomic_high_prot(page, prot); } -extern void __kunmap_atomic(void *kvaddr); static inline void *kmap_atomic_high(struct page *page) { diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c index 67aaa5217f7f..624b4438aff9 100644 --- a/arch/powerpc/mm/highmem.c +++ b/arch/powerpc/mm/highmem.c @@ -40,15 +40,12 @@ void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) } EXPORT_SYMBOL(kmap_atomic_high_prot); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - if (vaddr < __fix_to_virt(FIX_KMAP_END)) { - pagefault_enable(); - preempt_enable(); + if (vaddr < __fix_to_virt(FIX_KMAP_END)) return; - } if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { int type = kmap_atomic_idx(); @@ -66,7 +63,5 @@ void __kunmap_atomic(void *kvaddr) } kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 458210c5bc38..f4babe67cb5d 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -50,8 +50,6 @@ void kmap_init(void) __init; #define PKMAP_END (PKMAP_ADDR(LAST_PKMAP)) -void __kunmap_atomic(void *kvaddr); - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index b53070ab6a31..06798ae813b9 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -84,16 +84,13 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; int type; - if (vaddr < FIXADDR_START) { // FIXME - pagefault_enable(); - preempt_enable(); + if 
(vaddr < FIXADDR_START) return; - } type = kmap_atomic_idx(); @@ -126,7 +123,5 @@ void __kunmap_atomic(void *kvaddr) #endif kmap_atomic_idx_pop(); - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 9393d55a2adb..be66b77885a0 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -72,7 +72,6 @@ static inline void *kmap_atomic_high(struct page *page) { return kmap_atomic_high_prot(page, kmap_prot); } -void __kunmap_atomic(void *kvaddr); void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index c3e272a759e0..075fe51317b0 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -30,7 +30,7 @@ void *kmap_atomic_pfn(unsigned long pfn) } EXPORT_SYMBOL_GPL(kmap_atomic_pfn); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; @@ -60,11 +60,8 @@ void __kunmap_atomic(void *kvaddr) BUG_ON(vaddr >= (unsigned long)high_memory); } #endif - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init set_highmem_pages_init(void) { diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index 1e6aa15c4bdf..d6a10704307a 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -68,8 +68,6 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } -void __kunmap_atomic(void *kvaddr); - void kmap_init(void); #endif diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 90b85a897cb0..4de323e43682 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -54,7 +54,7 @@ void *kmap_atomic_high(struct page *page) } EXPORT_SYMBOL(kmap_atomic_high); -void __kunmap_atomic(void *kvaddr) +void kunmap_atomic_high(void *kvaddr) { if (kvaddr >= (void *)FIXADDR_START && kvaddr < (void *)FIXADDR_TOP) { @@ -73,11 +73,8 @@ void __kunmap_atomic(void *kvaddr) kmap_atomic_idx_pop(); } - - pagefault_enable(); - preempt_enable(); } -EXPORT_SYMBOL(__kunmap_atomic); +EXPORT_SYMBOL(kunmap_atomic_high); void __init kmap_init(void) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index d2209ae8be99..945b58d8a57b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -33,6 +33,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #ifdef CONFIG_HIGHMEM extern void *kmap_atomic_high(struct page *page); +extern void kunmap_atomic_high(void *kvaddr); #include #ifndef ARCH_HAS_KMAP_FLUSH_TLB @@ -151,10 +152,12 @@ static inline void *kmap_atomic(struct page *page) } #define kmap_atomic_prot(page, prot) kmap_atomic(page) -static inline void __kunmap_atomic(void *addr) +static inline void kunmap_atomic_high(void *addr) { - pagefault_enable(); - preempt_enable(); + /* + * Nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() + * handles re-enabling faults + preemption + */ } #define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) @@ -204,7 +207,9 @@ static inline void kmap_atomic_idx_pop(void) #define kunmap_atomic(addr) \ do { \ BUILD_BUG_ON(__same_type((addr), struct page *)); \ - __kunmap_atomic(addr); \ + kunmap_atomic_high(addr); \ + pagefault_enable(); \ + preempt_enable(); \ } while (0) -- cgit v1.2.3 From 
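Taken together, the kmap_atomic() and kunmap_atomic() consolidations leave a HIGHMEM architecture with only two hooks to implement; a simplified sketch of that contract (not a literal header; the !HIGHMEM stub mirrors the hunk above):

#ifdef CONFIG_HIGHMEM
/* The arch supplies only the fixmap mapping primitives. */
void *kmap_atomic_high(struct page *page);	/* map one highmem page */
void kunmap_atomic_high(void *kvaddr);		/* undo that mapping */
#else
/* Without HIGHMEM, kmap_atomic() returned page_address(); nothing to undo. */
static inline void kunmap_atomic_high(void *addr) { }
#endif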
20b271dfe9d932b02b067a1f7ba9805c5b8d79bd Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:47:58 -0700 Subject: arch/kmap: define kmap_atomic_prot() for all arch's MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To support kmap_atomic_prot(), all architectures need to support protections passed to their kmap_atomic_high() function. Pass protections into kmap_atomic_high() and change the name to kmap_atomic_high_prot() to match. Then define kmap_atomic_prot() as a core function which calls kmap_atomic_high_prot() when needed. Finally, redefine kmap_atomic() as a wrapper of kmap_atomic_prot() with the default kmap_prot exported by the architectures. Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-11-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/mm/highmem.c | 6 +++--- arch/arm/mm/highmem.c | 6 +++--- arch/csky/mm/highmem.c | 6 +++--- arch/microblaze/include/asm/highmem.h | 16 ---------------- arch/mips/mm/highmem.c | 6 +++--- arch/nds32/mm/highmem.c | 6 +++--- arch/powerpc/include/asm/highmem.h | 17 ----------------- arch/sparc/mm/highmem.c | 6 +++--- arch/x86/include/asm/highmem.h | 14 -------------- arch/xtensa/mm/highmem.c | 6 +++--- include/linux/highmem.h | 7 ++++--- 11 files changed, 25 insertions(+), 71 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 5d3eab4ac0b0..479b0d72d3cf 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -49,7 +49,7 @@ extern pte_t * pkmap_page_table; static pte_t * fixmap_page_table; -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { int idx, cpu_idx; unsigned long vaddr; @@ -59,11 +59,11 @@ void *kmap_atomic_high(struct page *page) vaddr = FIXMAP_ADDR(idx); set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, - mk_pte(page, kmap_prot)); + mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kv) { diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c index ac8394655a6e..e013f6b81328 100644 --- a/arch/arm/mm/highmem.c +++ b/arch/arm/mm/highmem.c @@ -31,7 +31,7 @@ static inline pte_t get_fixmap_pte(unsigned long vaddr) return *ptep; } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr; @@ -67,11 +67,11 @@ void *kmap_atomic_high(struct page *page) * in place, so the contained TLB flush ensures the TLB is updated * with the new mapping. 
*/ - set_fixmap_pte(idx, mk_pte(page, kmap_prot)); + set_fixmap_pte(idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index f4311669b5bb..3ae5c8cd7619 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -21,7 +21,7 @@ EXPORT_SYMBOL(kmap_flush_tlb); EXPORT_SYMBOL(kmap); -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; @@ -32,12 +32,12 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); flush_tlb_one((unsigned long)vaddr); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 90d96239152f..d7c55cfd27bd 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -51,22 +51,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} - -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} - #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } #endif /* __KERNEL__ */ diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 96a486777a18..8e8726992720 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -18,7 +18,7 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; int idx, type; @@ -29,12 +29,12 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte - idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); local_flush_tlb_one((unsigned long)vaddr); return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c index b11b88956353..4284cd59e21a 100644 --- a/arch/nds32/mm/highmem.c +++ b/arch/nds32/mm/highmem.c @@ -10,7 +10,7 @@ #include #include -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned int idx; unsigned long vaddr, pte; @@ -21,7 +21,7 @@ void *kmap_atomic_high(struct page *page) idx = type + KM_TYPE_NR * smp_processor_id(); vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | (kmap_prot); + pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); set_pte(ptep, pte); @@ -31,7 +31,7 @@ void *kmap_atomic_high(struct page *page) __nds32__isb(); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); 
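/*
 * Illustrative sketch, not part of the nds32 hunk above: once
 * kmap_atomic_prot() is defined once in generic code, any caller may
 * pass a non-default protection.  The helper name and buffers below
 * are hypothetical; assumes <linux/highmem.h> and <linux/string.h>.
 */
static void fill_page_writecombine_example(struct page *page, const void *src)
{
	void *vaddr = kmap_atomic_prot(page, pgprot_writecombine(PAGE_KERNEL));

	memcpy(vaddr, src, PAGE_SIZE);
	kunmap_atomic(vaddr);	/* same teardown as a plain kmap_atomic() */
}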
void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index ee5de974c5ef..8d8ee3fcd800 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -59,23 +59,6 @@ extern pte_t *pkmap_page_table; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} - -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} - - #define flush_cache_kmaps() flush_cache_all() #endif /* __KERNEL__ */ diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 02e23a677c58..9309bcab4ae6 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -54,7 +54,7 @@ void __init kmap_init(void) kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { unsigned long vaddr; long idx, type; @@ -73,7 +73,7 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte-idx))); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte-idx, mk_pte(page, prot)); /* XXX Fix - Anton */ #if 0 __flush_tlb_one(vaddr); @@ -83,7 +83,7 @@ void *kmap_atomic_high(struct page *page) return (void*) vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index be66b77885a0..0f420b24e0fc 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -58,20 +58,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - return kmap_atomic_high_prot(page, prot); -} -static inline void *kmap_atomic_high(struct page *page) -{ - return kmap_atomic_high_prot(page, kmap_prot); -} void *kmap_atomic_pfn(unsigned long pfn); void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 50168b09510a..99b5ad137ab5 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -37,7 +37,7 @@ static inline enum fixed_addresses kmap_idx(int type, unsigned long color) color; } -void *kmap_atomic_high(struct page *page) +void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) { enum fixed_addresses idx; unsigned long vaddr; @@ -48,11 +48,11 @@ void *kmap_atomic_high(struct page *page) #ifdef CONFIG_DEBUG_HIGHMEM BUG_ON(!pte_none(*(kmap_pte + idx))); #endif - set_pte(kmap_pte + idx, mk_pte(page, kmap_prot)); + set_pte(kmap_pte + idx, mk_pte(page, prot)); return (void *)vaddr; } -EXPORT_SYMBOL(kmap_atomic_high); +EXPORT_SYMBOL(kmap_atomic_high_prot); void kunmap_atomic_high(void *kvaddr) { diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 
945b58d8a57b..9c559c670299 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -32,7 +32,7 @@ static inline void invalidate_kernel_vmap_range(void *vaddr, int size) #include #ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_high(struct page *page); +extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); extern void kunmap_atomic_high(void *kvaddr); #include @@ -77,14 +77,15 @@ static inline void kunmap(struct page *page) * be used in IRQ contexts, so in some (very limited) cases we need * it. */ -static inline void *kmap_atomic(struct page *page) +static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) { preempt_disable(); pagefault_disable(); if (!PageHighMem(page)) return page_address(page); - return kmap_atomic_high(page); + return kmap_atomic_high_prot(page, prot); } +#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -- cgit v1.2.3 From 090e77e166334b83f555de408df64b9ab394ea08 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Thu, 4 Jun 2020 16:48:18 -0700 Subject: kmap: consolidate kmap_prot definitions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Most architectures define kmap_prot to be PAGE_KERNEL. Let sparc and xtensa define there own and define PAGE_KERNEL as the default if not overridden. [akpm@linux-foundation.org: coding style fixes] Suggested-by: Christoph Hellwig Signed-off-by: Ira Weiny Signed-off-by: Andrew Morton Cc: Al Viro Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian König Cc: Chris Zankel Cc: Daniel Vetter Cc: Dan Williams Cc: Dave Hansen Cc: "David S. Miller" Cc: Helge Deller Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: "James E.J. 
Bottomley" Cc: Max Filippov Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200507150004.1423069-16-ira.weiny@intel.com Signed-off-by: Linus Torvalds --- arch/arc/include/asm/highmem.h | 3 --- arch/arm/include/asm/highmem.h | 2 -- arch/csky/include/asm/highmem.h | 2 -- arch/microblaze/include/asm/highmem.h | 1 - arch/mips/include/asm/highmem.h | 2 -- arch/nds32/include/asm/highmem.h | 1 - arch/powerpc/include/asm/highmem.h | 1 - arch/sparc/include/asm/highmem.h | 3 ++- arch/sparc/mm/highmem.c | 4 ---- arch/x86/include/asm/fixmap.h | 1 - include/linux/highmem.h | 4 ++++ 11 files changed, 6 insertions(+), 18 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 70900a73bfc8..6e5eafb3afdd 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -25,9 +25,6 @@ #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) -#define kmap_prot PAGE_KERNEL - - #include extern void kmap_init(void); diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index b0d4bd8dc3c1..31811be38d78 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -10,8 +10,6 @@ #define PKMAP_NR(virt) (((virt) - PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL - #define flush_cache_kmaps() \ do { \ if (cache_is_vivt()) \ diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index ea2f3f39174d..14645e3d5cd5 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -38,8 +38,6 @@ extern void *kmap_atomic_pfn(unsigned long pfn); extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* __ASM_CSKY_HIGHMEM_H */ diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index d7c55cfd27bd..284ca8fb54c1 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -25,7 +25,6 @@ #include #include -#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 76dec0bd4f59..f1f788b57166 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -54,8 +54,6 @@ extern void *kmap_atomic_pfn(unsigned long pfn); extern void kmap_init(void); -#define kmap_prot PAGE_KERNEL - #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index a48a6536d41a..5717647d14d1 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -32,7 +32,6 @@ #define LAST_PKMAP_MASK (LAST_PKMAP - 1) #define PKMAP_NR(virt) (((virt) - (PKMAP_BASE)) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -#define kmap_prot PAGE_KERNEL static inline void flush_cache_kmaps(void) { diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 8d8ee3fcd800..104026f7d6bc 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -29,7 +29,6 @@ #include #include -#define kmap_prot PAGE_KERNEL extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index f4babe67cb5d..ddb03c04f1f3 
100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -25,11 +25,12 @@ #include #include #include +#include /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; -extern pgprot_t kmap_prot; +#define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) extern pte_t *pkmap_page_table; void kmap_init(void) __init; diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c index 9309bcab4ae6..6ff6e2a9f9b3 100644 --- a/arch/sparc/mm/highmem.c +++ b/arch/sparc/mm/highmem.c @@ -32,9 +32,6 @@ #include #include -pgprot_t kmap_prot; -EXPORT_SYMBOL(kmap_prot); - static pte_t *kmap_pte; void __init kmap_init(void) @@ -51,7 +48,6 @@ void __init kmap_init(void) /* cache the first kmap pte */ kmap_pte = pte_offset_kernel(dir, address); - kmap_prot = __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE); } void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 28183ee3cc42..b9527a54db99 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -152,7 +152,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; extern pte_t *kmap_pte; -#define kmap_prot PAGE_KERNEL extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 091b32dff2d1..d6e82e3de027 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -40,6 +40,10 @@ extern void kunmap_atomic_high(void *kvaddr); static inline void kmap_flush_tlb(unsigned long addr) { } #endif +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + void *kmap_high(struct page *page); static inline void *kmap(struct page *page) { -- cgit v1.2.3 From e0cf615d725cb3b69f0bdf1d8afdd4d4c31b4fd1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 7 Jun 2020 21:41:42 -0700 Subject: asm-generic: don't include in cacheflush.h This seems to lead to some crazy include loops when using asm-generic/cacheflush.h on more architectures, so leave it to the arch header for now. [hch@lst.de: fix warning] Link: http://lkml.kernel.org/r/20200520173520.GA11199@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Cc: Will Deacon Cc: Nick Piggin Cc: Peter Zijlstra Cc: Jeff Dike Cc: Richard Weinberger Cc: Anton Ivanov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Dan Williams Cc: Vishal Verma Cc: Dave Jiang Cc: Keith Busch Cc: Ira Weiny Cc: Arnd Bergmann Link: http://lkml.kernel.org/r/20200515143646.3857579-7-hch@lst.de Signed-off-by: Linus Torvalds --- arch/um/include/asm/tlb.h | 2 ++ arch/x86/include/asm/cacheflush.h | 2 ++ drivers/media/platform/omap3isp/ispvideo.c | 2 +- drivers/nvdimm/pmem.c | 3 ++- include/asm-generic/cacheflush.h | 3 --- 5 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 70ee60383900..ff9c62828962 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -2,6 +2,8 @@ #ifndef __UM_TLB_H #define __UM_TLB_H +#include + #include #include #include diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 63feaf2a5f93..b192d917a6d0 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_CACHEFLUSH_H #define _ASM_X86_CACHEFLUSH_H +#include + /* Caches aren't brain-dead on the intel. 
*/ #include #include diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c index 6f769c527fae..10c214bd0903 100644 --- a/drivers/media/platform/omap3isp/ispvideo.c +++ b/drivers/media/platform/omap3isp/ispvideo.c @@ -10,7 +10,6 @@ * Sakari Ailus */ -#include #include #include #include @@ -19,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 97f948f8f4e6..d1ecd6da11a2 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -7,7 +7,6 @@ * Copyright (c) 2015, Boaz Harrosh . */ -#include #include #include #include @@ -25,6 +24,8 @@ #include #include #include +#include +#include #include "pmem.h" #include "pfn.h" #include "nd.h" diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 906277492ec5..bf9bb83e9fc8 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,9 +2,6 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H -/* Keep includes the same across arches. */ -#include #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 /* -- cgit v1.2.3 From d46b3df78ad4b4c178f1035a35463cbc0ce768b2 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 8 Jun 2020 21:31:57 -0700 Subject: x86: add missing const qualifiers for log_lvl Currently, the log-level of show_stack() depends on a platform realization. It creates situations where the headers are printed with a lower or higher log level than the stacktrace (depending on a platform or user). Furthermore, it forces the logic decision from the user onto the architecture side. As a result, some users such as sysrq/kdb/etc are doing tricks with temporarily raising console_loglevel while printing their messages. And as a result it not only may print unwanted messages from other CPUs, but also omit printing at all in the unlucky case where the printk() was deferred. Introducing a log-level parameter and KERN_UNSUPPRESSED [1] seems an easier approach than introducing more printk buffers. Also, it will consolidate printings with headers. Keep log_lvl const in show_trace_log_lvl() and printk_stack_address() as the new generic show_stack_loglvl() wants to have a proper const qualifier. And gcc rightfully produces warnings in case it's not kept: arch/x86/kernel/dumpstack.c: In function `show_stack': arch/x86/kernel/dumpstack.c:294:37: warning: passing argument 4 of `show_trace_log_lvl' discards `const' qualifier from pointer target type [-Wdiscarded-qualifiers] 294 | show_trace_log_lvl(task, NULL, sp, loglvl); | ^~~~~~ arch/x86/kernel/dumpstack.c:163:32: note: expected `char *' but argument is of type `const char *' 163 | unsigned long *stack, char *log_lvl) | ~~~~~~^~~~~~~ [1]: https://lore.kernel.org/lkml/20190528002412.1625-1-dima@arista.com/T/#u Signed-off-by: Dmitry Safonov Signed-off-by: Andrew Morton Cc: Borislav Petkov Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20200418201944.482088-41-dima@arista.com Signed-off-by: Linus Torvalds --- arch/x86/include/asm/stacktrace.h | 2 +- arch/x86/kernel/dumpstack.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/include') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 14db05086bbf..5ae5a68e469d 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -87,7 +87,7 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl); + unsigned long *stack, const char *log_lvl); /* The form of the top of the frame on the stack */ struct stack_frame { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ae64ec7f752f..b94bc31a1757 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -65,7 +65,7 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info) } static void printk_stack_address(unsigned long address, int reliable, - char *log_lvl) + const char *log_lvl) { touch_nmi_watchdog(); printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); @@ -160,7 +160,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, char *log_lvl) + unsigned long *stack, const char *log_lvl) { struct unwind_state state; struct stack_info stack_info = {0}; -- cgit v1.2.3 From e31cf2f4ca422ac9b14ecc4a1295b8977a20f812 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 8 Jun 2020 21:32:33 -0700 Subject: mm: don't include asm/pgtable.h if linux/mm.h is already included Patch series "mm: consolidate definitions of page table accessors", v2. The low level page table accessors (pXY_index(), pXY_offset()) are duplicated across all architectures and sometimes more than once. For instance, we have 31 definition of pgd_offset() for 25 supported architectures. Most of these definitions are actually identical and typically it boils down to, e.g. static inline unsigned long pmd_index(unsigned long address) { return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1); } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { return (pmd_t *)pud_page_vaddr(*pud) + pmd_index(address); } These definitions can be shared among 90% of the arches provided XYZ_SHIFT, PTRS_PER_XYZ and xyz_page_vaddr() are defined. For architectures that really need a custom version there is always possibility to override the generic version with the usual ifdefs magic. These patches introduce include/linux/pgtable.h that replaces include/asm-generic/pgtable.h and add the definitions of the page table accessors to the new header. This patch (of 12): The linux/mm.h header includes to allow inlining of the functions involving page table manipulations, e.g. pte_alloc() and pmd_alloc(). So, there is no point to explicitly include in the files that include . The include statements in such cases are remove with a simple loop: for f in $(git grep -l "include ") ; do sed -i -e '/include / d' $f done Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Cain Cc: Catalin Marinas Cc: Chris Zankel Cc: "David S. 
Miller" Cc: Geert Uytterhoeven Cc: Greentime Hu Cc: Greg Ungerer Cc: Guan Xuetao Cc: Guo Ren Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Ley Foon Tan Cc: Mark Salter Cc: Matthew Wilcox Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Mike Rapoport Cc: Nick Hu Cc: Paul Walmsley Cc: Richard Weinberger Cc: Rich Felker Cc: Russell King Cc: Stafford Horne Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Tony Luck Cc: Vincent Chen Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Link: http://lkml.kernel.org/r/20200514170327.31389-1-rppt@kernel.org Link: http://lkml.kernel.org/r/20200514170327.31389-2-rppt@kernel.org Signed-off-by: Linus Torvalds --- arch/alpha/boot/bootp.c | 1 - arch/alpha/boot/bootpz.c | 1 - arch/alpha/boot/main.c | 1 - arch/alpha/include/asm/io.h | 1 - arch/alpha/kernel/process.c | 1 - arch/alpha/kernel/ptrace.c | 1 - arch/alpha/kernel/setup.c | 1 - arch/alpha/kernel/smp.c | 1 - arch/alpha/kernel/sys_alcor.c | 1 - arch/alpha/kernel/sys_cabriolet.c | 1 - arch/alpha/kernel/sys_dp264.c | 1 - arch/alpha/kernel/sys_eb64p.c | 1 - arch/alpha/kernel/sys_eiger.c | 1 - arch/alpha/kernel/sys_jensen.c | 1 - arch/alpha/kernel/sys_marvel.c | 1 - arch/alpha/kernel/sys_miata.c | 1 - arch/alpha/kernel/sys_mikasa.c | 1 - arch/alpha/kernel/sys_nautilus.c | 1 - arch/alpha/kernel/sys_noritake.c | 1 - arch/alpha/kernel/sys_rawhide.c | 1 - arch/alpha/kernel/sys_ruffian.c | 1 - arch/alpha/kernel/sys_rx164.c | 1 - arch/alpha/kernel/sys_sable.c | 1 - arch/alpha/kernel/sys_sio.c | 1 - arch/alpha/kernel/sys_sx164.c | 1 - arch/alpha/kernel/sys_takara.c | 1 - arch/alpha/kernel/sys_titan.c | 1 - arch/alpha/kernel/sys_wildfire.c | 1 - arch/alpha/mm/init.c | 1 - arch/arm/kernel/machine_kexec.c | 1 - arch/arm/kernel/module.c | 1 - arch/arm/kernel/ptrace.c | 1 - arch/arm/kernel/smp.c | 1 - arch/arm/mach-ebsa110/core.c | 1 - arch/arm/mach-footbridge/common.c | 1 - arch/arm/mach-imx/mm-imx21.c | 1 - arch/arm/mach-imx/mm-imx27.c | 1 - arch/arm/mach-imx/mm-imx3.c | 1 - arch/arm/mach-iop32x/i2c.c | 1 - arch/arm/mach-iop32x/iq31244.c | 1 - arch/arm/mach-iop32x/iq80321.c | 1 - arch/arm/mach-iop32x/n2100.c | 1 - arch/arm/mach-ixp4xx/common.c | 1 - arch/arm/mach-sa1100/assabet.c | 1 - arch/arm/mm/copypage-v4mc.c | 1 - arch/arm/mm/copypage-v6.c | 1 - arch/arm/mm/copypage-xscale.c | 1 - arch/arm/mm/dump.c | 1 - arch/arm/mm/fault-armv.c | 1 - arch/arm/mm/fault.c | 1 - arch/arm/mm/pageattr.c | 1 - arch/arm64/kernel/hibernate.c | 1 - arch/arm64/kernel/ptrace.c | 1 - arch/arm64/kernel/smp.c | 1 - arch/arm64/mm/dump.c | 1 - arch/arm64/mm/fault.c | 1 - arch/arm64/mm/kasan_init.c | 1 - arch/arm64/mm/pageattr.c | 1 - arch/csky/kernel/module.c | 1 - arch/csky/kernel/ptrace.c | 1 - arch/csky/mm/init.c | 1 - arch/csky/mm/tlb.c | 1 - arch/h8300/kernel/process.c | 1 - arch/h8300/kernel/setup.c | 1 - arch/h8300/kernel/signal.c | 1 - arch/h8300/mm/fault.c | 1 - arch/h8300/mm/init.c | 1 - arch/h8300/mm/memory.c | 1 - arch/hexagon/mm/vm_fault.c | 1 - arch/ia64/kernel/efi.c | 1 - arch/ia64/kernel/ptrace.c | 1 - arch/ia64/kernel/smp.c | 1 - arch/ia64/kernel/smpboot.c | 1 - arch/ia64/mm/contig.c | 1 - arch/ia64/mm/fault.c | 1 - arch/m68k/68000/timers.c | 1 - arch/m68k/amiga/config.c | 1 - arch/m68k/apollo/config.c | 1 - arch/m68k/atari/atasound.c | 1 - arch/m68k/atari/stram.c | 1 - arch/m68k/bvme6000/config.c | 1 - arch/m68k/kernel/process.c | 1 - arch/m68k/kernel/ptrace.c | 1 - arch/m68k/kernel/setup_no.c | 1 - arch/m68k/kernel/signal.c | 1 - arch/m68k/kernel/uboot.c | 1 - arch/m68k/mac/config.c | 1 - 
arch/m68k/mm/mcfmmu.c | 1 - arch/m68k/mm/sun3kmap.c | 1 - arch/m68k/mm/sun3mmu.c | 1 - arch/m68k/mvme147/config.c | 1 - arch/m68k/mvme16x/config.c | 1 - arch/m68k/q40/config.c | 1 - arch/m68k/sun3/config.c | 1 - arch/m68k/sun3/dvma.c | 1 - arch/m68k/sun3/mmu_emu.c | 1 - arch/m68k/sun3/sun3dvma.c | 1 - arch/m68k/sun3x/dvma.c | 1 - arch/m68k/sun3x/prom.c | 1 - arch/microblaze/kernel/signal.c | 1 - arch/microblaze/mm/fault.c | 1 - arch/mips/fw/arc/memory.c | 1 - arch/mips/include/asm/mach-generic/floppy.h | 1 - arch/mips/include/asm/mach-jazz/floppy.h | 1 - arch/mips/jazz/jazzdma.c | 1 - arch/mips/kernel/module.c | 1 - arch/mips/kernel/process.c | 1 - arch/mips/kernel/ptrace.c | 1 - arch/mips/kernel/ptrace32.c | 1 - arch/mips/kernel/smp-bmips.c | 1 - arch/mips/kernel/traps.c | 1 - arch/mips/kvm/tlb.c | 1 - arch/mips/lib/dump_tlb.c | 1 - arch/mips/lib/r3k_dump_tlb.c | 1 - arch/mips/mm/c-octeon.c | 1 - arch/mips/mm/c-r3k.c | 1 - arch/mips/mm/c-r4k.c | 1 - arch/mips/mm/c-tx39.c | 1 - arch/mips/mm/init.c | 1 - arch/mips/mm/page.c | 1 - arch/mips/mm/pgtable-32.c | 1 - arch/mips/mm/pgtable-64.c | 1 - arch/mips/mm/sc-ip22.c | 1 - arch/mips/mm/sc-mips.c | 1 - arch/mips/mm/sc-r5k.c | 1 - arch/mips/mm/tlb-r3k.c | 1 - arch/mips/mm/tlb-r4k.c | 1 - arch/mips/sgi-ip27/ip27-init.c | 1 - arch/mips/sgi-ip27/ip27-timer.c | 1 - arch/mips/sgi-ip32/ip32-memory.c | 1 - arch/nds32/mm/fault.c | 1 - arch/nds32/mm/proc.c | 1 - arch/nios2/kernel/module.c | 1 - arch/nios2/mm/init.c | 1 - arch/nios2/mm/pgtable.c | 1 - arch/nios2/mm/tlb.c | 1 - arch/openrisc/include/asm/tlbflush.h | 1 - arch/openrisc/kernel/asm-offsets.c | 1 - arch/openrisc/kernel/process.c | 1 - arch/openrisc/kernel/ptrace.c | 1 - arch/openrisc/kernel/setup.c | 1 - arch/openrisc/kernel/traps.c | 1 - arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/tlb.c | 1 - arch/parisc/include/asm/mmu_context.h | 1 - arch/parisc/kernel/module.c | 1 - arch/parisc/kernel/ptrace.c | 1 - arch/parisc/kernel/smp.c | 1 - arch/parisc/mm/init.c | 1 - arch/powerpc/include/asm/io.h | 1 - arch/powerpc/kernel/asm-offsets.c | 1 - arch/powerpc/kernel/process.c | 1 - arch/powerpc/kernel/signal_32.c | 1 - arch/powerpc/kernel/signal_64.c | 1 - arch/powerpc/kernel/traps.c | 1 - arch/powerpc/kernel/vdso.c | 1 - arch/powerpc/lib/code-patching.c | 1 - arch/powerpc/mm/book3s64/hash_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/hash_pgtable.c | 1 - arch/powerpc/mm/book3s64/radix_hugetlbpage.c | 1 - arch/powerpc/mm/book3s64/radix_pgtable.c | 1 - arch/powerpc/mm/fault.c | 1 - arch/powerpc/mm/hugetlbpage.c | 1 - arch/powerpc/mm/init_32.c | 1 - arch/powerpc/mm/init_64.c | 1 - arch/powerpc/mm/mem.c | 1 - arch/powerpc/mm/nohash/40x.c | 1 - arch/powerpc/mm/nohash/fsl_booke.c | 1 - arch/powerpc/mm/pgtable_32.c | 1 - arch/powerpc/mm/pgtable_64.c | 1 - arch/powerpc/mm/ptdump/hashpagetable.c | 1 - arch/powerpc/mm/ptdump/ptdump.c | 1 - arch/powerpc/perf/callchain.c | 1 - arch/powerpc/perf/callchain_32.c | 1 - arch/powerpc/perf/callchain_64.c | 1 - arch/powerpc/platforms/8xx/cpm1.c | 1 - arch/powerpc/platforms/8xx/micropatch.c | 1 - arch/powerpc/platforms/cell/setup.c | 1 - arch/powerpc/platforms/chrp/setup.c | 1 - arch/powerpc/platforms/maple/setup.c | 1 - arch/powerpc/platforms/maple/time.c | 1 - arch/powerpc/platforms/powermac/setup.c | 1 - arch/powerpc/platforms/powermac/time.c | 1 - arch/powerpc/platforms/pseries/setup.c | 1 - arch/powerpc/sysdev/cpm2.c | 1 - arch/powerpc/xmon/xmon.c | 1 - arch/riscv/kernel/setup.c | 1 - arch/riscv/mm/init.c | 1 - arch/s390/include/asm/tlbflush.h | 1 - 
arch/s390/kernel/machine_kexec.c | 1 - arch/s390/kernel/ptrace.c | 1 - arch/s390/kernel/vdso.c | 1 - arch/s390/mm/dump_pagetables.c | 1 - arch/s390/mm/fault.c | 1 - arch/s390/mm/init.c | 1 - arch/s390/mm/pageattr.c | 1 - arch/s390/mm/pgtable.c | 1 - arch/s390/mm/vmem.c | 1 - arch/sh/kernel/machine_kexec.c | 1 - arch/sh/kernel/ptrace_32.c | 1 - arch/sh/kernel/signal_32.c | 1 - arch/sh/mm/cache-sh3.c | 1 - arch/sh/mm/cache-sh4.c | 1 - arch/sh/mm/cache-sh7705.c | 1 - arch/sh/mm/nommu.c | 1 - arch/sparc/kernel/leon_smp.c | 1 - arch/sparc/kernel/process_32.c | 1 - arch/sparc/kernel/process_64.c | 1 - arch/sparc/kernel/ptrace_32.c | 1 - arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/setup_32.c | 1 - arch/sparc/kernel/setup_64.c | 1 - arch/sparc/kernel/signal32.c | 1 - arch/sparc/kernel/signal_32.c | 1 - arch/sparc/kernel/signal_64.c | 1 - arch/sparc/kernel/smp_32.c | 1 - arch/sparc/kernel/smp_64.c | 1 - arch/sparc/kernel/traps_64.c | 1 - arch/sparc/mm/fault_32.c | 1 - arch/sparc/mm/fault_64.c | 1 - arch/sparc/mm/hugetlbpage.c | 1 - arch/sparc/mm/init_32.c | 1 - arch/sparc/mm/init_64.c | 1 - arch/sparc/mm/io-unit.c | 1 - arch/sparc/mm/iommu.c | 1 - arch/sparc/mm/tlb.c | 1 - arch/um/kernel/process.c | 1 - arch/um/kernel/skas/mmu.c | 1 - arch/um/kernel/skas/uaccess.c | 1 - arch/um/kernel/tlb.c | 1 - arch/um/kernel/trap.c | 1 - arch/um/kernel/um_arch.c | 1 - arch/unicore32/kernel/module.c | 1 - arch/unicore32/mm/fault.c | 1 - arch/x86/include/asm/iomap.h | 1 - arch/x86/include/asm/xen/page.h | 1 - arch/x86/kernel/alternative.c | 1 - arch/x86/kernel/amd_gart_64.c | 1 - arch/x86/kernel/doublefault_32.c | 1 - arch/x86/kernel/machine_kexec_32.c | 1 - arch/x86/kernel/machine_kexec_64.c | 1 - arch/x86/kernel/module.c | 1 - arch/x86/kernel/process_32.c | 1 - arch/x86/kernel/process_64.c | 1 - arch/x86/kernel/ptrace.c | 1 - arch/x86/kernel/tboot.c | 1 - arch/x86/mm/dump_pagetables.c | 1 - arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 1 - arch/x86/mm/kasan_init_64.c | 1 - arch/x86/mm/pat/cpa-test.c | 1 - arch/x86/mm/pat/memtype.c | 1 - arch/x86/mm/pgtable.c | 1 - arch/x86/mm/pgtable_32.c | 1 - arch/x86/mm/pti.c | 1 - arch/x86/platform/efi/efi_64.c | 1 - arch/x86/xen/enlighten_pv.c | 1 - arch/x86/xen/grant-table.c | 1 - arch/xtensa/kernel/process.c | 1 - arch/xtensa/kernel/ptrace.c | 1 - arch/xtensa/kernel/setup.c | 1 - drivers/char/agp/frontend.c | 1 - drivers/char/agp/generic.c | 1 - drivers/char/bsr.c | 1 - drivers/char/mspec.c | 1 - drivers/gpu/drm/i915/i915_mm.c | 1 - drivers/infiniband/sw/rdmavt/mmap.c | 1 - drivers/infiniband/sw/rxe/rxe_mmap.c | 1 - drivers/media/platform/davinci/vpbe_display.c | 1 - drivers/media/v4l2-core/v4l2-common.c | 1 - drivers/misc/sgi-gru/grufault.c | 1 - drivers/net/ethernet/sun/sunhme.c | 1 - drivers/sbus/char/flash.c | 1 - drivers/sbus/char/uctrl.c | 1 - drivers/scsi/a2091.c | 1 - drivers/scsi/a3000.c | 1 - drivers/scsi/gvp11.c | 1 - drivers/scsi/lasi700.c | 1 - drivers/scsi/mvme147.c | 1 - drivers/scsi/sni_53c710.c | 1 - drivers/video/console/newport_con.c | 1 - drivers/video/fbdev/acornfb.c | 1 - drivers/video/fbdev/atafb.c | 1 - drivers/video/fbdev/cirrusfb.c | 1 - drivers/video/fbdev/cyber2000fb.c | 1 - drivers/video/fbdev/fb-puv3.c | 1 - drivers/video/fbdev/hitfb.c | 1 - drivers/video/fbdev/neofb.c | 1 - drivers/video/fbdev/q40fb.c | 1 - drivers/video/fbdev/savage/savagefb_driver.c | 1 - drivers/xen/balloon.c | 1 - drivers/xen/grant-table.c | 1 - drivers/xen/privcmd.c | 1 - drivers/xen/xenbus/xenbus_probe.c | 1 - drivers/xen/xenbus/xenbus_probe_backend.c | 
 drivers/xen/xenbus/xenbus_probe_frontend.c | 1 -
 fs/proc/array.c | 1 -
 fs/proc/meminfo.c | 1 -
 fs/proc/nommu.c | 1 -
 fs/proc/vmcore.c | 1 -
 include/linux/dax.h | 1 -
 init/init_task.c | 1 -
 kernel/exit.c | 1 -
 kernel/fork.c | 1 -
 kernel/power/snapshot.c | 1 -
 lib/ioremap.c | 1 -
 mm/debug_vm_pgtable.c | 1 -
 mm/gup.c | 1 -
 mm/hugetlb.c | 1 -
 mm/memory.c | 1 -
 mm/page_io.c | 1 -
 mm/shmem.c | 1 -
 mm/sparse-vmemmap.c | 1 -
 mm/sparse.c | 1 -
 mm/swap_state.c | 1 -
 mm/swapfile.c | 1 -
 mm/vmacache.c | 1 -
 sound/core/sgbuf.c | 1 -
 virt/kvm/kvm_main.c | 1 -
 319 files changed, 319 deletions(-)
(limited to 'arch/x86/include')

diff --git a/arch/alpha/boot/bootp.c b/arch/alpha/boot/bootp.c
index 95c0359f4858..00266e6e1b71 100644
--- a/arch/alpha/boot/bootp.c
+++ b/arch/alpha/boot/bootp.c
@@ -16,7 +16,6 @@
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/boot/bootpz.c b/arch/alpha/boot/bootpz.c
index 99b8d7dc344b..43af71835adf 100644
--- a/arch/alpha/boot/bootpz.c
+++ b/arch/alpha/boot/bootpz.c
@@ -18,7 +18,6 @@
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/boot/main.c b/arch/alpha/boot/main.c
index 8f5ed8610970..e5347a080008 100644
--- a/arch/alpha/boot/main.c
+++ b/arch/alpha/boot/main.c
@@ -14,7 +14,6 @@
 #include
 #include
-#include
 #include
diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h
index d1ed5a8133c5..13bea465f1c0 100644
--- a/arch/alpha/include/asm/io.h
+++ b/arch/alpha/include/asm/io.h
@@ -7,7 +7,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 48b81d015d8a..b45f0b0d6511 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -37,7 +37,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c
index cb8d599e72d6..8c43212ae38e 100644
--- a/arch/alpha/kernel/ptrace.c
+++ b/arch/alpha/kernel/ptrace.c
@@ -19,7 +19,6 @@
 #include
 #include
-#include
 #include
 #include "proto.h"
diff --git a/arch/alpha/kernel/setup.c b/arch/alpha/kernel/setup.c
index 6fa802c495b4..f5c42a8fcf9c 100644
--- a/arch/alpha/kernel/setup.c
+++ b/arch/alpha/kernel/setup.c
@@ -55,7 +55,6 @@ static struct notifier_block alpha_panic_block = {
 };
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index 52995bf413fe..631cc17410d1 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -36,7 +36,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_alcor.c b/arch/alpha/kernel/sys_alcor.c
index ce5430056f65..e063b3857b3d 100644
--- a/arch/alpha/kernel/sys_alcor.c
+++ b/arch/alpha/kernel/sys_alcor.c
@@ -23,7 +23,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_cabriolet.c b/arch/alpha/kernel/sys_cabriolet.c
index 0aa6a27d0e2f..47459b73cdb7 100644
--- a/arch/alpha/kernel/sys_cabriolet.c
+++ b/arch/alpha/kernel/sys_cabriolet.c
@@ -23,7 +23,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index d33508621820..9fb445d7dca5 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -26,7 +26,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_eb64p.c b/arch/alpha/kernel/sys_eb64p.c
index 1cdfe55fb987..3c43fd347526 100644
--- a/arch/alpha/kernel/sys_eb64p.c
+++ b/arch/alpha/kernel/sys_eb64p.c
@@ -22,7 +22,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_eiger.c b/arch/alpha/kernel/sys_eiger.c
index 016f79251141..bf99dcfd40c4 100644
--- a/arch/alpha/kernel/sys_eiger.c
+++ b/arch/alpha/kernel/sys_eiger.c
@@ -23,7 +23,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c
index d0d44f543d77..0a2ab6cb18db 100644
--- a/arch/alpha/kernel/sys_jensen.c
+++ b/arch/alpha/kernel/sys_jensen.c
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 #include
 #include "proto.h"
diff --git a/arch/alpha/kernel/sys_marvel.c b/arch/alpha/kernel/sys_marvel.c
index 533899a4a1a1..83d6c53d6d4d 100644
--- a/arch/alpha/kernel/sys_marvel.c
+++ b/arch/alpha/kernel/sys_marvel.c
@@ -18,7 +18,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_miata.c b/arch/alpha/kernel/sys_miata.c
index 702292af2225..e1bee8f84c58 100644
--- a/arch/alpha/kernel/sys_miata.c
+++ b/arch/alpha/kernel/sys_miata.c
@@ -22,7 +22,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_mikasa.c b/arch/alpha/kernel/sys_mikasa.c
index 3af4f94113e1..7690dfd57cb6 100644
--- a/arch/alpha/kernel/sys_mikasa.c
+++ b/arch/alpha/kernel/sys_mikasa.c
@@ -23,7 +23,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_nautilus.c b/arch/alpha/kernel/sys_nautilus.c
index 32850e45834b..53adf43dcd44 100644
--- a/arch/alpha/kernel/sys_nautilus.c
+++ b/arch/alpha/kernel/sys_nautilus.c
@@ -40,7 +40,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_noritake.c b/arch/alpha/kernel/sys_noritake.c
index b106f327f765..47f3ce4f719a 100644
--- a/arch/alpha/kernel/sys_noritake.c
+++ b/arch/alpha/kernel/sys_noritake.c
@@ -24,7 +24,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_rawhide.c b/arch/alpha/kernel/sys_rawhide.c
index b76f65d0e8b5..b5846ffdadce 100644
--- a/arch/alpha/kernel/sys_rawhide.c
+++ b/arch/alpha/kernel/sys_rawhide.c
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_ruffian.c b/arch/alpha/kernel/sys_ruffian.c
index d33074011960..4b1c8d85c4f0 100644
--- a/arch/alpha/kernel/sys_ruffian.c
+++ b/arch/alpha/kernel/sys_ruffian.c
@@ -23,7 +23,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_rx164.c b/arch/alpha/kernel/sys_rx164.c
index 4d85eaeb44aa..94046f9aea08 100644
--- a/arch/alpha/kernel/sys_rx164.c
+++ b/arch/alpha/kernel/sys_rx164.c
@@ -22,7 +22,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_sable.c b/arch/alpha/kernel/sys_sable.c
index 3cf0d32da5d8..930005b2f630 100644
--- a/arch/alpha/kernel/sys_sable.c
+++ b/arch/alpha/kernel/sys_sable.c
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_sio.c b/arch/alpha/kernel/sys_sio.c
index a6bdc1da47ad..7c420d8dac53 100644
--- a/arch/alpha/kernel/sys_sio.c
+++ b/arch/alpha/kernel/sys_sio.c
@@ -25,7 +25,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_sx164.c b/arch/alpha/kernel/sys_sx164.c
index 17cc203176c8..dd9de84b630c 100644
--- a/arch/alpha/kernel/sys_sx164.c
+++ b/arch/alpha/kernel/sys_sx164.c
@@ -22,7 +22,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_takara.c b/arch/alpha/kernel/sys_takara.c
index e230c6864088..9e2adb69bc74 100644
--- a/arch/alpha/kernel/sys_takara.c
+++ b/arch/alpha/kernel/sys_takara.c
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index c8390d8de140..b1f3b4fcf99b 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -26,7 +26,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/kernel/sys_wildfire.c b/arch/alpha/kernel/sys_wildfire.c
index 2191bde161fd..2c54d707142a 100644
--- a/arch/alpha/kernel/sys_wildfire.c
+++ b/arch/alpha/kernel/sys_wildfire.c
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 667cd21393b5..3c42b3147fd6 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -24,7 +24,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/arm/kernel/machine_kexec.c b/arch/arm/kernel/machine_kexec.c
index 76300f3813e8..974b6c64d3e6 100644
--- a/arch/arm/kernel/machine_kexec.c
+++ b/arch/arm/kernel/machine_kexec.c
@@ -10,7 +10,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index af0a8500a24e..e15444b25ca0 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -17,7 +17,6 @@
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c
index 4cc6a7eff635..d0f7c8896c96 100644
--- a/arch/arm/kernel/ptrace.c
+++ b/arch/arm/kernel/ptrace.c
@@ -25,7 +25,6 @@
 #include
 #include
-#include
 #include
 #define CREATE_TRACE_POINTS
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index 46e1be9e57a8..9a6432557871 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -37,7 +37,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/arch/arm/mach-ebsa110/core.c b/arch/arm/mach-ebsa110/core.c
index 575b2e2b6759..5960e3dfd2bf 100644
--- a/arch/arm/mach-ebsa110/core.c
+++ b/arch/arm/mach-ebsa110/core.c
@@ -17,7 +17,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
diff --git a/arch/arm/mach-footbridge/common.c b/arch/arm/mach-footbridge/common.c
index 015f75d1c98d..eee095f0e2f6 100644
--- a/arch/arm/mach-footbridge/common.c
+++ b/arch/arm/mach-footbridge/common.c
@@ -14,7 +14,6 @@
 #include
 #include