From 18c71ce9c8822d48d2b4c50242051535d46082ac Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 4 Dec 2017 10:57:23 -0600 Subject: x86/CPU/AMD: Add the Secure Encrypted Virtualization CPU feature MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the CPU features to include identifying and reporting on the Secure Encrypted Virtualization (SEV) feature. SEV is identified by CPUID 0x8000001f, but requires BIOS support to enable it (set bit 23 of MSR_K8_SYSCFG and set bit 0 of MSR_K7_HWCR). Only show the SEV feature as available if reported by CPUID and enabled by BIOS. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Borislav Petkov Cc: kvm@vger.kernel.org Cc: x86@kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 66 ++++++++++++++++++++++++++------------ arch/x86/kernel/cpu/scattered.c | 1 + 4 files changed, 50 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index c0b0e9e8aa66..19b955adacff 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -201,6 +201,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ #define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_SEV ( 7*32+11) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 34c4922bbc3f..507d3e30f7fe 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -382,6 +382,8 @@ #define MSR_K7_PERFCTR3 0xc0010007 #define MSR_K7_CLK_CTL 0xc001001b #define MSR_K7_HWCR 0xc0010015 +#define MSR_K7_HWCR_SMMLOCK_BIT 0 +#define MSR_K7_HWCR_SMMLOCK BIT_ULL(MSR_K7_HWCR_SMMLOCK_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d58184b7cd44..c1234aa0550c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -556,6 +556,51 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } } +static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) +{ + u64 msr; + + /* + * BIOS support is required for SME and SEV. + * For SME: If BIOS has enabled SME then adjust x86_phys_bits by + * the SME physical address space reduction value. + * If BIOS has not enabled SME then don't advertise the + * SME feature (set in scattered.c). + * For SEV: If BIOS has not enabled SEV then don't advertise the + * SEV feature (set in scattered.c). + * + * In all cases, since support for SME and SEV requires long mode, + * don't advertise the feature under CONFIG_X86_32. + */ + if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { + /* Check if memory encryption is enabled */ + rdmsrl(MSR_K8_SYSCFG, msr); + if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT)) + goto clear_all; + + /* + * Always adjust physical address bits. Even though this + * will be a value above 32-bits this is still done for + * CONFIG_X86_32 so that accurate values are reported. 
+ */ + c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; + + if (IS_ENABLED(CONFIG_X86_32)) + goto clear_all; + + rdmsrl(MSR_K7_HWCR, msr); + if (!(msr & MSR_K7_HWCR_SMMLOCK)) + goto clear_sev; + + return; + +clear_all: + clear_cpu_cap(c, X86_FEATURE_SME); +clear_sev: + clear_cpu_cap(c, X86_FEATURE_SEV); + } +} + static void early_init_amd(struct cpuinfo_x86 *c) { u32 dummy; @@ -627,26 +672,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (cpu_has_amd_erratum(c, amd_erratum_400)) set_cpu_bug(c, X86_BUG_AMD_E400); - /* - * BIOS support is required for SME. If BIOS has enabled SME then - * adjust x86_phys_bits by the SME physical address space reduction - * value. If BIOS has not enabled SME then don't advertise the - * feature (set in scattered.c). Also, since the SME support requires - * long mode, don't advertise the feature under CONFIG_X86_32. - */ - if (cpu_has(c, X86_FEATURE_SME)) { - u64 msr; - - /* Check if SME is enabled */ - rdmsrl(MSR_K8_SYSCFG, msr); - if (msr & MSR_K8_SYSCFG_MEM_ENCRYPT) { - c->x86_phys_bits -= (cpuid_ebx(0x8000001f) >> 6) & 0x3f; - if (IS_ENABLED(CONFIG_X86_32)) - clear_cpu_cap(c, X86_FEATURE_SME); - } else { - clear_cpu_cap(c, X86_FEATURE_SME); - } - } + early_detect_mem_encrypt(c); } static void init_amd_k8(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 05459ad3db46..63a78d5fe505 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -32,6 +32,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, { X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 }, + { X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 }, { 0, 0, 0, 0, 0 } }; -- cgit v1.2.3 From cea3a19b007a690a40fac373f22bf34ea69761a2 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 4 Dec 2017 10:57:24 -0600 Subject: kvm: svm: prepare for new bit definition in nested_ctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the nested_ctl variable in the vmcb_control_area structure is used to indicate nested paging support. The nested paging support field is actually defined as bit 0 of the field. In order to support a new feature flag the usage of the nested_ctl and nested paging support must be converted to operate on a single bit. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/svm.h | 2 ++ arch/x86/kvm/svm.c | 7 ++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 78dd9df88157..c936c9824405 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -146,6 +146,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_VM_CR_SVM_LOCK_MASK 0x0008ULL #define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL +#define SVM_NESTED_CTL_NP_ENABLE BIT(0) + struct __attribute__ ((__packed__)) vmcb_seg { u16 selector; u16 attrib; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 59e13a79c2e3..1c8158a6ee06 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1293,7 +1293,7 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ - control->nested_ctl = 1; + control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); @@ -2918,7 +2918,8 @@ static bool nested_vmcb_checks(struct vmcb *vmcb) if (vmcb->control.asid == 0) return false; - if (vmcb->control.nested_ctl && !npt_enabled) + if ((vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) && + !npt_enabled) return false; return true; @@ -2932,7 +2933,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; - if (nested_vmcb->control.nested_ctl) { + if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) { kvm_mmu_unload(&svm->vcpu); svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3; nested_svm_init_mmu_context(&svm->vcpu); -- cgit v1.2.3 From ba7c3398dc7eedbbe9d19a17ce3ab98349079fc7 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 4 Dec 2017 10:57:24 -0600 Subject: kvm: svm: Add SEV feature definitions to KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define the SEV enable bit for the VMCB control structure. The hypervisor will use this bit to enable SEV in the guest. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Tom Lendacky Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/svm.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index c936c9824405..0487ac054870 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -147,6 +147,7 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define SVM_VM_CR_SVM_DIS_MASK 0x0010ULL #define SVM_NESTED_CTL_NP_ENABLE BIT(0) +#define SVM_NESTED_CTL_SEV_ENABLE BIT(1) struct __attribute__ ((__packed__)) vmcb_seg { u16 selector; -- cgit v1.2.3 From 4faefff3241e02dd66bfb77bd7ca40f3aa08f62e Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:25 -0600 Subject: KVM: SVM: Prepare to reserve asid for SEV guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, ASID allocation start at 1. 
Add a svm_vcpu_data.min_asid which allows supplying a dynamic start ASID. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Paolo Bonzini Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1c8158a6ee06..e403cf1f6ba3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -525,6 +525,7 @@ struct svm_cpu_data { u64 asid_generation; u32 max_asid; u32 next_asid; + u32 min_asid; struct kvm_ldttss_desc *tss_desc; struct page *save_area; @@ -782,6 +783,7 @@ static int svm_hardware_enable(void) sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; + sd->min_asid = 1; gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); @@ -2088,7 +2090,7 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) { if (sd->next_asid > sd->max_asid) { ++sd->asid_generation; - sd->next_asid = 1; + sd->next_asid = sd->min_asid; svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; } -- cgit v1.2.3 From 8765d75329a386dd7742f94a1ea5fdcdea8d93d0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:25 -0600 Subject: KVM: X86: Extend CPUID range to include new leaf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This CPUID leaf provides the memory encryption support information on AMD Platform. Its complete description is available in APM volume 2, Section 15.34 Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh --- arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/svm.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0099e10eb045..c6473ca825cd 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -604,7 +604,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->edx = 0; break; case 0x80000000: - entry->eax = min(entry->eax, 0x8000001a); + entry->eax = min(entry->eax, 0x8000001f); break; case 0x80000001: entry->edx &= kvm_cpuid_8000_0001_edx_x86_features; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e403cf1f6ba3..7e302a25bc45 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5170,6 +5170,12 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) entry->edx |= SVM_FEATURE_NPT; break; + case 0x8000001F: + /* Support memory encryption cpuid if host supports it */ + if (boot_cpu_has(X86_FEATURE_SEV)) + cpuid(0x8000001f, &entry->eax, &entry->ebx, + &entry->ecx, &entry->edx); + } } -- cgit v1.2.3 From 5acc5c063196b4a531a761a954023c1848ec832b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:26 -0600 Subject: KVM: Introduce KVM_MEMORY_ENCRYPT_OP ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the hardware supports memory encryption then the KVM_MEMORY_ENCRYPT_OP ioctl can be used by qemu to issue a platform specific memory encryption commands. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Paolo Bonzini Reviewed-by: Borislav Petkov --- Documentation/virtual/kvm/api.txt | 16 ++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 6 ++++++ include/uapi/linux/kvm.h | 2 ++ 4 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index f670e4b9e7f3..c8755be35543 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3394,6 +3394,22 @@ invalid, if invalid pages are written to (e.g. after the end of memory) or if no page table is present for the addresses (e.g. when using hugepages). +4.109 KVM_MEMORY_ENCRYPT_OP + +Capability: basic +Architectures: x86 +Type: system +Parameters: an opaque platform specific structure (in/out) +Returns: 0 on success; -1 on error + +If the platform supports creating encrypted VMs then this ioctl can be used +for issuing platform-specific memory encryption commands to manage those +encrypted VMs. + +Currently, this ioctl is used for issuing Secure Encrypted Virtualization +(SEV) commands on AMD Processors. The SEV commands are defined in +Documentation/virtual/kvm/amd-memory-encryption.txt. + 5. 
The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1bfb99770c34..c87e214d55df 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1066,6 +1066,8 @@ struct kvm_x86_ops { int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); int (*enable_smi_window)(struct kvm_vcpu *vcpu); + + int (*mem_enc_op)(struct kvm *kvm, void __user *argp); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34c85aa2e2d1..7bbed0c0ba79 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4281,6 +4281,12 @@ set_identity_unlock: r = kvm_vm_ioctl_enable_cap(kvm, &cap); break; } + case KVM_MEMORY_ENCRYPT_OP: { + r = -ENOTTY; + if (kvm_x86_ops->mem_enc_op) + r = kvm_x86_ops->mem_enc_op(kvm, argp); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 282d7613fce8..addd0cf4445f 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1358,6 +1358,8 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_S390_CMMA_MIGRATION */ #define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log) #define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) +/* Memory Encryption Commands */ +#define KVM_MEMORY_ENCRYPT_OP _IOWR(KVMIO, 0xba, unsigned long) #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -- cgit v1.2.3 From 69eaedee411c1fc1cf123520897a96b7cf04d8a0 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:26 -0600 Subject: KVM: Introduce KVM_MEMORY_ENCRYPT_{UN,}REG_REGION ioctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If hardware supports memory encryption then KVM_MEMORY_ENCRYPT_REG_REGION and KVM_MEMORY_ENCRYPT_UNREG_REGION ioctl's can be used by userspace to register/unregister the guest memory regions which may contain the encrypted data (e.g guest RAM, PCI BAR, SMRAM etc). Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- Documentation/virtual/kvm/api.txt | 34 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 24 ++++++++++++++++++++++++ include/uapi/linux/kvm.h | 8 ++++++++ 4 files changed, 68 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index c8755be35543..c2ced6a44bbb 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3410,6 +3410,40 @@ Currently, this ioctl is used for issuing Secure Encrypted Virtualization (SEV) commands on AMD Processors. The SEV commands are defined in Documentation/virtual/kvm/amd-memory-encryption.txt. +4.110 KVM_MEMORY_ENCRYPT_REG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to register a guest memory region which may +contain encrypted data (e.g. guest RAM, SMRAM etc). + +It is used in the SEV-enabled guest. When encryption is enabled, a guest +memory region may contain encrypted data. 
The SEV memory encryption +engine uses a tweak such that two identical plaintext pages, each at +different locations will have differing ciphertexts. So swapping or +moving ciphertext of those pages will not result in plaintext being +swapped. So relocating (or migrating) physical backing pages for the SEV +guest will require some additional steps. + +Note: The current SEV key management spec does not provide commands to +swap or migrate (move) ciphertext pages. Hence, for now we pin the guest +memory region registered with the ioctl. + +4.111 KVM_MEMORY_ENCRYPT_UNREG_REGION + +Capability: basic +Architectures: x86 +Type: system +Parameters: struct kvm_enc_region (in) +Returns: 0 on success; -1 on error + +This ioctl can be used to unregister the guest memory region registered +with KVM_MEMORY_ENCRYPT_REG_REGION ioctl above. + 5. The kvm_run structure ------------------------ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c87e214d55df..58b7cc30466b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1068,6 +1068,8 @@ struct kvm_x86_ops { int (*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); + int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); + int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7bbed0c0ba79..926f55cecf2e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4287,6 +4287,30 @@ set_identity_unlock: r = kvm_x86_ops->mem_enc_op(kvm, argp); break; } + case KVM_MEMORY_ENCRYPT_REG_REGION: { + struct kvm_enc_region region; + + r = -EFAULT; + if (copy_from_user(®ion, argp, sizeof(region))) + goto out; + + r = -ENOTTY; + if (kvm_x86_ops->mem_enc_reg_region) + r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion); + break; + } + case KVM_MEMORY_ENCRYPT_UNREG_REGION: { + struct kvm_enc_region region; + + r = -EFAULT; + if (copy_from_user(®ion, argp, sizeof(region))) + goto out; + + r = -ENOTTY; + if (kvm_x86_ops->mem_enc_unreg_region) + r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion); + break; + } default: r = -ENOTTY; } diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index addd0cf4445f..c8c65190907d 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1361,6 +1361,14 @@ struct kvm_s390_ucas_mapping { /* Memory Encryption Commands */ #define KVM_MEMORY_ENCRYPT_OP _IOWR(KVMIO, 0xba, unsigned long) +struct kvm_enc_region { + __u64 addr; + __u64 size; +}; + +#define KVM_MEMORY_ENCRYPT_REG_REGION _IOR(KVMIO, 0xbb, struct kvm_enc_region) +#define KVM_MEMORY_ENCRYPT_UNREG_REGION _IOR(KVMIO, 0xbc, struct kvm_enc_region) + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) -- cgit v1.2.3 From 5dd0a57cf38eeb8b6be1d9c3df9add2f5756d974 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:32 -0600 Subject: KVM: X86: Add CONFIG_KVM_AMD_SEV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The config option can be used to enable SEV support on AMD Processors. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/Kconfig | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 3df51c287844..148ea324b90c 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -81,6 +81,16 @@ config KVM_AMD To compile this as a module, choose M here: the module will be called kvm-amd. +config KVM_AMD_SEV + def_bool y + bool "AMD Secure Encrypted Virtualization (SEV) support" + depends on KVM_AMD && X86_64 + select CRYPTO_DEV_CCP + select CRYPTO_DEV_CCP_DD + select CRYPTO_DEV_SP_PSP + ---help--- + Provides support for launching Encrypted VMs on AMD processors. + config KVM_MMU_AUDIT bool "Audit KVM MMU" depends on KVM && TRACEPOINTS -- cgit v1.2.3 From ed3cd233f8074015e164547abfcdd4678fd10bb4 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:32 -0600 Subject: KVM: SVM: Reserve ASID range for SEV guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A SEV-enabled guest must use ASIDs from the defined subset, while non-SEV guests can use the remaining ASID range. The range of allowed SEV guest ASIDs is [1 - CPUID_8000_001F[ECX][31:0]]. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7e302a25bc45..bff7ce727a78 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -319,6 +319,8 @@ enum { #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL +static unsigned int max_sev_asid; + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -783,7 +785,7 @@ static int svm_hardware_enable(void) sd->asid_generation = 1; sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - sd->min_asid = 1; + sd->min_asid = max_sev_asid + 1; gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); -- cgit v1.2.3 From e9df09428996fcdc43e2b0db2a0e8b38198931c4 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:33 -0600 Subject: KVM: SVM: Add sev module_param MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The module parameter can be used to control the SEV feature support. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bff7ce727a78..ca63642517a7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -284,6 +285,10 @@ module_param(vls, int, 0444); static int vgif = true; module_param(vgif, int, 0444); +/* enable/disable SEV support */ +static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); +module_param(sev, int, 0444); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -1049,6 +1054,39 @@ static int avic_ga_log_notifier(u32 ga_tag) return 0; } +static __init int sev_hardware_setup(void) +{ + struct sev_user_data_status *status; + int rc; + + /* Maximum number of encrypted guests supported simultaneously */ + max_sev_asid = cpuid_ecx(0x8000001F); + + if (!max_sev_asid) + return 1; + + status = kmalloc(sizeof(*status), GFP_KERNEL); + if (!status) + return 1; + + /* + * Check SEV platform status. + * + * PLATFORM_STATUS can be called in any state, if we failed to query + * the PLATFORM status then either PSP firmware does not support SEV + * feature or SEV firmware is dead. + */ + rc = sev_platform_status(status, NULL); + if (rc) + goto err; + + pr_info("SEV supported\n"); + +err: + kfree(status); + return rc; +} + static __init int svm_hardware_setup(void) { int cpu; @@ -1084,6 +1122,17 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); } + if (sev) { + if (boot_cpu_has(X86_FEATURE_SEV) && + IS_ENABLED(CONFIG_KVM_AMD_SEV)) { + r = sev_hardware_setup(); + if (r) + sev = false; + } else { + sev = false; + } + } + for_each_possible_cpu(cpu) { r = svm_cpu_init(cpu); if (r) -- cgit v1.2.3 From 1654efcbc431a369397a20bf85e45870d15c8689 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:34 -0600 Subject: KVM: SVM: Add KVM_SEV_INIT command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command initializes the SEV platform context and allocates a new ASID for this guest from the SEV ASID pool. The firmware must be initialized before we issue any guest launch commands to create a new memory encryption context. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/kvm_host.h | 7 +++ arch/x86/kvm/svm.c | 132 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 138 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 58b7cc30466b..384dcfda43cc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -747,6 +747,11 @@ enum kvm_irqchip_mode { KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; +struct kvm_sev_info { + bool active; /* SEV enabled guest */ + unsigned int asid; /* ASID used for this guest */ +}; + struct kvm_arch { unsigned int n_used_mmu_pages; unsigned int n_requested_mmu_pages; @@ -834,6 +839,8 @@ struct kvm_arch { bool x2apic_format; bool x2apic_broadcast_quirk_disabled; + + struct kvm_sev_info sev_info; }; struct kvm_vm_stat { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ca63642517a7..51186107eb22 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -325,6 +326,20 @@ enum { #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL static unsigned int max_sev_asid; +static unsigned int min_sev_asid; +static unsigned long *sev_asid_bitmap; + +static inline bool svm_sev_enabled(void) +{ + return max_sev_asid; +} + +static inline bool sev_guest(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + return sev->active; +} static inline void mark_all_dirty(struct vmcb *vmcb) { @@ -1065,6 +1080,15 @@ static __init int sev_hardware_setup(void) if (!max_sev_asid) return 1; + /* Minimum ASID value that should be used for SEV guest */ + min_sev_asid = cpuid_edx(0x8000001F); + + /* Initialize SEV ASID bitmap */ + sev_asid_bitmap = kcalloc(BITS_TO_LONGS(max_sev_asid), + sizeof(unsigned long), GFP_KERNEL); + if (!sev_asid_bitmap) + return 1; + status = kmalloc(sizeof(*status), GFP_KERNEL); if (!status) return 1; @@ -1194,6 +1218,9 @@ static __exit void svm_hardware_unsetup(void) { int cpu; + if (svm_sev_enabled()) + kfree(sev_asid_bitmap); + for_each_possible_cpu(cpu) svm_cpu_uninit(cpu); @@ -1384,6 +1411,9 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } + if (sev_guest(svm->vcpu.kvm)) + svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; + mark_all_dirty(svm->vmcb); enable_gif(svm); @@ -1466,6 +1496,29 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) return 0; } +static void __sev_asid_free(int asid) +{ + int pos; + + pos = asid - 1; + clear_bit(pos, sev_asid_bitmap); +} + +static void sev_asid_free(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + __sev_asid_free(sev->asid); +} + +static void sev_vm_destroy(struct kvm *kvm) +{ + if (!sev_guest(kvm)) + return; + + sev_asid_free(kvm); +} + static void avic_vm_destroy(struct kvm *kvm) { unsigned long flags; @@ -1484,6 +1537,12 @@ static void avic_vm_destroy(struct kvm *kvm) spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); } +static void svm_vm_destroy(struct kvm *kvm) +{ + avic_vm_destroy(kvm); + sev_vm_destroy(kvm); +} + static int avic_vm_init(struct kvm *kvm) { unsigned long flags; @@ -5556,6 +5615,75 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) return 0; } +static int sev_asid_new(void) +{ + int 
pos; + + /* + * SEV-enabled guest must use asid from min_sev_asid to max_sev_asid. + */ + pos = find_next_zero_bit(sev_asid_bitmap, max_sev_asid, min_sev_asid - 1); + if (pos >= max_sev_asid) + return -EBUSY; + + set_bit(pos, sev_asid_bitmap); + return pos + 1; +} + +static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + int asid, ret; + + ret = -EBUSY; + asid = sev_asid_new(); + if (asid < 0) + return ret; + + ret = sev_platform_init(&argp->error); + if (ret) + goto e_free; + + sev->active = true; + sev->asid = asid; + + return 0; + +e_free: + __sev_asid_free(asid); + return ret; +} + +static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) +{ + struct kvm_sev_cmd sev_cmd; + int r; + + if (!svm_sev_enabled()) + return -ENOTTY; + + if (copy_from_user(&sev_cmd, argp, sizeof(struct kvm_sev_cmd))) + return -EFAULT; + + mutex_lock(&kvm->lock); + + switch (sev_cmd.id) { + case KVM_SEV_INIT: + r = sev_guest_init(kvm, &sev_cmd); + break; + default: + r = -EINVAL; + goto out; + } + + if (copy_to_user(argp, &sev_cmd, sizeof(struct kvm_sev_cmd))) + r = -EFAULT; + +out: + mutex_unlock(&kvm->lock); + return r; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5572,7 +5700,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .vcpu_reset = svm_vcpu_reset, .vm_init = avic_vm_init, - .vm_destroy = avic_vm_destroy, + .vm_destroy = svm_vm_destroy, .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, @@ -5671,6 +5799,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pre_enter_smm = svm_pre_enter_smm, .pre_leave_smm = svm_pre_leave_smm, .enable_smi_window = enable_smi_window, + + .mem_enc_op = svm_mem_enc_op, }; static int __init svm_init(void) -- cgit v1.2.3 From 70cd94e60c733e3afc18b0e6aab789c13b5571da Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:34 -0600 Subject: KVM: SVM: VMRUN should use associated ASID when SEV is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SEV hardware uses ASIDs to associate a memory encryption key with a guest VM. During guest creation, a SEV VM uses the SEV_CMD_ACTIVATE command to bind a particular ASID to the guest. Lets make sure that the VMCB is programmed with the bound ASID before a VMRUN. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 51186107eb22..cdbdc86d7aee 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -213,6 +213,9 @@ struct vcpu_svm { */ struct list_head ir_list; spinlock_t ir_list_lock; + + /* which host CPU was used for running this vcpu */ + unsigned int last_cpu; }; /* @@ -341,6 +344,13 @@ static inline bool sev_guest(struct kvm *kvm) return sev->active; } +static inline int sev_get_asid(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + return sev->asid; +} + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -551,6 +561,9 @@ struct svm_cpu_data { struct kvm_ldttss_desc *tss_desc; struct page *save_area; + + /* index = sev_asid, value = vmcb pointer */ + struct vmcb **sev_vmcbs; }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); @@ -864,6 +877,7 @@ static void svm_cpu_uninit(int cpu) return; per_cpu(svm_data, raw_smp_processor_id()) = NULL; + kfree(sd->sev_vmcbs); __free_page(sd->save_area); kfree(sd); } @@ -877,11 +891,18 @@ static int svm_cpu_init(int cpu) if (!sd) return -ENOMEM; sd->cpu = cpu; - sd->save_area = alloc_page(GFP_KERNEL); r = -ENOMEM; + sd->save_area = alloc_page(GFP_KERNEL); if (!sd->save_area) goto err_1; + if (svm_sev_enabled()) { + r = -ENOMEM; + sd->sev_vmcbs = kmalloc((max_sev_asid + 1) * sizeof(void *), GFP_KERNEL); + if (!sd->sev_vmcbs) + goto err_1; + } + per_cpu(svm_data, cpu) = sd; return 0; @@ -1498,10 +1519,16 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) static void __sev_asid_free(int asid) { - int pos; + struct svm_cpu_data *sd; + int cpu, pos; pos = asid - 1; clear_bit(pos, sev_asid_bitmap); + + for_each_possible_cpu(cpu) { + sd = per_cpu(svm_data, cpu); + sd->sev_vmcbs[pos] = NULL; + } } static void sev_asid_free(struct kvm *kvm) @@ -4466,12 +4493,39 @@ static void reload_tss(struct kvm_vcpu *vcpu) load_TR_desc(); } +static void pre_sev_run(struct vcpu_svm *svm, int cpu) +{ + struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + int asid = sev_get_asid(svm->vcpu.kvm); + + /* Assign the asid allocated with this SEV guest */ + svm->vmcb->control.asid = asid; + + /* + * Flush guest TLB: + * + * 1) when different VMCB for the same ASID is to be run on the same host CPU. + * 2) or this VMCB was executed on different host CPU in previous VMRUNs. 
+ */ + if (sd->sev_vmcbs[asid] == svm->vmcb && + svm->last_cpu == cpu) + return; + + svm->last_cpu = cpu; + sd->sev_vmcbs[asid] = svm->vmcb; + svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; + mark_dirty(svm->vmcb, VMCB_ASID); +} + static void pre_svm_run(struct vcpu_svm *svm) { int cpu = raw_smp_processor_id(); struct svm_cpu_data *sd = per_cpu(svm_data, cpu); + if (sev_guest(svm->vcpu.kvm)) + return pre_sev_run(svm, cpu); + /* FIXME: handle wraparound of asid_generation */ if (svm->asid_generation != sd->asid_generation) new_asid(svm, sd); -- cgit v1.2.3 From 59414c989220825f970f38dbcbf11f18e817d73c Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:35 -0600 Subject: KVM: SVM: Add support for KVM_SEV_LAUNCH_START command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The KVM_SEV_LAUNCH_START command is used to create a memory encryption context within the SEV firmware. In order to do so, the guest owner should provide the guest's policy, its public Diffie-Hellman (PDH) key and session information. The command implements the LAUNCH_START flow defined in SEV spec Section 6.2. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/kvm_host.h | 2 + arch/x86/kvm/svm.c | 153 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 155 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 384dcfda43cc..0ea890375532 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -750,6 +750,8 @@ enum kvm_irqchip_mode { struct kvm_sev_info { bool active; /* SEV enabled guest */ unsigned int asid; /* ASID used for this guest */ + unsigned int handle; /* SEV firmware handle */ + int fd; /* SEV device fd */ }; struct kvm_arch { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index cdbdc86d7aee..e5b712e55186 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1538,11 +1538,45 @@ static void sev_asid_free(struct kvm *kvm) __sev_asid_free(sev->asid); } +static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) +{ + struct sev_data_decommission *decommission; + struct sev_data_deactivate *data; + + if (!handle) + return; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + /* deactivate handle */ + data->handle = handle; + sev_guest_deactivate(data, NULL); + + wbinvd_on_all_cpus(); + sev_guest_df_flush(NULL); + kfree(data); + + decommission = kzalloc(sizeof(*decommission), GFP_KERNEL); + if (!decommission) + return; + + /* decommission handle */ + decommission->handle = handle; + sev_guest_decommission(decommission, NULL); + + kfree(decommission); +} + static void sev_vm_destroy(struct kvm *kvm) { + struct kvm_sev_info *sev = &kvm->arch.sev_info; + if (!sev_guest(kvm)) return; + sev_unbind_asid(kvm, sev->handle); sev_asid_free(kvm); } @@ -5708,6 +5742,122 @@ e_free: return ret; } +static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) +{ + struct sev_data_activate *data; + int asid = sev_get_asid(kvm); + int ret; + + wbinvd_on_all_cpus(); + + ret = sev_guest_df_flush(error); + if (ret) + return ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* activate ASID on the 
given handle */ + data->handle = handle; + data->asid = asid; + ret = sev_guest_activate(data, error); + kfree(data); + + return ret; +} + +static int sev_issue_cmd(int fd, int id, void *data, int *error) +{ + struct fd f; + int ret; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = sev_issue_cmd_external_user(f.file, id, data, error); + + fdput(f); + return ret; +} + +static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_start *start; + struct kvm_sev_launch_start params; + void *dh_blob, *session_blob; + int *error = &argp->error; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + start = kzalloc(sizeof(*start), GFP_KERNEL); + if (!start) + return -ENOMEM; + + dh_blob = NULL; + if (params.dh_uaddr) { + dh_blob = psp_copy_user_blob(params.dh_uaddr, params.dh_len); + if (IS_ERR(dh_blob)) { + ret = PTR_ERR(dh_blob); + goto e_free; + } + + start->dh_cert_address = __sme_set(__pa(dh_blob)); + start->dh_cert_len = params.dh_len; + } + + session_blob = NULL; + if (params.session_uaddr) { + session_blob = psp_copy_user_blob(params.session_uaddr, params.session_len); + if (IS_ERR(session_blob)) { + ret = PTR_ERR(session_blob); + goto e_free_dh; + } + + start->session_address = __sme_set(__pa(session_blob)); + start->session_len = params.session_len; + } + + start->handle = params.handle; + start->policy = params.policy; + + /* create memory encryption context */ + ret = sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); + if (ret) + goto e_free_session; + + /* Bind ASID to this guest */ + ret = sev_bind_asid(kvm, start->handle, error); + if (ret) + goto e_free_session; + + /* return handle to userspace */ + params.handle = start->handle; + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) { + sev_unbind_asid(kvm, start->handle); + ret = -EFAULT; + goto e_free_session; + } + + sev->handle = start->handle; + sev->fd = argp->sev_fd; + +e_free_session: + kfree(session_blob); +e_free_dh: + kfree(dh_blob); +e_free: + kfree(start); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -5725,6 +5875,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_INIT: r = sev_guest_init(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_START: + r = sev_launch_start(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 89c5058090528026f6542e8b12f4262e492bd3a2 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:35 -0600 Subject: KVM: SVM: Add support for KVM_SEV_LAUNCH_UPDATE_DATA command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for encrypting the guest memory region using the VM encryption key (VEK) created during KVM_SEV_LAUNCH_START. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Improvements-by: Borislav Petkov Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 191 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 190 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0ea890375532..a0b021f1fd05 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -752,6 +752,7 @@ struct kvm_sev_info { unsigned int asid; /* ASID used for this guest */ unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ + unsigned long pages_locked; /* Number of pages locked */ }; struct kvm_arch { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e5b712e55186..88951cbef3ec 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include @@ -331,6 +333,7 @@ enum { static unsigned int max_sev_asid; static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; +#define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) static inline bool svm_sev_enabled(void) { @@ -1569,6 +1572,83 @@ static void sev_unbind_asid(struct kvm *kvm, unsigned int handle) kfree(decommission); } +static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr, + unsigned long ulen, unsigned long *n, + int write) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + unsigned long npages, npinned, size; + unsigned long locked, lock_limit; + struct page **pages; + int first, last; + + /* Calculate number of pages. */ + first = (uaddr & PAGE_MASK) >> PAGE_SHIFT; + last = ((uaddr + ulen - 1) & PAGE_MASK) >> PAGE_SHIFT; + npages = (last - first + 1); + + locked = sev->pages_locked + npages; + lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) { + pr_err("SEV: %lu locked pages exceed the lock limit of %lu.\n", locked, lock_limit); + return NULL; + } + + /* Avoid using vmalloc for smaller buffers. */ + size = npages * sizeof(struct page *); + if (size > PAGE_SIZE) + pages = vmalloc(size); + else + pages = kmalloc(size, GFP_KERNEL); + + if (!pages) + return NULL; + + /* Pin the user virtual address. */ + npinned = get_user_pages_fast(uaddr, npages, write ? 
FOLL_WRITE : 0, pages); + if (npinned != npages) { + pr_err("SEV: Failure locking %lu pages.\n", npages); + goto err; + } + + *n = npages; + sev->pages_locked = locked; + + return pages; + +err: + if (npinned > 0) + release_pages(pages, npinned); + + kvfree(pages); + return NULL; +} + +static void sev_unpin_memory(struct kvm *kvm, struct page **pages, + unsigned long npages) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + release_pages(pages, npages); + kvfree(pages); + sev->pages_locked -= npages; +} + +static void sev_clflush_pages(struct page *pages[], unsigned long npages) +{ + uint8_t *page_virtual; + unsigned long i; + + if (npages == 0 || pages == NULL) + return; + + for (i = 0; i < npages; i++) { + page_virtual = kmap_atomic(pages[i]); + clflush_cache_range(page_virtual, PAGE_SIZE); + kunmap_atomic(page_virtual); + } +} + static void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &kvm->arch.sev_info; @@ -5767,7 +5847,7 @@ static int sev_bind_asid(struct kvm *kvm, unsigned int handle, int *error) return ret; } -static int sev_issue_cmd(int fd, int id, void *data, int *error) +static int __sev_issue_cmd(int fd, int id, void *data, int *error) { struct fd f; int ret; @@ -5782,6 +5862,13 @@ static int sev_issue_cmd(int fd, int id, void *data, int *error) return ret; } +static int sev_issue_cmd(struct kvm *kvm, int id, void *data, int *error) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + + return __sev_issue_cmd(sev->fd, id, data, error); +} + static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) { struct kvm_sev_info *sev = &kvm->arch.sev_info; @@ -5829,7 +5916,7 @@ static int sev_launch_start(struct kvm *kvm, struct kvm_sev_cmd *argp) start->policy = params.policy; /* create memory encryption context */ - ret = sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); + ret = __sev_issue_cmd(argp->sev_fd, SEV_CMD_LAUNCH_START, start, error); if (ret) goto e_free_session; @@ -5858,6 +5945,103 @@ e_free: return ret; } +static int get_num_contig_pages(int idx, struct page **inpages, + unsigned long npages) +{ + unsigned long paddr, next_paddr; + int i = idx + 1, pages = 1; + + /* find the number of contiguous pages starting from idx */ + paddr = __sme_page_pa(inpages[idx]); + while (i < npages) { + next_paddr = __sme_page_pa(inpages[i++]); + if ((paddr + PAGE_SIZE) == next_paddr) { + pages++; + paddr = next_paddr; + continue; + } + break; + } + + return pages; +} + +static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + unsigned long vaddr, vaddr_end, next_vaddr, npages, size; + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_launch_update_data params; + struct sev_data_launch_update_data *data; + struct page **inpages; + int i, ret, pages; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + vaddr = params.uaddr; + size = params.len; + vaddr_end = vaddr + size; + + /* Lock the user memory. */ + inpages = sev_pin_memory(kvm, vaddr, size, &npages, 1); + if (!inpages) { + ret = -ENOMEM; + goto e_free; + } + + /* + * The LAUNCH_UPDATE command will perform in-place encryption of the + * memory content (i.e it will write the same memory region with C=1). + * It's possible that the cache may contain the data with C=0, i.e., + * unencrypted so invalidate it first. 
+ */ + sev_clflush_pages(inpages, npages); + + for (i = 0; vaddr < vaddr_end; vaddr = next_vaddr, i += pages) { + int offset, len; + + /* + * If the user buffer is not page-aligned, calculate the offset + * within the page. + */ + offset = vaddr & (PAGE_SIZE - 1); + + /* Calculate the number of pages that can be encrypted in one go. */ + pages = get_num_contig_pages(i, inpages, npages); + + len = min_t(size_t, ((pages * PAGE_SIZE) - offset), size); + + data->handle = sev->handle; + data->len = len; + data->address = __sme_page_pa(inpages[i]) + offset; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_DATA, data, &argp->error); + if (ret) + goto e_unpin; + + size -= len; + next_vaddr = vaddr + len; + } + +e_unpin: + /* content of memory is updated, mark pages dirty */ + for (i = 0; i < npages; i++) { + set_page_dirty_lock(inpages[i]); + mark_page_accessed(inpages[i]); + } + /* unlock the user pages */ + sev_unpin_memory(kvm, inpages, npages); +e_free: + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -5878,6 +6062,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_START: r = sev_launch_start(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_UPDATE_DATA: + r = sev_launch_update_data(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 0d0736f76347f7e14a3e5bb7a202a75a53ce1e3b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:36 -0600 Subject: KVM: SVM: Add support for KVM_SEV_LAUNCH_MEASURE command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used to retrieve the measurement of contents encrypted through the KVM_SEV_LAUNCH_UPDATE_DATA command. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 88951cbef3ec..74e010e6b5b9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6042,6 +6042,77 @@ e_free: return ret; } +static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_measure *data; + struct kvm_sev_launch_measure params; + void *blob = NULL; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* User wants to query the blob length */ + if (!params.len) + goto cmd; + + if (params.uaddr) { + if (params.len > SEV_FW_BLOB_MAX_SIZE) { + ret = -EINVAL; + goto e_free; + } + + if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) { + ret = -EFAULT; + goto e_free; + } + + ret = -ENOMEM; + blob = kmalloc(params.len, GFP_KERNEL); + if (!blob) + goto e_free; + + data->address = __psp_pa(blob); + data->len = params.len; + } + +cmd: + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_MEASURE, data, &argp->error); + + /* + * If we query the session length, FW responded with expected data. 
+ */ + if (!params.len) + goto done; + + if (ret) + goto e_free_blob; + + if (blob) { + if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len)) + ret = -EFAULT; + } + +done: + params.len = data->len; + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free_blob: + kfree(blob); +e_free: + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6065,6 +6136,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_UPDATE_DATA: r = sev_launch_update_data(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_MEASURE: + r = sev_launch_measure(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 5bdb0e2fa45eb007a1cc630c6b520a638e60c1f6 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:36 -0600 Subject: KVM: SVM: Add support for SEV LAUNCH_FINISH command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for finializing the SEV guest launch process. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 74e010e6b5b9..6a8d81311e9d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6113,6 +6113,26 @@ e_free: return ret; } +static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_finish *data; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_FINISH, data, &argp->error); + + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6139,6 +6159,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_MEASURE: r = sev_launch_measure(kvm, &sev_cmd); break; + case KVM_SEV_LAUNCH_FINISH: + r = sev_launch_finish(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 255d9e75e2549a5bc85c3606f5a80e8a0ef4d170 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:37 -0600 Subject: KVM: SVM: Add support for SEV GUEST_STATUS command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for querying the SEV guest information. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6a8d81311e9d..f5da2753790d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6133,6 +6133,36 @@ static int sev_launch_finish(struct kvm *kvm, struct kvm_sev_cmd *argp) return ret; } +static int sev_guest_status(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct kvm_sev_guest_status params; + struct sev_data_guest_status *data; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_GUEST_STATUS, data, &argp->error); + if (ret) + goto e_free; + + params.policy = data->policy; + params.state = data->state; + params.handle = data->handle; + + if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + ret = -EFAULT; +e_free: + kfree(data); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6162,6 +6192,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_LAUNCH_FINISH: r = sev_launch_finish(kvm, &sev_cmd); break; + case KVM_SEV_GUEST_STATUS: + r = sev_guest_status(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 24f41fb23a39bc2b6f190dcef35a5813a4bf183a Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:37 -0600 Subject: KVM: SVM: Add support for SEV DEBUG_DECRYPT command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for decrypting a guest memory region for debug purposes. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 152 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f5da2753790d..ec2b864faec2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6163,6 +6163,155 @@ e_free: return ret; } +static int __sev_issue_dbg_cmd(struct kvm *kvm, unsigned long src, + unsigned long dst, int size, + int *error, bool enc) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_dbg *data; + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->handle = sev->handle; + data->dst_addr = dst; + data->src_addr = src; + data->len = size; + + ret = sev_issue_cmd(kvm, + enc ? SEV_CMD_DBG_ENCRYPT : SEV_CMD_DBG_DECRYPT, + data, error); + kfree(data); + return ret; +} + +static int __sev_dbg_decrypt(struct kvm *kvm, unsigned long src_paddr, + unsigned long dst_paddr, int sz, int *err) +{ + int offset; + + /* + * Its safe to read more than we are asked, caller should ensure that + * destination has enough space. 
+ */ + src_paddr = round_down(src_paddr, 16); + offset = src_paddr & 15; + sz = round_up(sz + offset, 16); + + return __sev_issue_dbg_cmd(kvm, src_paddr, dst_paddr, sz, err, false); +} + +static int __sev_dbg_decrypt_user(struct kvm *kvm, unsigned long paddr, + unsigned long __user dst_uaddr, + unsigned long dst_paddr, + int size, int *err) +{ + struct page *tpage = NULL; + int ret, offset; + + /* if inputs are not 16-byte then use intermediate buffer */ + if (!IS_ALIGNED(dst_paddr, 16) || + !IS_ALIGNED(paddr, 16) || + !IS_ALIGNED(size, 16)) { + tpage = (void *)alloc_page(GFP_KERNEL); + if (!tpage) + return -ENOMEM; + + dst_paddr = __sme_page_pa(tpage); + } + + ret = __sev_dbg_decrypt(kvm, paddr, dst_paddr, size, err); + if (ret) + goto e_free; + + if (tpage) { + offset = paddr & 15; + if (copy_to_user((void __user *)(uintptr_t)dst_uaddr, + page_address(tpage) + offset, size)) + ret = -EFAULT; + } + +e_free: + if (tpage) + __free_page(tpage); + + return ret; +} + +static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) +{ + unsigned long vaddr, vaddr_end, next_vaddr; + unsigned long dst_vaddr, dst_vaddr_end; + struct page **src_p, **dst_p; + struct kvm_sev_dbg debug; + unsigned long n; + int ret, size; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug))) + return -EFAULT; + + vaddr = debug.src_uaddr; + size = debug.len; + vaddr_end = vaddr + size; + dst_vaddr = debug.dst_uaddr; + dst_vaddr_end = dst_vaddr + size; + + for (; vaddr < vaddr_end; vaddr = next_vaddr) { + int len, s_off, d_off; + + /* lock userspace source and destination page */ + src_p = sev_pin_memory(kvm, vaddr & PAGE_MASK, PAGE_SIZE, &n, 0); + if (!src_p) + return -EFAULT; + + dst_p = sev_pin_memory(kvm, dst_vaddr & PAGE_MASK, PAGE_SIZE, &n, 1); + if (!dst_p) { + sev_unpin_memory(kvm, src_p, n); + return -EFAULT; + } + + /* + * The DBG_{DE,EN}CRYPT commands will perform {dec,en}cryption of the + * memory content (i.e it will write the same memory region with C=1). + * It's possible that the cache may contain the data with C=0, i.e., + * unencrypted so invalidate it first. + */ + sev_clflush_pages(src_p, 1); + sev_clflush_pages(dst_p, 1); + + /* + * Since user buffer may not be page aligned, calculate the + * offset within the page. + */ + s_off = vaddr & ~PAGE_MASK; + d_off = dst_vaddr & ~PAGE_MASK; + len = min_t(size_t, (PAGE_SIZE - s_off), size); + + ret = __sev_dbg_decrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + dst_vaddr, + __sme_page_pa(dst_p[0]) + d_off, + len, &argp->error); + + sev_unpin_memory(kvm, src_p, 1); + sev_unpin_memory(kvm, dst_p, 1); + + if (ret) + goto err; + + next_vaddr = vaddr + len; + dst_vaddr = dst_vaddr + len; + size -= len; + } +err: + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6195,6 +6344,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_GUEST_STATUS: r = sev_guest_status(kvm, &sev_cmd); break; + case KVM_SEV_DBG_DECRYPT: + r = sev_dbg_crypt(kvm, &sev_cmd, true); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 7d1594f5d94bb9fa60dacca2390a6c58398f005a Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:37 -0600 Subject: KVM: SVM: Add support for SEV DEBUG_ENCRYPT command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command copies a plaintext into guest memory and encrypts it using the VM encryption key. 
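Together with the DBG_DECRYPT handler above, this gives a VMM-side debugger a way to read and patch encrypted guest memory. A minimal userspace sketch of issuing either command follows; the helper name and the vm_fd/sev_fd variables are assumptions made for illustration, while the ioctl, command ids and structures are the SEV UAPI introduced earlier in this series.

  /*
   * Hedged sketch: drive KVM_SEV_DBG_DECRYPT / KVM_SEV_DBG_ENCRYPT from
   * userspace. src/dst are addresses in the VMM's address space; for a
   * decrypt, src points into the mapped (encrypted) guest RAM and dst at
   * a plaintext buffer, and the other way around for an encrypt.
   */
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  static int sev_dbg_crypt_user(int vm_fd, int sev_fd, unsigned int cmd_id,
                                unsigned long src_uaddr,
                                unsigned long dst_uaddr,
                                unsigned int len)
  {
          struct kvm_sev_dbg dbg = {
                  .src_uaddr = src_uaddr,
                  .dst_uaddr = dst_uaddr,
                  .len       = len,
          };
          struct kvm_sev_cmd cmd = {
                  .id     = cmd_id,  /* KVM_SEV_DBG_DECRYPT or KVM_SEV_DBG_ENCRYPT */
                  .data   = (unsigned long)&dbg,
                  .sev_fd = sev_fd,
          };

          if (ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd)) {
                  fprintf(stderr, "SEV debug op %u failed, fw_error=%u\n",
                          cmd_id, cmd.error);
                  return -1;
          }
          return 0;
  }

Note that, as the handlers above show, the caller's buffers need not be 16-byte aligned; the kernel bounces unaligned requests through an intermediate page.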
The command will be used for debug purposes (e.g setting breakpoints through gdbserver) Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh --- arch/x86/kvm/svm.c | 98 +++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 93 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ec2b864faec2..11d4860997d9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6240,6 +6240,83 @@ e_free: return ret; } +static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, + unsigned long __user vaddr, + unsigned long dst_paddr, + unsigned long __user dst_vaddr, + int size, int *error) +{ + struct page *src_tpage = NULL; + struct page *dst_tpage = NULL; + int ret, len = size; + + /* If source buffer is not aligned then use an intermediate buffer */ + if (!IS_ALIGNED(vaddr, 16)) { + src_tpage = alloc_page(GFP_KERNEL); + if (!src_tpage) + return -ENOMEM; + + if (copy_from_user(page_address(src_tpage), + (void __user *)(uintptr_t)vaddr, size)) { + __free_page(src_tpage); + return -EFAULT; + } + + paddr = __sme_page_pa(src_tpage); + } + + /* + * If destination buffer or length is not aligned then do read-modify-write: + * - decrypt destination in an intermediate buffer + * - copy the source buffer in an intermediate buffer + * - use the intermediate buffer as source buffer + */ + if (!IS_ALIGNED(dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { + int dst_offset; + + dst_tpage = alloc_page(GFP_KERNEL); + if (!dst_tpage) { + ret = -ENOMEM; + goto e_free; + } + + ret = __sev_dbg_decrypt(kvm, dst_paddr, + __sme_page_pa(dst_tpage), size, error); + if (ret) + goto e_free; + + /* + * If source is kernel buffer then use memcpy() otherwise + * copy_from_user(). 
+ */ + dst_offset = dst_paddr & 15; + + if (src_tpage) + memcpy(page_address(dst_tpage) + dst_offset, + page_address(src_tpage), size); + else { + if (copy_from_user(page_address(dst_tpage) + dst_offset, + (void __user *)(uintptr_t)vaddr, size)) { + ret = -EFAULT; + goto e_free; + } + } + + paddr = __sme_page_pa(dst_tpage); + dst_paddr = round_down(dst_paddr, 16); + len = round_up(size, 16); + } + + ret = __sev_issue_dbg_cmd(kvm, paddr, dst_paddr, len, error, true); + +e_free: + if (src_tpage) + __free_page(src_tpage); + if (dst_tpage) + __free_page(dst_tpage); + return ret; +} + static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) { unsigned long vaddr, vaddr_end, next_vaddr; @@ -6292,11 +6369,19 @@ static int sev_dbg_crypt(struct kvm *kvm, struct kvm_sev_cmd *argp, bool dec) d_off = dst_vaddr & ~PAGE_MASK; len = min_t(size_t, (PAGE_SIZE - s_off), size); - ret = __sev_dbg_decrypt_user(kvm, - __sme_page_pa(src_p[0]) + s_off, - dst_vaddr, - __sme_page_pa(dst_p[0]) + d_off, - len, &argp->error); + if (dec) + ret = __sev_dbg_decrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + dst_vaddr, + __sme_page_pa(dst_p[0]) + d_off, + len, &argp->error); + else + ret = __sev_dbg_encrypt_user(kvm, + __sme_page_pa(src_p[0]) + s_off, + vaddr, + __sme_page_pa(dst_p[0]) + d_off, + dst_vaddr, + len, &argp->error); sev_unpin_memory(kvm, src_p, 1); sev_unpin_memory(kvm, dst_p, 1); @@ -6347,6 +6432,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_DBG_DECRYPT: r = sev_dbg_crypt(kvm, &sev_cmd, true); break; + case KVM_SEV_DBG_ENCRYPT: + r = sev_dbg_crypt(kvm, &sev_cmd, false); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 9f5b5b950aa96b3d303d132e069f93c8bc4c9b58 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:38 -0600 Subject: KVM: SVM: Add support for SEV LAUNCH_SECRET command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The command is used for injecting a secret into the guest memory region. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Reviewed-by: Borislav Petkov --- arch/x86/kvm/svm.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 11d4860997d9..8a499425bf7e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6397,6 +6397,71 @@ err: return ret; } +static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct sev_data_launch_secret *data; + struct kvm_sev_launch_secret params; + struct page **pages; + void *blob, *hdr; + unsigned long n; + int ret; + + if (!sev_guest(kvm)) + return -ENOTTY; + + if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + return -EFAULT; + + pages = sev_pin_memory(kvm, params.guest_uaddr, params.guest_len, &n, 1); + if (!pages) + return -ENOMEM; + + /* + * The secret must be copied into contiguous memory region, lets verify + * that userspace memory pages are contiguous before we issue command. 
+ */ + if (get_num_contig_pages(0, pages, n) != n) { + ret = -EINVAL; + goto e_unpin_memory; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto e_unpin_memory; + + blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); + if (IS_ERR(blob)) { + ret = PTR_ERR(blob); + goto e_free; + } + + data->trans_address = __psp_pa(blob); + data->trans_len = params.trans_len; + + hdr = psp_copy_user_blob(params.hdr_uaddr, params.hdr_len); + if (IS_ERR(hdr)) { + ret = PTR_ERR(hdr); + goto e_free_blob; + } + data->trans_address = __psp_pa(blob); + data->trans_len = params.trans_len; + + data->handle = sev->handle; + ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error); + + kfree(hdr); + +e_free_blob: + kfree(blob); +e_free: + kfree(data); +e_unpin_memory: + sev_unpin_memory(kvm, pages, n); + return ret; +} + static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -6435,6 +6500,9 @@ static int svm_mem_enc_op(struct kvm *kvm, void __user *argp) case KVM_SEV_DBG_ENCRYPT: r = sev_dbg_crypt(kvm, &sev_cmd, false); break; + case KVM_SEV_LAUNCH_SECRET: + r = sev_launch_secret(kvm, &sev_cmd); + break; default: r = -EINVAL; goto out; -- cgit v1.2.3 From 1e80fdc09d121d8327cdf62eefbb5abadddca792 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:38 -0600 Subject: KVM: SVM: Pin guest memory when SEV is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV memory encryption engine uses a tweak such that two identical plaintext pages at different location will have different ciphertext. So swapping or moving ciphertext of two pages will not result in plaintext being swapped. Relocating (or migrating) physical backing pages for a SEV guest will require some additional steps. The current SEV key management spec does not provide commands to swap or migrate (move) ciphertext pages. For now, we pin the guest memory registered through KVM_MEMORY_ENCRYPT_REG_REGION ioctl. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm.c | 132 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index a0b021f1fd05..262950f9f2d9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -753,6 +753,7 @@ struct kvm_sev_info { unsigned int handle; /* SEV firmware handle */ int fd; /* SEV device fd */ unsigned long pages_locked; /* Number of pages locked */ + struct list_head regions_list; /* List of registered regions */ }; struct kvm_arch { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a499425bf7e..bdce5b4e71cc 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -335,6 +335,14 @@ static unsigned int min_sev_asid; static unsigned long *sev_asid_bitmap; #define __sme_page_pa(x) __sme_set(page_to_pfn(x) << PAGE_SHIFT) +struct enc_region { + struct list_head list; + unsigned long npages; + struct page **pages; + unsigned long uaddr; + unsigned long size; +}; + static inline bool svm_sev_enabled(void) { return max_sev_asid; @@ -1649,13 +1657,46 @@ static void sev_clflush_pages(struct page *pages[], unsigned long npages) } } +static void __unregister_enc_region_locked(struct kvm *kvm, + struct enc_region *region) +{ + /* + * The guest may change the memory encryption attribute from C=0 -> C=1 + * or vice versa for this memory range. Lets make sure caches are + * flushed to ensure that guest data gets written into memory with + * correct C-bit. + */ + sev_clflush_pages(region->pages, region->npages); + + sev_unpin_memory(kvm, region->pages, region->npages); + list_del(®ion->list); + kfree(region); +} + static void sev_vm_destroy(struct kvm *kvm) { struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct list_head *head = &sev->regions_list; + struct list_head *pos, *q; if (!sev_guest(kvm)) return; + mutex_lock(&kvm->lock); + + /* + * if userspace was terminated before unregistering the memory regions + * then lets unpin all the registered memory. + */ + if (!list_empty(head)) { + list_for_each_safe(pos, q, head) { + __unregister_enc_region_locked(kvm, + list_entry(pos, struct enc_region, list)); + } + } + + mutex_unlock(&kvm->lock); + sev_unbind_asid(kvm, sev->handle); sev_asid_free(kvm); } @@ -5814,6 +5855,7 @@ static int sev_guest_init(struct kvm *kvm, struct kvm_sev_cmd *argp) sev->active = true; sev->asid = asid; + INIT_LIST_HEAD(&sev->regions_list); return 0; @@ -6516,6 +6558,94 @@ out: return r; } +static int svm_register_enc_region(struct kvm *kvm, + struct kvm_enc_region *range) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct enc_region *region; + int ret = 0; + + if (!sev_guest(kvm)) + return -ENOTTY; + + region = kzalloc(sizeof(*region), GFP_KERNEL); + if (!region) + return -ENOMEM; + + region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1); + if (!region->pages) { + ret = -ENOMEM; + goto e_free; + } + + /* + * The guest may change the memory encryption attribute from C=0 -> C=1 + * or vice versa for this memory range. Lets make sure caches are + * flushed to ensure that guest data gets written into memory with + * correct C-bit. 
+ */ + sev_clflush_pages(region->pages, region->npages); + + region->uaddr = range->addr; + region->size = range->size; + + mutex_lock(&kvm->lock); + list_add_tail(®ion->list, &sev->regions_list); + mutex_unlock(&kvm->lock); + + return ret; + +e_free: + kfree(region); + return ret; +} + +static struct enc_region * +find_enc_region(struct kvm *kvm, struct kvm_enc_region *range) +{ + struct kvm_sev_info *sev = &kvm->arch.sev_info; + struct list_head *head = &sev->regions_list; + struct enc_region *i; + + list_for_each_entry(i, head, list) { + if (i->uaddr == range->addr && + i->size == range->size) + return i; + } + + return NULL; +} + + +static int svm_unregister_enc_region(struct kvm *kvm, + struct kvm_enc_region *range) +{ + struct enc_region *region; + int ret; + + mutex_lock(&kvm->lock); + + if (!sev_guest(kvm)) { + ret = -ENOTTY; + goto failed; + } + + region = find_enc_region(kvm, range); + if (!region) { + ret = -EINVAL; + goto failed; + } + + __unregister_enc_region_locked(kvm, region); + + mutex_unlock(&kvm->lock); + return 0; + +failed: + mutex_unlock(&kvm->lock); + return ret; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -6633,6 +6763,8 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .enable_smi_window = enable_smi_window, .mem_enc_op = svm_mem_enc_op, + .mem_enc_reg_region = svm_register_enc_region, + .mem_enc_unreg_region = svm_unregister_enc_region, }; static int __init svm_init(void) -- cgit v1.2.3 From 0ede79e1322479221fb0d0a7d2828d8ed033f060 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:39 -0600 Subject: KVM: SVM: Clear C-bit from the page fault address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When SEV is active, on #VMEXIT the page fault address will contain the C-bit. We must clear the C-bit before handling the fault. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index bdce5b4e71cc..f350e9bac9eb 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2430,7 +2430,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { - u64 fault_address = svm->vmcb->control.exit_info_2; + u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); u64 error_code = svm->vmcb->control.exit_info_1; return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, @@ -2440,7 +2440,7 @@ static int pf_interception(struct vcpu_svm *svm) static int npf_interception(struct vcpu_svm *svm) { - u64 fault_address = svm->vmcb->control.exit_info_2; + u64 fault_address = __sme_clr(svm->vmcb->control.exit_info_2); u64 error_code = svm->vmcb->control.exit_info_1; trace_kvm_page_fault(fault_address, error_code); -- cgit v1.2.3 From 35c6f649bbb2e3f367116307273f998f0bf3e08e Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:39 -0600 Subject: KVM: SVM: Do not install #UD intercept when SEV is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On #UD, x86_emulate_instruction() fetches the data from guest memory and decodes the instruction bytes to assist further. When SEV is enabled, the instruction bytes will be encrypted using the guest-specific key and the hypervisor will no longer able to fetch the instruction bytes to assist UD handling. By not installing intercept we let the guest receive and handle #UD. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Borislav Petkov Signed-off-by: Brijesh Singh --- arch/x86/kvm/svm.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f350e9bac9eb..3e848f952b4f 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1443,8 +1443,10 @@ static void init_vmcb(struct vcpu_svm *svm) svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK; } - if (sev_guest(svm->vcpu.kvm)) + if (sev_guest(svm->vcpu.kvm)) { svm->vmcb->control.nested_ctl |= SVM_NESTED_CTL_SEV_ENABLE; + clr_exception_intercept(svm, UD_VECTOR); + } mark_all_dirty(svm->vmcb); -- cgit v1.2.3 From 00b10fe1046c4b2232097a7ffaa9238c7e479388 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 4 Dec 2017 10:57:40 -0600 Subject: KVM: X86: Restart the guest when insn_len is zero and SEV is enabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On AMD platforms, under certain conditions insn_len may be zero on #NPF. This can happen if a guest gets a page-fault on data access but the HW table walker is not able to read the instruction page (e.g instruction page is not present in memory). Typically, when insn_len is zero, x86_emulate_instruction() walks the guest page table and fetches the instruction bytes from guest memory. 
When SEV is enabled, the guest memory is encrypted with guest-specific key hence hypervisor will not able to fetch the instruction bytes. In those cases we simply restart the guest. I have encountered this issue when running kernbench inside the guest. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Paolo Bonzini Cc: "Radim Krčmář" Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Cc: x86@kernel.org Cc: kvm@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh --- arch/x86/kvm/mmu.c | 10 ++++++++++ arch/x86/kvm/svm.c | 6 ++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5e66e5c6640..d5e5dbd0e5ad 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4950,6 +4950,16 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; emulate: + /* + * On AMD platforms, under certain conditions insn_len may be zero on #NPF. + * This can happen if a guest gets a page-fault on data access but the HW + * table walker is not able to read the instruction page (e.g instruction + * page is not present in memory). In those cases we simply restart the + * guest. + */ + if (unlikely(insn && !insn_len)) + return 1; + er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3e848f952b4f..ec5df5752995 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2436,7 +2436,8 @@ static int pf_interception(struct vcpu_svm *svm) u64 error_code = svm->vmcb->control.exit_info_1; return kvm_handle_page_fault(&svm->vcpu, error_code, fault_address, - svm->vmcb->control.insn_bytes, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } @@ -2447,7 +2448,8 @@ static int npf_interception(struct vcpu_svm *svm) trace_kvm_page_fault(fault_address, error_code); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code, - svm->vmcb->control.insn_bytes, + static_cpu_has(X86_FEATURE_DECODEASSISTS) ? + svm->vmcb->control.insn_bytes : NULL, svm->vmcb->control.insn_len); } -- cgit v1.2.3 From ae3e61e1c28338d077b704505570fa181df1e41f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:36:41 +0200 Subject: KVM: x86: add support for UMIP Add the CPUID bits, make the CR4.UMIP bit not reserved anymore, and add UMIP support for instructions that are already emulated by KVM. 
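To make the guest-visible effect concrete, here is a hedged ring-3 test program (an illustration, not part of the patch): on a guest that sets CR4.UMIP, the two stores below are expected to raise #GP, which userspace sees as SIGSEGV, unless the guest kernel chooses to emulate them and hand back dummy values.

  /* Hedged illustration of what UMIP blocks at CPL > 0. */
  #include <stdio.h>
  #include <stdint.h>

  struct __attribute__((packed)) table_ptr {
          uint16_t limit;
          uint64_t base;
  };

  int main(void)
  {
          struct table_ptr gdt = { 0 }, idt = { 0 };

          /* Without UMIP these leak kernel descriptor-table addresses. */
          asm volatile("sgdt %0" : "=m"(gdt));
          asm volatile("sidt %0" : "=m"(idt));

          printf("GDT %#llx/%u, IDT %#llx/%u\n",
                 (unsigned long long)gdt.base, gdt.limit,
                 (unsigned long long)idt.base, idt.limit);
          return 0;
  }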
Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/cpuid.c | 4 ++-- arch/x86/kvm/emulate.c | 8 ++++++++ arch/x86/kvm/x86.c | 3 +++ 4 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 516798431328..ff79134d1d71 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -86,7 +86,7 @@ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ - | X86_CR4_SMAP | X86_CR4_PKE)) + | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 0099e10eb045..77fb8732b47b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -387,8 +387,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = - F(AVX512VBMI) | F(LA57) | F(PKU) | - 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ); + F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | + F(AVX512_VPOPCNTDQ) | F(UMIP); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index abe74f779f9d..5edb25267628 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3720,6 +3720,10 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt, { struct desc_ptr desc_ptr; + if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && + ctxt->ops->cpl(ctxt) > 0) + return emulate_gp(ctxt, 0); + if (ctxt->mode == X86EMUL_MODE_PROT64) ctxt->op_bytes = 8; get(ctxt, &desc_ptr); @@ -3779,6 +3783,10 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) static int em_smsw(struct x86_emulate_ctxt *ctxt) { + if ((ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && + ctxt->ops->cpl(ctxt) > 0) + return emulate_gp(ctxt, 0); + if (ctxt->dst.type == OP_MEM) ctxt->dst.bytes = 2; ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 56d036b9ad75..dd2e80ac49ff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -794,6 +794,9 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57)) return 1; + if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP)) + return 1; + if (is_long_mode(vcpu)) { if (!(cr4 & X86_CR4_PAE)) return 1; -- cgit v1.2.3 From dd307d017b445a3af4379c7ff548cb3da5ecde31 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:35:51 +0200 Subject: KVM: x86: emulate sldt and str These are needed to handle the descriptor table vmexits when emulating UMIP. 
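The same applies to the two stores being wired up here; a hedged guest-side snippet (illustrative only) that reaches them from ring 3 is below. With UMIP emulation active, sldt and str now take the descriptor-table vmexit path, and the emulator either completes the store at CPL 0 or injects #GP at CPL > 0.

  /* Hedged illustration: ring-3 sldt/str, the stores this patch emulates. */
  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint16_t ldtr = 0, tr = 0;

          asm volatile("sldt %0" : "=r"(ldtr));   /* LDT segment selector */
          asm volatile("str  %0" : "=r"(tr));     /* task register selector */

          printf("LDTR=%#x TR=%#x\n", ldtr, tr);
          return 0;
  }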
Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5edb25267628..fa703e3baa4b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3633,17 +3633,27 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } -static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) +static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment) { - if (ctxt->modrm_reg > VCPU_SREG_GS) - return emulate_ud(ctxt); + if (segment > VCPU_SREG_GS && + (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_UMIP) && + ctxt->ops->cpl(ctxt) > 0) + return emulate_gp(ctxt, 0); - ctxt->dst.val = get_segment_selector(ctxt, ctxt->modrm_reg); + ctxt->dst.val = get_segment_selector(ctxt, segment); if (ctxt->dst.bytes == 4 && ctxt->dst.type == OP_MEM) ctxt->dst.bytes = 2; return X86EMUL_CONTINUE; } +static int em_mov_rm_sreg(struct x86_emulate_ctxt *ctxt) +{ + if (ctxt->modrm_reg > VCPU_SREG_GS) + return emulate_ud(ctxt); + + return em_store_sreg(ctxt, ctxt->modrm_reg); +} + static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; @@ -3659,6 +3669,11 @@ static int em_mov_sreg_rm(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, ctxt->modrm_reg); } +static int em_sldt(struct x86_emulate_ctxt *ctxt) +{ + return em_store_sreg(ctxt, VCPU_SREG_LDTR); +} + static int em_lldt(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; @@ -3668,6 +3683,11 @@ static int em_lldt(struct x86_emulate_ctxt *ctxt) return load_segment_descriptor(ctxt, sel, VCPU_SREG_LDTR); } +static int em_str(struct x86_emulate_ctxt *ctxt) +{ + return em_store_sreg(ctxt, VCPU_SREG_TR); +} + static int em_ltr(struct x86_emulate_ctxt *ctxt) { u16 sel = ctxt->src.val; @@ -4372,8 +4392,8 @@ static const struct opcode group5[] = { }; static const struct opcode group6[] = { - DI(Prot | DstMem, sldt), - DI(Prot | DstMem, str), + II(Prot | DstMem, em_sldt, sldt), + II(Prot | DstMem, em_str, str), II(Prot | Priv | SrcMem16, em_lldt, lldt), II(Prot | Priv | SrcMem16, em_ltr, ltr), N, N, N, N, -- cgit v1.2.3 From 66336cab3531d3325ebde36a04725dddd0c42cb5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:36:41 +0200 Subject: KVM: x86: add support for emulating UMIP The User-Mode Instruction Prevention feature present in recent Intel processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str) from being executed with CPL > 0. Otherwise, a general protection fault is issued. UMIP instructions in general are also able to trigger vmexits, so we can actually emulate UMIP on older processors. This commit sets up the infrastructure so that kvm-intel.ko and kvm-amd.ko can set the UMIP feature bit for CPUID even if the feature is not actually available in hardware. 
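From the VMM's point of view an emulated UMIP bit looks just like a native one. The hedged sketch below (the program itself is an assumption; the ioctl and structures are standard KVM UAPI) probes KVM_GET_SUPPORTED_CPUID for CPUID.(EAX=7,ECX=0):ECX bit 2.

  /* Hedged sketch: does this host's KVM report UMIP (native or emulated)?
   * The fixed bound of 128 entries is an assumption; real code should
   * retry with a larger buffer on E2BIG.
   */
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  int main(void)
  {
          struct kvm_cpuid2 *cpuid;
          unsigned int i;
          int kvm;

          kvm = open("/dev/kvm", O_RDWR);
          cpuid = calloc(1, sizeof(*cpuid) + 128 * sizeof(struct kvm_cpuid_entry2));
          if (kvm < 0 || !cpuid)
                  return 1;
          cpuid->nent = 128;

          if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, cpuid) < 0) {
                  perror("KVM_GET_SUPPORTED_CPUID");
                  return 1;
          }

          for (i = 0; i < cpuid->nent; i++) {
                  struct kvm_cpuid_entry2 *e = &cpuid->entries[i];

                  if (e->function == 7 && e->index == 0)
                          printf("UMIP %ssupported\n",
                                 (e->ecx & (1u << 2)) ? "" : "not ");
          }
          return 0;
  }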
Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/cpuid.c | 2 ++ arch/x86/kvm/svm.c | 6 ++++++ arch/x86/kvm/vmx.c | 6 ++++++ 4 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ff79134d1d71..515db75081d1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1017,6 +1017,7 @@ struct kvm_x86_ops { void (*handle_external_intr)(struct kvm_vcpu *vcpu); bool (*mpx_supported)(void); bool (*xsaves_supported)(void); + bool (*umip_emulated)(void); int (*check_nested_events)(struct kvm_vcpu *vcpu, bool external_intr); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 77fb8732b47b..2b3b06458f6f 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -327,6 +327,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, unsigned f_invpcid = kvm_x86_ops->invpcid_supported() ? F(INVPCID) : 0; unsigned f_mpx = kvm_mpx_supported() ? F(MPX) : 0; unsigned f_xsaves = kvm_x86_ops->xsaves_supported() ? F(XSAVES) : 0; + unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0; /* cpuid 1.edx */ const u32 kvm_cpuid_1_edx_x86_features = @@ -473,6 +474,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ebx |= F(TSC_ADJUST); entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; cpuid_mask(&entry->ecx, CPUID_7_ECX); + entry->ecx |= f_umip; /* PKU is not yet implemented for shadow paging. */ if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) entry->ecx &= ~F(PKU); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index eb714f1cdf7e..1f3e7f210374 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -5204,6 +5204,11 @@ static bool svm_xsaves_supported(void) return false; } +static bool svm_umip_emulated(void) +{ + return false; +} + static bool svm_has_wbinvd_exit(void) { return true; @@ -5597,6 +5602,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .invpcid_supported = svm_invpcid_supported, .mpx_supported = svm_mpx_supported, .xsaves_supported = svm_xsaves_supported, + .umip_emulated = svm_umip_emulated, .set_supported_cpuid = svm_set_supported_cpuid, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8eba631c4dbd..b989cfe412b1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9155,6 +9155,11 @@ static bool vmx_xsaves_supported(void) SECONDARY_EXEC_XSAVES; } +static bool vmx_umip_emulated(void) +{ + return false; +} + static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { u32 exit_intr_info; @@ -12170,6 +12175,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .handle_external_intr = vmx_handle_external_intr, .mpx_supported = vmx_mpx_supported, .xsaves_supported = vmx_xsaves_supported, + .umip_emulated = vmx_umip_emulated, .check_nested_events = vmx_check_nested_events, -- cgit v1.2.3 From 0367f205a3b7c0efe774634eef1f4697c79a4132 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 10:44:55 +0200 Subject: KVM: vmx: add support for emulating UMIP UMIP can be emulated almost perfectly on Intel processor by enabling descriptor-table exits. SMSW does not cause a vmexit and hence it cannot be changed into a #GP fault, but all in all it's the most "innocuous" of the unprivileged instructions that UMIP blocks. 
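For context, the traditional unprivileged use of SMSW is just reading the low 16 bits of CR0, as in this hedged illustration (not part of the patch):

  /* Hedged illustration: SMSW reads the machine status word (CR0[15:0]).
   * On VMX this cannot be turned into a vmexit, so emulated UMIP lets it
   * through; on hardware UMIP it faults at CPL > 0.
   */
  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
          uint16_t msw;

          asm volatile("smsw %0" : "=r"(msw));
          printf("MSW=%#06x (PE=%u)\n", msw, msw & 1);
          return 0;
  }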
In fact, Linux is _also_ emulating SMSW instructions on behalf of the program that executes them, because some 16-bit programs expect to use SMSW to detect vm86 mode, so this is an even smaller issue. Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b989cfe412b1..ebd50de981ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3662,6 +3662,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_ENABLE_EPT | SECONDARY_EXEC_UNRESTRICTED_GUEST | SECONDARY_EXEC_PAUSE_LOOP_EXITING | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -4369,6 +4370,14 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) (to_vmx(vcpu)->rmode.vm86_active ? KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); + if ((cr4 & X86_CR4_UMIP) && !boot_cpu_has(X86_FEATURE_UMIP)) { + vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + hw_cr4 &= ~X86_CR4_UMIP; + } else + vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, + SECONDARY_EXEC_DESC); + if (cr4 & X86_CR4_VMXE) { /* * To use VMXON (and later other VMX instructions), a guest @@ -5308,6 +5317,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) struct kvm_vcpu *vcpu = &vmx->vcpu; u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl; + if (!cpu_need_virtualize_apic_accesses(vcpu)) exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; if (vmx->vpid == 0) @@ -5326,6 +5336,11 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + + /* SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP, + * in vmx_set_cr4. */ + exec_control &= ~SECONDARY_EXEC_DESC; + /* SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD (handle_vmptrld). 
We can NOT enable shadow_vmcs here because we don't have yet @@ -6101,6 +6116,12 @@ static int handle_set_cr4(struct kvm_vcpu *vcpu, unsigned long val) return kvm_set_cr4(vcpu, val); } +static int handle_desc(struct kvm_vcpu *vcpu) +{ + WARN_ON(!(vcpu->arch.cr4 & X86_CR4_UMIP)); + return emulate_instruction(vcpu, 0) == EMULATE_DONE; +} + static int handle_cr(struct kvm_vcpu *vcpu) { unsigned long exit_qualification, val; @@ -8193,6 +8214,8 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = { [EXIT_REASON_XSETBV] = handle_xsetbv, [EXIT_REASON_TASK_SWITCH] = handle_task_switch, [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, + [EXIT_REASON_GDTR_IDTR] = handle_desc, + [EXIT_REASON_LDTR_TR] = handle_desc, [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause, @@ -9157,7 +9180,8 @@ static bool vmx_xsaves_supported(void) static bool vmx_umip_emulated(void) { - return false; + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_DESC; } static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) @@ -9755,7 +9779,8 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) u32 mask = SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | - SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_DESC; u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); -- cgit v1.2.3 From fb6d4d340e0532032c808a9933eaaa7b8de435ab Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 12 Jul 2016 11:04:26 +0200 Subject: KVM: x86: emulate RDPID This is encoded as F3 0F C7 /7 with a register argument. The register argument is the second array in the group9 GroupDual, while F3 is the fourth element of a Prefix. Reviewed-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 7 ++++++- arch/x86/kvm/emulate.c | 22 +++++++++++++++++++++- arch/x86/kvm/vmx.c | 15 +++++++++++++++ 3 files changed, 42 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2b3b06458f6f..b94371146533 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -293,13 +293,18 @@ static int __do_cpuid_ent_emulated(struct kvm_cpuid_entry2 *entry, { switch (func) { case 0: - entry->eax = 1; /* only one leaf currently */ + entry->eax = 7; ++*nent; break; case 1: entry->ecx = F(MOVBE); ++*nent; break; + case 7: + entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; + if (index == 0) + entry->ecx = F(RDPID); + ++*nent; default: break; } diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index fa703e3baa4b..cb929d0bb1bd 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3514,6 +3514,16 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_rdpid(struct x86_emulate_ctxt *ctxt) +{ + u64 tsc_aux = 0; + + if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux)) + return emulate_gp(ctxt, 0); + ctxt->dst.val = tsc_aux; + return X86EMUL_CONTINUE; +} + static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { u64 tsc = 0; @@ -4424,10 +4434,20 @@ static const struct opcode group8[] = { F(DstMem | SrcImmByte | Lock | PageTable, em_btc), }; +/* + * The "memory" destination is actually always a register, since we come + * from the register case of group9. 
+ */ +static const struct gprefix pfx_0f_c7_7 = { + N, N, N, II(DstMem | ModRM | Op3264 | EmulateOnUD, em_rdpid, rdtscp), +}; + + static const struct group_dual group9 = { { N, I(DstMem64 | Lock | PageTable, em_cmpxchg8b), N, N, N, N, N, N, }, { - N, N, N, N, N, N, N, N, + N, N, N, N, N, N, N, + GP(0, &pfx_0f_c7_7), } }; static const struct opcode group11[] = { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ebd50de981ee..8dafc5d7d4d6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11687,6 +11687,21 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage) { + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; + + /* + * RDPID causes #UD if disabled through secondary execution controls. + * Because it is marked as EmulateOnUD, we need to intercept it here. + */ + if (info->intercept == x86_intercept_rdtscp && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) { + ctxt->exception.vector = UD_VECTOR; + ctxt->exception.error_code_valid = false; + return X86EMUL_PROPAGATE_FAULT; + } + + /* TODO: check more intercepts... */ return X86EMUL_CONTINUE; } -- cgit v1.2.3 From 858ac87fc2ff7be8fedfa8dd2ba47627068c2eae Mon Sep 17 00:00:00 2001 From: Gimcuan Hui Date: Fri, 3 Nov 2017 18:52:46 +0000 Subject: x86: kvm: mmu: make kvm_mmu_clear_all_pte_masks static The kvm_mmu_clear_all_pte_masks interface is only used by kvm_mmu_module_init locally, and does not need to be called by other module, make it static. This patch cleans up sparse warning: symbol 'kvm_mmu_clear_all_pte_masks' was not declared. Should it be static? Signed-off-by: Gimcuan Hui Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c4deb1f34faa..89da688784fa 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -381,7 +381,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, } EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes); -void kvm_mmu_clear_all_pte_masks(void) +static void kvm_mmu_clear_all_pte_masks(void) { shadow_user_mask = 0; shadow_accessed_mask = 0; -- cgit v1.2.3 From 0d37d26f11bc1c0085d0c89ba603e3bb2c7f7076 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 30 Nov 2017 12:04:22 +0000 Subject: KVM: x86: MMU: make array audit_point_name static The array audit_point_name is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: arch/x86/kvm/mmu_audit.c:22:12: warning: symbol 'audit_point_name' was not declared. Should it be static? 
Signed-off-by: Colin Ian King Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu_audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index d22ddbdf5e6e..1272861e77b9 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -19,7 +19,7 @@ #include -char const *audit_point_name[] = { +static char const *audit_point_name[] = { "pre page fault", "post page fault", "pre pte write", -- cgit v1.2.3 From 2a140f3b6e23a309453b6f68709a50ece543f0f4 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 29 Nov 2017 22:23:41 +0100 Subject: KVM: x86: prevent MWAIT in guest with buggy MONITOR MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bug prevents MWAIT from waking up after a write to the monitored cache line. KVM might emulate a CPU model that shouldn't have the bug, so the guest would not employ a workaround and possibly miss wakeups. Better to avoid the situation. Reviewed-by: Alexander Graf Acked-by: Borislav Petkov Acked-by: Michael S. Tsirkin Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d0b95b7a90b4..81f5f50794f6 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -281,6 +281,9 @@ static inline bool kvm_mwait_in_guest(void) return false; } + if (boot_cpu_has_bug(X86_BUG_MONITOR)) + return false; + /* * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as * they would allow guest to stop the CPU completely by disabling -- cgit v1.2.3 From 346f48fa311ceee7478b5b810ba050a4a22e0b2e Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 29 Nov 2017 22:23:42 +0100 Subject: KVM: x86: drop bogus MWAIT check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check was added in some iteration while trying to fix a reported OS X on Core 2 bug, but that bug is elsewhere. The comment is misleading because the guest can call MWAIT with ECX = 0 even if we enforce CPUID5_ECX_INTERRUPT_BREAK; the call would have the exactly the same effect as if the host didn't have the feature. A problem is that a QEMU feature exposes CPUID5_ECX_INTERRUPT_BREAK on CPUs that do not support it. Removing the check changes behavior on last Pentium 4 lines (Presler, Dempsey, and Tulsa, which had VMX and MONITOR while missing INTERRUPT_BREAK) when running a guest OS that uses MWAIT without checking for its presence (QEMU doesn't expose MONITOR). The only known OS that ignores the MONITOR flag is old Mac OS X and we allowed it to bug on Core 2 (MWAIT used to throw #UD and only that OS noticed), so we can save another 20 lines letting it bug on even older CPUs. Alternatively, we can return MWAIT exiting by default and let userspace toggle it. Reviewed-by: Alexander Graf Acked-by: Borislav Petkov Acked-by: Michael S. 
Tsirkin Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 81f5f50794f6..d15859ec5e92 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -265,8 +265,6 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) static inline bool kvm_mwait_in_guest(void) { - unsigned int eax, ebx, ecx, edx; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) return false; @@ -275,29 +273,10 @@ static inline bool kvm_mwait_in_guest(void) /* All AMD CPUs have a working MWAIT implementation */ return true; case X86_VENDOR_INTEL: - /* Handle Intel below */ - break; + return !boot_cpu_has_bug(X86_BUG_MONITOR); default: return false; } - - if (boot_cpu_has_bug(X86_BUG_MONITOR)) - return false; - - /* - * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as - * they would allow guest to stop the CPU completely by disabling - * interrupts then invoking MWAIT. - */ - if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) - return false; - - cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); - - if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) - return false; - - return true; } #endif -- cgit v1.2.3 From 431f5d4443964bcc58ac2e4c6fd8ace6dcd3a8d1 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 29 Nov 2017 22:23:43 +0100 Subject: KVM: x86: simplify kvm_mwait_in_guest() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If Intel/AMD implements MWAIT, we expect that it works well and only reject known bugs; no reason to do it the other way around for minor vendors. (Not that they are relevant ATM.) This allows further simplification of kvm_mwait_in_guest(). And use boot_cpu_has() instead of "cpu_has(&boot_cpu_data," while at it. Reviewed-by: Alexander Graf Acked-by: Borislav Petkov Acked-by: Michael S. Tsirkin Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index d15859ec5e92..c69f973111cb 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -265,18 +265,8 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) static inline bool kvm_mwait_in_guest(void) { - if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) - return false; - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_AMD: - /* All AMD CPUs have a working MWAIT implementation */ - return true; - case X86_VENDOR_INTEL: - return !boot_cpu_has_bug(X86_BUG_MONITOR); - default: - return false; - } + return boot_cpu_has(X86_FEATURE_MWAIT) && + !boot_cpu_has_bug(X86_BUG_MONITOR); } #endif -- cgit v1.2.3 From 52797bf9a875c4a30f846196386684e646e08a91 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Wed, 15 Nov 2017 13:43:14 +0200 Subject: KVM: x86: Add emulation of MSR_SMI_COUNT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This MSR returns the number of #SMIs that occurred on CPU since boot. It was seen to be used frequently by ESXi guest. Patch adds a new vcpu-arch specific var called smi_count to save the number of #SMIs which occurred on CPU since boot. It is exposed as a read-only MSR to guest (causing #GP on wrmsr) in RDMSR/WRMSR emulation code. MSR_SMI_COUNT is also added to emulated_msrs[] to make sure user-space can save/restore it for migration purposes. 
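On the userspace side, the save path for migration is the usual KVM_GET_MSRS flow; a hedged sketch follows, where the helper name and vcpu_fd are assumptions and 0x34 is the architectural MSR_SMI_COUNT index.

  /* Hedged sketch: read the emulated SMI count from a vCPU for save/restore. */
  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/kvm.h>

  #define MSR_SMI_COUNT 0x00000034

  static int read_smi_count(int vcpu_fd, unsigned long long *count)
  {
          struct {
                  struct kvm_msrs hdr;
                  struct kvm_msr_entry entry;
          } msrs;

          memset(&msrs, 0, sizeof(msrs));
          msrs.hdr.nmsrs = 1;
          msrs.entry.index = MSR_SMI_COUNT;

          /* KVM_GET_MSRS returns the number of MSRs successfully read. */
          if (ioctl(vcpu_fd, KVM_GET_MSRS, &msrs) != 1)
                  return -1;

          *count = msrs.entry.data;
          return 0;
  }

The restore side would use KVM_SET_MSRS, which succeeds because the write is host initiated, while a guest wrmsr still takes the #GP path added above.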
Signed-off-by: Liran Alon Suggested-by: Paolo Bonzini Reviewed-by: Nikita Leshenko Reviewed-by: Bhavesh Davda Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Paolo Bonzini Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 515db75081d1..340e3604dcc7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -504,6 +504,7 @@ struct kvm_vcpu_arch { int mp_state; u64 ia32_misc_enable_msr; u64 smbase; + u64 smi_count; bool tpr_access_reporting; u64 ia32_xss; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dd2e80ac49ff..7ea9a57b0684 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1039,6 +1039,7 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_SMI_COUNT, MSR_PLATFORM_INFO, MSR_MISC_FEATURES_ENABLES, }; @@ -2231,6 +2232,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.smbase = data; break; + case MSR_SMI_COUNT: + if (!msr_info->host_initiated) + return 1; + vcpu->arch.smi_count = data; + break; case MSR_KVM_WALL_CLOCK_NEW: case MSR_KVM_WALL_CLOCK: vcpu->kvm->arch.wall_clock = data; @@ -2505,6 +2511,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.smbase; break; + case MSR_SMI_COUNT: + msr_info->data = vcpu->arch.smi_count; + break; case MSR_IA32_PERF_STATUS: /* TSC increment by tick */ msr_info->data = 1000ULL; @@ -6451,6 +6460,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) kvm_x86_ops->queue_exception(vcpu); } else if (vcpu->arch.smi_pending && !is_smm(vcpu) && kvm_x86_ops->smi_allowed(vcpu)) { vcpu->arch.smi_pending = false; + ++vcpu->arch.smi_count; enter_smm(vcpu); } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) { --vcpu->arch.nmi_pending; @@ -7808,6 +7818,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.hflags = 0; vcpu->arch.smi_pending = 0; + vcpu->arch.smi_count = 0; atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; -- cgit v1.2.3 From 80fef315a74d79d765dbf58d9481843a364c50d6 Mon Sep 17 00:00:00 2001 From: Yang Zhong Date: Wed, 22 Nov 2017 15:27:29 +0800 Subject: KVM: Expose new cpu features to guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Intel IceLake cpu has added new cpu features,AVX512_VBMI2/GFNI/ VAES/VPCLMULQDQ/AVX512_VNNI/AVX512_BITALG. Those new cpu features need expose to guest VM. 
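Inside a guest these surface as ordinary CPUID bits; a hedged probe for them is sketched below (using GCC's __get_cpuid_count helper), with the exact bit positions listed right after it.

  /* Hedged guest-side probe for the new leaf-7 ECX bits. */
  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                  return 1;

          printf("VBMI2 %u GFNI %u VAES %u VPCLMULQDQ %u VNNI %u BITALG %u\n",
                 !!(ecx & (1u << 6)), !!(ecx & (1u << 8)), !!(ecx & (1u << 9)),
                 !!(ecx & (1u << 10)), !!(ecx & (1u << 11)), !!(ecx & (1u << 12)));
          return 0;
  }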
The bit definition: CPUID.(EAX=7,ECX=0):ECX[bit 06] AVX512_VBMI2 CPUID.(EAX=7,ECX=0):ECX[bit 08] GFNI CPUID.(EAX=7,ECX=0):ECX[bit 09] VAES CPUID.(EAX=7,ECX=0):ECX[bit 10] VPCLMULQDQ CPUID.(EAX=7,ECX=0):ECX[bit 11] AVX512_VNNI CPUID.(EAX=7,ECX=0):ECX[bit 12] AVX512_BITALG The release document ref below link: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf The kernel dependency commit in kvm.git: (c128dbfa0f879f8ce7b79054037889b0b2240728) Signed-off-by: Yang Zhong Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index b94371146533..3db9e1cd4f63 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -394,7 +394,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | - F(AVX512_VPOPCNTDQ) | F(UMIP); + F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | + F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = -- cgit v1.2.3 From 00647b44944a2f7212ac2c3825a465153d6e438f Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Mon, 27 Nov 2017 17:22:25 -0600 Subject: KVM: nVMX: Eliminate vmcs02 pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The potential performance advantages of a vmcs02 pool have never been realized. To simplify the code, eliminate the pool. Instead, a single vmcs02 is allocated per VCPU when the VCPU enters VMX operation. Signed-off-by: Jim Mattson Signed-off-by: Mark Kanda Reviewed-by: Ameya More Reviewed-by: David Hildenbrand Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 146 +++++++++-------------------------------------------- 1 file changed, 23 insertions(+), 123 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8dafc5d7d4d6..d1870f3c8c69 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -184,7 +184,6 @@ module_param(ple_window_max, int, S_IRUGO); extern const ulong vmx_return; #define NR_AUTOLOAD_MSRS 8 -#define VMCS02_POOL_SIZE 1 struct vmcs { u32 revision_id; @@ -225,7 +224,7 @@ struct shared_msr_entry { * stored in guest memory specified by VMPTRLD, but is opaque to the guest, * which must access it using VMREAD/VMWRITE/VMCLEAR instructions. * More than one of these structures may exist, if L1 runs multiple L2 guests. - * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the + * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the * underlying hardware which will be used to run L2. * This structure is packed to ensure that its layout is identical across * machines (necessary for live migration). @@ -408,13 +407,6 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE 0x1000 -/* Used to remember the last vmcs02 used for some recently used vmcs12s */ -struct vmcs02_list { - struct list_head list; - gpa_t vmptr; - struct loaded_vmcs vmcs02; -}; - /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. 
@@ -439,15 +431,15 @@ struct nested_vmx { */ bool sync_shadow_vmcs; - /* vmcs02_list cache of VMCSs recently used to run L2 guests */ - struct list_head vmcs02_pool; - int vmcs02_num; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. */ bool nested_run_pending; + + struct loaded_vmcs vmcs02; + /* - * Guest pages referred to in vmcs02 with host-physical pointers, so - * we must keep them pinned while L2 runs. + * Guest pages referred to in the vmcs02 with host-physical + * pointers, so we must keep them pinned while L2 runs. */ struct page *apic_access_page; struct page *virtual_apic_page; @@ -6988,94 +6980,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu) return handle_nop(vcpu); } -/* - * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12. - * We could reuse a single VMCS for all the L2 guests, but we also want the - * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this - * allows keeping them loaded on the processor, and in the future will allow - * optimizations where prepare_vmcs02 doesn't need to set all the fields on - * every entry if they never change. - * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE - * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first. - * - * The following functions allocate and free a vmcs02 in this pool. - */ - -/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */ -static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmx->nested.current_vmptr) { - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) { - /* Recycle the least recently used VMCS. */ - item = list_last_entry(&vmx->nested.vmcs02_pool, - struct vmcs02_list, list); - item->vmptr = vmx->nested.current_vmptr; - list_move(&item->list, &vmx->nested.vmcs02_pool); - return &item->vmcs02; - } - - /* Create a new VMCS */ - item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL); - if (!item) - return NULL; - item->vmcs02.vmcs = alloc_vmcs(); - item->vmcs02.shadow_vmcs = NULL; - if (!item->vmcs02.vmcs) { - kfree(item); - return NULL; - } - loaded_vmcs_init(&item->vmcs02); - item->vmptr = vmx->nested.current_vmptr; - list_add(&(item->list), &(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num++; - return &item->vmcs02; -} - -/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */ -static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) -{ - struct vmcs02_list *item; - list_for_each_entry(item, &vmx->nested.vmcs02_pool, list) - if (item->vmptr == vmptr) { - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - return; - } -} - -/* - * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs - * must be &vmx->vmcs01. - */ -static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) -{ - struct vmcs02_list *item, *n; - - WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); - list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - /* - * Something will leak if the above WARN triggers. Better than - * a use-after-free. 
- */ - if (vmx->loaded_vmcs == &item->vmcs02) - continue; - - free_loaded_vmcs(&item->vmcs02); - list_del(&item->list); - kfree(item); - vmx->nested.vmcs02_num--; - } -} - /* * The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(), * set the success or error code of an emulated VMX instruction, as specified @@ -7257,6 +7161,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs *shadow_vmcs; + vmx->nested.vmcs02.vmcs = alloc_vmcs(); + vmx->nested.vmcs02.shadow_vmcs = NULL; + if (!vmx->nested.vmcs02.vmcs) + goto out_vmcs02; + loaded_vmcs_init(&vmx->nested.vmcs02); + if (cpu_has_vmx_msr_bitmap()) { vmx->nested.msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); @@ -7279,9 +7189,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) vmx->vmcs01.shadow_vmcs = shadow_vmcs; } - INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool)); - vmx->nested.vmcs02_num = 0; - hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); vmx->nested.preemption_timer.function = vmx_preemption_timer_fn; @@ -7296,6 +7203,9 @@ out_cached_vmcs12: free_page((unsigned long)vmx->nested.msr_bitmap); out_msr_bitmap: + free_loaded_vmcs(&vmx->nested.vmcs02); + +out_vmcs02: return -ENOMEM; } @@ -7449,7 +7359,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->vmcs01.shadow_vmcs = NULL; } kfree(vmx->nested.cached_vmcs12); - /* Unpin physical memory we referred to in current vmcs02 */ + /* Unpin physical memory we referred to in the vmcs02 */ if (vmx->nested.apic_access_page) { kvm_release_page_dirty(vmx->nested.apic_access_page); vmx->nested.apic_access_page = NULL; @@ -7465,7 +7375,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } - nested_free_all_saved_vmcss(vmx); + free_loaded_vmcs(&vmx->nested.vmcs02); } /* Emulate the VMXOFF instruction */ @@ -7508,8 +7418,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) vmptr + offsetof(struct vmcs12, launch_state), &zero, sizeof(zero)); - nested_free_vmcs02(vmx, vmptr); - nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -8423,10 +8331,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason) /* * The host physical addresses of some pages of guest memory - * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU - * may write to these pages via their host physical address while - * L2 is running, bypassing any address-translation-based dirty - * tracking (e.g. EPT write protection). + * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC + * Page). The CPU may write to these pages via their host + * physical address while L2 is running, bypassing any + * address-translation-based dirty tracking (e.g. EPT write + * protection). * * Mark them dirty on every exit from L2 to prevent them from * getting out of sync with dirty tracking. 
@@ -10912,20 +10821,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - struct loaded_vmcs *vmcs02; u32 msr_entry_idx; u32 exit_qual; - vmcs02 = nested_get_current_vmcs02(vmx); - if (!vmcs02) - return -ENOMEM; - enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - vmx_switch_vmcs(vcpu, vmcs02); + vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { @@ -11543,10 +11447,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vm_exit_controls_reset_shadow(vmx); vmx_segment_cache_clear(vmx); - /* if no vmcs02 cache requested, remove the one we used */ - if (VMCS02_POOL_SIZE == 0) - nested_free_vmcs02(vmx, vmx->nested.current_vmptr); - /* Update any VMCS fields that might have changed while L2 ran */ vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); -- cgit v1.2.3 From 276c796cfef5bdaf9aae055f520b8857eaa3fa19 Mon Sep 17 00:00:00 2001 From: Mark Kanda Date: Mon, 27 Nov 2017 17:22:26 -0600 Subject: KVM: nVMX: Add a WARN for freeing a loaded VMCS02 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When attempting to free a loaded VMCS02, add a WARN and avoid freeing it (to avoid use-after-free situations). Suggested-by: Paolo Bonzini Signed-off-by: Mark Kanda Reviewed-by: Ameya More Reviewed-by: Krish Sadhukhan Reviewed-by: David Hildenbrand Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d1870f3c8c69..3b5f70285414 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3846,6 +3846,19 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs) WARN_ON(loaded_vmcs->shadow_vmcs != NULL); } +static void vmx_nested_free_vmcs02(struct vcpu_vmx *vmx) +{ + struct loaded_vmcs *loaded_vmcs = &vmx->nested.vmcs02; + + /* + * Just leak the VMCS02 if the WARN triggers. Better than + * a use-after-free. + */ + if (WARN_ON(vmx->loaded_vmcs == loaded_vmcs)) + return; + free_loaded_vmcs(loaded_vmcs); +} + static void free_kvm_area(void) { int cpu; @@ -7203,7 +7216,7 @@ out_cached_vmcs12: free_page((unsigned long)vmx->nested.msr_bitmap); out_msr_bitmap: - free_loaded_vmcs(&vmx->nested.vmcs02); + vmx_nested_free_vmcs02(vmx); out_vmcs02: return -ENOMEM; @@ -7375,7 +7388,7 @@ static void free_nested(struct vcpu_vmx *vmx) vmx->nested.pi_desc = NULL; } - free_loaded_vmcs(&vmx->nested.vmcs02); + vmx_nested_free_vmcs02(vmx); } /* Emulate the VMXOFF instruction */ -- cgit v1.2.3 From 74c55931c71352317ae0f5736ee9e4ca07ba4238 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 29 Nov 2017 01:31:20 -0800 Subject: KVM: VMX: Cache IA32_DEBUGCTL in memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MSR_IA32_DEBUGCTLMSR is zeroed on VMEXIT, so it is saved/restored each time during world switch. This patch caches the host IA32_DEBUGCTL MSR and saves/restores the host IA32_DEBUGCTL msr when guest/host switches to avoid to save/restore each time during world switch. This saves about 100 clock cycles per vmexit. 
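Condensed from the diff below, the change amounts to reading the MSR once when the vCPU is scheduled onto a physical CPU and reusing that cached value after every vmexit (a sketch of the two sites, not the full patch):

/* vmx_vcpu_load(): runs when the vCPU is (re)loaded on a pCPU, not on every vmexit */
vmx->host_debugctlmsr = get_debugctlmsr();

/* vmx_vcpu_run(), after the VM-exit has zeroed MSR_IA32_DEBUGCTLMSR: */
if (vmx->host_debugctlmsr)
	update_debugctlmsr(vmx->host_debugctlmsr);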
Suggested-by: Jim Mattson Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Reviewed-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3b5f70285414..4e7da90b4426 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -650,6 +650,8 @@ struct vcpu_vmx { u32 host_pkru; + unsigned long host_debugctlmsr; + /* * Only bits masked by msr_ia32_feature_control_valid_bits can be set in * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included @@ -2318,6 +2320,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vmx_vcpu_pi_load(vcpu, cpu); vmx->host_pkru = read_pkru(); + vmx->host_debugctlmsr = get_debugctlmsr(); } static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) @@ -9261,7 +9264,7 @@ static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long debugctlmsr, cr3, cr4; + unsigned long cr3, cr4; /* Record the guest's net vcpu time for enforced NMI injections. */ if (unlikely(!enable_vnmi && @@ -9314,7 +9317,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) __write_pkru(vcpu->arch.pkru); atomic_switch_perf_msrs(vmx); - debugctlmsr = get_debugctlmsr(); vmx_arm_hv_timer(vcpu); @@ -9425,8 +9427,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ); /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */ - if (debugctlmsr) - update_debugctlmsr(debugctlmsr); + if (vmx->host_debugctlmsr) + update_debugctlmsr(vmx->host_debugctlmsr); #ifndef CONFIG_X86_64 /* -- cgit v1.2.3 From 9c48d517ce6da398b8cff0603b75b366759023c4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 1 Dec 2017 00:15:10 -0800 Subject: KVM: X86: Reduce the overhead when lapic_timer_advance is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When I run ebizzy in a 32 vCPUs guest on a 32 pCPUs Xeon box, I can observe ~8000 kvm_wait_lapic_expire CurAvg/s through the kvm_stat tool even if the advance tscdeadline hrtimer expiration is disabled. Each call to wait_lapic_expire() will consume ~70 cycles when a timer fires since apic_timer_expire() will set expired_tscdeadline and then wait_lapic_expire() will do some calculation before bailing out. So total ~175us per second is lost on this 3.2GHz machine. This patch reduces the overhead by skipping the function wait_lapic_expire() when lapic_timer_advance is disabled.
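For reference, the arithmetic behind that estimate, using the figures above: roughly 8000 calls/s * 70 cycles = 560,000 cycles/s, and 560,000 cycles / 3.2e9 cycles/s is about 175 microseconds of CPU time per second. The fix itself is the lapic_timer_advance_ns guard around wait_lapic_expire() visible in the diff below.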
Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7ea9a57b0684..54d66f238e20 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7018,7 +7018,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } trace_kvm_entry(vcpu->vcpu_id); - wait_lapic_expire(vcpu); + if (lapic_timer_advance_ns) + wait_lapic_expire(vcpu); guest_enter_irqoff(); if (unlikely(vcpu->arch.switch_db_regs)) { -- cgit v1.2.3 From 8eb73e2d410f00d383023fe41c0c25c6195b7389 Mon Sep 17 00:00:00 2001 From: Quan Xu Date: Tue, 12 Dec 2017 16:44:21 +0800 Subject: KVM: VMX: drop I/O permission bitmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since KVM removes the only I/O port 0x80 bypass on Intel hosts, clear CPU_BASED_USE_IO_BITMAPS and set CPU_BASED_UNCOND_IO_EXITING bit. Then these I/O permission bitmaps are not used at all, so drop I/O permission bitmaps. Signed-off-by: Jim Mattson Signed-off-by: Radim KrÄmář Signed-off-by: Quan Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 18 +++--------------- 1 file changed, 3 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4e7da90b4426..b016cf1b9f77 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -937,8 +937,6 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); enum { - VMX_IO_BITMAP_A, - VMX_IO_BITMAP_B, VMX_MSR_BITMAP_LEGACY, VMX_MSR_BITMAP_LONGMODE, VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, @@ -952,8 +950,6 @@ enum { static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; -#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) -#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) #define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) #define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) #define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) @@ -3627,7 +3623,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) #endif CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_USE_IO_BITMAPS | + CPU_BASED_UNCOND_IO_EXITING | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | CPU_BASED_INVLPG_EXITING | @@ -5468,10 +5464,6 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx) #endif int i; - /* I/O */ - vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a)); - vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b)); - if (enable_shadow_vmcs) { vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap)); vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap)); @@ -6783,10 +6775,6 @@ static __init int hardware_setup(void) memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); - memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); - - memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); - memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE); memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE); @@ -10563,8 +10551,8 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } /* - * Merging of IO bitmap not currently supported. - * Rather, exit every time. + * A vmexit (to either L1 hypervisor or L0 userspace) is always needed + * for I/O port accesses. 
*/ exec_control &= ~CPU_BASED_USE_IO_BITMAPS; exec_control |= CPU_BASED_UNCOND_IO_EXITING; -- cgit v1.2.3 From ec7660ccdd2b71d8c7f0243f8590253713e9b75d Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:23 +0100 Subject: KVM: Take vcpu->mutex outside vcpu_load As we're about to call vcpu_load() from architecture-specific implementations of the KVM vcpu ioctls, but yet we access data structures protected by the vcpu->mutex in the generic code, factor this logic out from vcpu_load(). x86 is the only architecture which calls vcpu_load() outside of the main vcpu ioctl function, and these calls will no longer take the vcpu mutex following this patch. However, with the exception of kvm_arch_vcpu_postcreate (see below), the callers are either in the creation or destruction path of the VCPU, which means there cannot be any concurrent access to the data structure, because the file descriptor is not yet accessible, or is already gone. kvm_arch_vcpu_postcreate makes the newly created vcpu potentially accessible by other in-kernel threads through the kvm->vcpus array, and we therefore take the vcpu mutex in this case directly. Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 4 +--- arch/x86/kvm/x86.c | 20 +++++++------------- include/linux/kvm_host.h | 2 +- virt/kvm/kvm_main.c | 17 ++++++----------- 4 files changed, 15 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b016cf1b9f77..ef7d13e536a4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9496,10 +9496,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int r; - r = vcpu_load(vcpu); - BUG_ON(r); + vcpu_load(vcpu); vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54d66f238e20..3f2c78f58570 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7767,16 +7767,12 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) { - int r; - kvm_vcpu_mtrr_init(vcpu); - r = vcpu_load(vcpu); - if (r) - return r; + vcpu_load(vcpu); kvm_vcpu_reset(vcpu, false); kvm_mmu_setup(vcpu); vcpu_put(vcpu); - return r; + return 0; } void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) @@ -7786,13 +7782,15 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) kvm_hv_vcpu_postcreate(vcpu); - if (vcpu_load(vcpu)) + if (mutex_lock_killable(&vcpu->mutex)) return; + vcpu_load(vcpu); msr.data = 0x0; msr.index = MSR_IA32_TSC; msr.host_initiated = true; kvm_write_tsc(vcpu, &msr); vcpu_put(vcpu); + mutex_unlock(&vcpu->mutex); if (!kvmclock_periodic_sync) return; @@ -7803,11 +7801,9 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - int r; vcpu->arch.apf.msr_val = 0; - r = vcpu_load(vcpu); - BUG_ON(r); + vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); @@ -8179,9 +8175,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) { - int r; - r = vcpu_load(vcpu); - BUG_ON(r); + vcpu_load(vcpu); kvm_mmu_unload(vcpu); vcpu_put(vcpu); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 6bdd4b9f6611..09de0ff3d677 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -533,7 +533,7 @@ static inline int 
kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id); void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); -int __must_check vcpu_load(struct kvm_vcpu *vcpu); +void vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index bc092e3d1d73..c4d116b336f4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -151,17 +151,12 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) /* * Switches to specified vcpu, until a matching vcpu_put() */ -int vcpu_load(struct kvm_vcpu *vcpu) +void vcpu_load(struct kvm_vcpu *vcpu) { - int cpu; - - if (mutex_lock_killable(&vcpu->mutex)) - return -EINTR; - cpu = get_cpu(); + int cpu = get_cpu(); preempt_notifier_register(&vcpu->preempt_notifier); kvm_arch_vcpu_load(vcpu, cpu); put_cpu(); - return 0; } EXPORT_SYMBOL_GPL(vcpu_load); @@ -171,7 +166,6 @@ void vcpu_put(struct kvm_vcpu *vcpu) kvm_arch_vcpu_put(vcpu); preempt_notifier_unregister(&vcpu->preempt_notifier); preempt_enable(); - mutex_unlock(&vcpu->mutex); } EXPORT_SYMBOL_GPL(vcpu_put); @@ -2560,9 +2554,9 @@ static long kvm_vcpu_ioctl(struct file *filp, #endif - r = vcpu_load(vcpu); - if (r) - return r; + if (mutex_lock_killable(&vcpu->mutex)) + return -EINTR; + vcpu_load(vcpu); switch (ioctl) { case KVM_RUN: { struct pid *oldpid; @@ -2735,6 +2729,7 @@ out_free1: } out: vcpu_put(vcpu); + mutex_unlock(&vcpu->mutex); kfree(fpu); kfree(kvm_sregs); return r; -- cgit v1.2.3 From accb757d798c9b4d85cfe3e5972134c586525168 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:25 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_run Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_run(). Signed-off-by: Christoffer Dall Reviewed-by: Christian Borntraeger # s390 parts Reviewed-by: Cornelia Huck [Rebased. 
- Paolo] Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 +++ arch/powerpc/kvm/powerpc.c | 6 +++++- arch/s390/kvm/kvm-s390.c | 10 ++++++++-- arch/x86/kvm/x86.c | 3 ++- virt/kvm/arm/arm.c | 20 ++++++++++++++------ virt/kvm/kvm_main.c | 2 -- 6 files changed, 32 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 75fdeaa8c62f..ba5ecf22bb96 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -446,6 +446,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r = -EINTR; + vcpu_load(vcpu); + kvm_sigset_activate(vcpu); if (vcpu->mmio_needed) { @@ -480,6 +482,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) out: kvm_sigset_deactivate(vcpu); + vcpu_put(vcpu); return r; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 1915e86cef6f..4e2167a7ae19 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1408,6 +1408,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) { int r; + vcpu_load(vcpu); + if (vcpu->mmio_needed) { vcpu->mmio_needed = 0; if (!vcpu->mmio_is_write) @@ -1422,7 +1424,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run); if (r == RESUME_HOST) { vcpu->mmio_needed = 1; - return r; + goto out; } } #endif @@ -1456,6 +1458,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_sigset_deactivate(vcpu); +out: + vcpu_put(vcpu); return r; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ec8b68e97d3c..7972598e60d0 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3373,9 +3373,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (kvm_run->immediate_exit) return -EINTR; + vcpu_load(vcpu); + if (guestdbg_exit_pending(vcpu)) { kvm_s390_prepare_debug_exit(vcpu); - return 0; + rc = 0; + goto out; } kvm_sigset_activate(vcpu); @@ -3385,7 +3388,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } else if (is_vcpu_stopped(vcpu)) { pr_err_ratelimited("can't run stopped vcpu %d\n", vcpu->vcpu_id); - return -EINVAL; + rc = -EINVAL; + goto out; } sync_regs(vcpu, kvm_run); @@ -3415,6 +3419,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_sigset_deactivate(vcpu); vcpu->stat.exit_userspace++; +out: + vcpu_put(vcpu); return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f2c78f58570..af9da75011bc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7280,8 +7280,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; + vcpu_load(vcpu); kvm_sigset_activate(vcpu); - kvm_load_guest_fpu(vcpu); if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { @@ -7328,6 +7328,7 @@ out: post_kvm_run_save(vcpu); kvm_sigset_deactivate(vcpu); + vcpu_put(vcpu); return r; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 6b60c98a6e22..a1b2e8a43ca0 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -619,21 +619,27 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) if (unlikely(!kvm_vcpu_initialized(vcpu))) return -ENOEXEC; + vcpu_load(vcpu); + ret = kvm_vcpu_first_run_init(vcpu); if (ret) - return ret; + goto out; if (run->exit_reason == KVM_EXIT_MMIO) { ret = kvm_handle_mmio_return(vcpu, vcpu->run); if (ret) - return ret; - if (kvm_arm_handle_step_debug(vcpu, 
vcpu->run)) - return 0; + goto out; + if (kvm_arm_handle_step_debug(vcpu, vcpu->run)) { + ret = 0; + goto out; + } } - if (run->immediate_exit) - return -EINTR; + if (run->immediate_exit) { + ret = -EINTR; + goto out; + } kvm_sigset_activate(vcpu); @@ -772,6 +778,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) kvm_sigset_deactivate(vcpu); +out: + vcpu_put(vcpu); return ret; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7bbaad8717a2..0b149827570c 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2562,7 +2562,6 @@ static long kvm_vcpu_ioctl(struct file *filp, r = -EINVAL; if (arg) goto out; - vcpu_load(vcpu); oldpid = rcu_access_pointer(vcpu->pid); if (unlikely(oldpid != current->pids[PIDTYPE_PID].pid)) { /* The thread running this VCPU changed. */ @@ -2574,7 +2573,6 @@ static long kvm_vcpu_ioctl(struct file *filp, put_pid(oldpid); } r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); - vcpu_put(vcpu); trace_kvm_userspace_exit(vcpu->run->exit_reason, r); break; } -- cgit v1.2.3 From 1fc9b76b3dd2c57ca0fe42742043a5c3cbdc41c1 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:26 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_regs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_regs(). Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 +++ arch/powerpc/kvm/book3s.c | 3 +++ arch/powerpc/kvm/booke.c | 3 +++ arch/s390/kvm/kvm-s390.c | 2 ++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 6 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index ba5ecf22bb96..6023b5f808c0 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1162,6 +1162,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + for (i = 0; i < ARRAY_SIZE(vcpu->arch.gprs); i++) regs->gpr[i] = vcpu->arch.gprs[i]; @@ -1169,6 +1171,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->lo = vcpu->arch.lo; regs->pc = vcpu->arch.pc; + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 72d977e30952..d85bfd733ccd 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -497,6 +497,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + regs->pc = kvmppc_get_pc(vcpu); regs->cr = kvmppc_get_cr(vcpu); regs->ctr = kvmppc_get_ctr(vcpu); @@ -518,6 +520,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 83b485810aea..e0e4f04c5535 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1431,6 +1431,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + regs->pc = vcpu->arch.pc; regs->cr = kvmppc_get_cr(vcpu); regs->ctr = vcpu->arch.ctr; @@ -1452,6 +1454,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) regs->gpr[i] = kvmppc_get_gpr(vcpu, i); + vcpu_put(vcpu); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c 
b/arch/s390/kvm/kvm-s390.c index 7972598e60d0..44317813f498 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2715,7 +2715,9 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); memcpy(®s->gprs, &vcpu->run->s.regs.gprs, sizeof(regs->gprs)); + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index af9da75011bc..3364c6d4f743 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7334,6 +7334,8 @@ out: int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { /* * We are here if userspace calls get_regs() in the middle of @@ -7367,6 +7369,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_get_rflags(vcpu); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0b149827570c..6dab2e6f8321 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2583,9 +2583,7 @@ static long kvm_vcpu_ioctl(struct file *filp, kvm_regs = kzalloc(sizeof(struct kvm_regs), GFP_KERNEL); if (!kvm_regs) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_regs(vcpu, kvm_regs); - vcpu_put(vcpu); if (r) goto out_free1; r = -EFAULT; -- cgit v1.2.3 From 875656fe0c8473c544860d557ca1512753d6aeef Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:27 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_regs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_regs(). Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 3 +++ arch/powerpc/kvm/book3s.c | 3 +++ arch/powerpc/kvm/booke.c | 3 +++ arch/s390/kvm/kvm-s390.c | 2 ++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 6 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 6023b5f808c0..dd5f463b0e72 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -1148,6 +1148,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + for (i = 1; i < ARRAY_SIZE(vcpu->arch.gprs); i++) vcpu->arch.gprs[i] = regs->gpr[i]; vcpu->arch.gprs[0] = 0; /* zero is special, and cannot be set. 
*/ @@ -1155,6 +1157,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) vcpu->arch.lo = regs->lo; vcpu->arch.pc = regs->pc; + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index d85bfd733ccd..24bc7aabfc44 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -528,6 +528,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + kvmppc_set_pc(vcpu, regs->pc); kvmppc_set_cr(vcpu, regs->cr); kvmppc_set_ctr(vcpu, regs->ctr); @@ -548,6 +550,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index e0e4f04c5535..bcbbeddc3430 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1462,6 +1462,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { int i; + vcpu_load(vcpu); + vcpu->arch.pc = regs->pc; kvmppc_set_cr(vcpu, regs->cr); vcpu->arch.ctr = regs->ctr; @@ -1483,6 +1485,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) kvmppc_set_gpr(vcpu, i, regs->gpr[i]); + vcpu_put(vcpu); return 0; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 44317813f498..ab08823a854b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2709,7 +2709,9 @@ static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); memcpy(&vcpu->run->s.regs.gprs, ®s->gprs, sizeof(regs->gprs)); + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3364c6d4f743..f82e87bb3a51 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7375,6 +7375,8 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu_load(vcpu); + vcpu->arch.emulate_regs_need_sync_from_vcpu = true; vcpu->arch.emulate_regs_need_sync_to_vcpu = false; @@ -7404,6 +7406,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6dab2e6f8321..e25c1a1a4120 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2603,9 +2603,7 @@ out_free1: r = PTR_ERR(kvm_regs); goto out; } - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_regs(vcpu, kvm_regs); - vcpu_put(vcpu); kfree(kvm_regs); break; } -- cgit v1.2.3 From bcdec41cefbea525ad424050650acb0f2eed1378 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:28 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_sregs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_sregs(). 
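As with the other conversions in this series, the generic dispatcher in virt/kvm/kvm_main.c stops wrapping the call and each architecture brackets its own handler instead. A condensed sketch of the shape every converted handler ends up with (body elided):

int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
	vcpu_load(vcpu);	/* register the preempt notifier, kvm_arch_vcpu_load() */
	/* ... copy the special-register state of the loaded vCPU into *sregs ... */
	vcpu_put(vcpu);		/* kvm_arch_vcpu_put(), unregister the notifier */
	return 0;
}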
Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s.c | 8 +++++++- arch/powerpc/kvm/booke.c | 9 ++++++++- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 5 files changed, 22 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 24bc7aabfc44..6cc2377549f7 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -484,7 +484,13 @@ void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + int ret; + + vcpu_load(vcpu); + ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + vcpu_put(vcpu); + + return ret; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index bcbbeddc3430..f647e121070e 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1613,11 +1613,18 @@ int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + int ret; + + vcpu_load(vcpu); + sregs->pvr = vcpu->arch.pvr; get_sregs_base(vcpu, sregs); get_sregs_arch206(vcpu, sregs); - return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + ret = vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); + + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ab08823a854b..87544beff507 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2734,8 +2734,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + vcpu_load(vcpu); + memcpy(&sregs->acrs, &vcpu->run->s.regs.acrs, sizeof(sregs->acrs)); memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs)); + + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f82e87bb3a51..66a8d8cd00f6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7425,6 +7425,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, { struct desc_ptr dt; + vcpu_load(vcpu); + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); @@ -7456,6 +7458,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, set_bit(vcpu->arch.interrupt.nr, (unsigned long *)sregs->interrupt_bitmap); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e25c1a1a4120..4c2f6a4d1852 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2612,9 +2612,7 @@ out_free1: r = -ENOMEM; if (!kvm_sregs) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, kvm_sregs); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; -- cgit v1.2.3 From b4ef9d4e8cb8938e6c0aa3be672b0aeeb791ecf3 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:29 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_sregs Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_sregs(). 
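Handlers that can fail part-way are converted with the same pattern, except that early returns become jumps to a common label so vcpu_put() always runs. A sketch of that variant (the validation helper is a placeholder standing in for the arch-specific checks):

int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
	int ret = -EINVAL;

	vcpu_load(vcpu);
	if (!sregs_are_valid(vcpu, sregs))	/* placeholder check */
		goto out;
	/* ... apply the new special-register state to the vCPU ... */
	ret = 0;
out:
	vcpu_put(vcpu);
	return ret;
}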
Signed-off-by: Christoffer Dall Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/book3s.c | 8 +++++++- arch/powerpc/kvm/booke.c | 15 ++++++++++----- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 12 +++++++++--- virt/kvm/kvm_main.c | 2 -- 5 files changed, 30 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 6cc2377549f7..047651622cb8 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -496,7 +496,13 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); + int ret; + + vcpu_load(vcpu); + ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); + vcpu_put(vcpu); + + return ret; } int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index f647e121070e..38939a204e26 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1630,20 +1630,25 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { - int ret; + int ret = -EINVAL; + vcpu_load(vcpu); if (vcpu->arch.pvr != sregs->pvr) - return -EINVAL; + goto out; ret = set_sregs_base(vcpu, sregs); if (ret < 0) - return ret; + goto out; ret = set_sregs_arch206(vcpu, sregs); if (ret < 0) - return ret; + goto out; + + ret = vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); - return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +out: + vcpu_put(vcpu); + return ret; } int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 87544beff507..d27a1dd0986b 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2726,8 +2726,12 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) { + vcpu_load(vcpu); + memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs)); memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); + + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 66a8d8cd00f6..f01a7d73dad7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7525,15 +7525,18 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, int mmu_reset_needed = 0; int pending_vec, max_bits, idx; struct desc_ptr dt; + int ret = -EINVAL; + + vcpu_load(vcpu); if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (sregs->cr4 & X86_CR4_OSXSAVE)) - return -EINVAL; + goto out; apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; if (kvm_set_apic_base(vcpu, &apic_base_msr)) - return -EINVAL; + goto out; dt.size = sregs->idt.limit; dt.address = sregs->idt.base; @@ -7599,7 +7602,10 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_make_request(KVM_REQ_EVENT, vcpu); - return 0; + ret = 0; +out: + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4c2f6a4d1852..e04a216bcd14 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2628,9 +2628,7 @@ out_free1: kvm_sregs = NULL; goto out; } - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, kvm_sregs); - vcpu_put(vcpu); break; } case 
KVM_GET_MP_STATE: { -- cgit v1.2.3 From fd2325612c1493c85cce89ea16b2396baca83311 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:30 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_mpstate Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_mpstate(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 11 +++++++++-- arch/x86/kvm/x86.c | 3 +++ virt/kvm/arm/arm.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 4 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index d27a1dd0986b..4297f5094a88 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2833,9 +2833,16 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int ret; + + vcpu_load(vcpu); + /* CHECK_STOP and LOAD are not supported yet */ - return is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED : - KVM_MP_STATE_OPERATING; + ret = is_vcpu_stopped(vcpu) ? KVM_MP_STATE_STOPPED : + KVM_MP_STATE_OPERATING; + + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f01a7d73dad7..80791939065f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7465,6 +7465,8 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); + kvm_apic_accept_events(vcpu); if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED && vcpu->arch.pv.pv_unhalted) @@ -7472,6 +7474,7 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, else mp_state->mp_state = vcpu->arch.mp_state; + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index a1b2e8a43ca0..65d50100ba3c 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -381,11 +381,14 @@ static void vcpu_power_off(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + vcpu_load(vcpu); + if (vcpu->arch.power_off) mp_state->mp_state = KVM_MP_STATE_STOPPED; else mp_state->mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e04a216bcd14..8e2f582417c3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2634,9 +2634,7 @@ out_free1: case KVM_GET_MP_STATE: { struct kvm_mp_state mp_state; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, &mp_state); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; -- cgit v1.2.3 From e83dff5edf0c3f014e4b4ac5e1c86dbe797687c7 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:31 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_mpstate Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_mpstate(). 
Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 3 +++ arch/x86/kvm/x86.c | 14 +++++++++++--- virt/kvm/arm/arm.c | 9 +++++++-- virt/kvm/kvm_main.c | 2 -- 4 files changed, 21 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4297f5094a88..4e6304d45ca5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2850,6 +2850,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, { int rc = 0; + vcpu_load(vcpu); + /* user space knows about this interface - let it control the state */ vcpu->kvm->arch.user_cpu_state_ctrl = 1; @@ -2867,6 +2869,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, rc = -ENXIO; } + vcpu_put(vcpu); return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 80791939065f..a9e6cca1f46e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7481,15 +7481,19 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int ret = -EINVAL; + + vcpu_load(vcpu); + if (!lapic_in_kernel(vcpu) && mp_state->mp_state != KVM_MP_STATE_RUNNABLE) - return -EINVAL; + goto out; /* INITs are latched while in SMM */ if ((is_smm(vcpu) || vcpu->arch.smi_pending) && (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) - return -EINVAL; + goto out; if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; @@ -7497,7 +7501,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, } else vcpu->arch.mp_state = mp_state->mp_state; kvm_make_request(KVM_REQ_EVENT, vcpu); - return 0; + + ret = 0; +out: + vcpu_put(vcpu); + return ret; } int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 65d50100ba3c..aa6167086211 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -395,6 +395,10 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, struct kvm_mp_state *mp_state) { + int ret = 0; + + vcpu_load(vcpu); + switch (mp_state->mp_state) { case KVM_MP_STATE_RUNNABLE: vcpu->arch.power_off = false; @@ -403,10 +407,11 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_power_off(vcpu); break; default: - return -EINVAL; + ret = -EINVAL; } - return 0; + vcpu_put(vcpu); + return ret; } /** diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8e2f582417c3..4b5aeb4b8c58 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2649,9 +2649,7 @@ out_free1: r = -EFAULT; if (copy_from_user(&mp_state, argp, sizeof(mp_state))) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, &mp_state); - vcpu_put(vcpu); break; } case KVM_TRANSLATE: { -- cgit v1.2.3 From 1da5b61dac98360a7e50b1565f6d499c6fc8123a Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:32 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_translate Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_translate(). 
Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/powerpc/kvm/booke.c | 2 ++ arch/x86/kvm/x86.c | 3 +++ virt/kvm/kvm_main.c | 2 -- 3 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 38939a204e26..b5f46517c839 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1791,7 +1791,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, { int r; + vcpu_load(vcpu); r = kvmppc_core_vcpu_translate(vcpu, tr); + vcpu_put(vcpu); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a9e6cca1f46e..080e00dff426 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7684,6 +7684,8 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, gpa_t gpa; int idx; + vcpu_load(vcpu); + idx = srcu_read_lock(&vcpu->kvm->srcu); gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL); srcu_read_unlock(&vcpu->kvm->srcu, idx); @@ -7692,6 +7694,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, tr->writeable = 1; tr->usermode = 0; + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4b5aeb4b8c58..37217ec647b4 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2658,9 +2658,7 @@ out_free1: r = -EFAULT; if (copy_from_user(&tr, argp, sizeof(tr))) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_translate(vcpu, &tr); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; -- cgit v1.2.3 From 66b5656222990f1a536f5900ccd98539f9cf231f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:33 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_guest_debug Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_guest_debug(). 
Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/arm64/kvm/guest.c | 15 ++++++++++++--- arch/powerpc/kvm/book3s.c | 2 ++ arch/powerpc/kvm/booke.c | 19 +++++++++++++------ arch/s390/kvm/kvm-s390.c | 16 ++++++++++++---- arch/x86/kvm/x86.c | 4 +++- virt/kvm/kvm_main.c | 2 -- 6 files changed, 42 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 5c7f657dd207..d7e3299a7734 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -361,10 +361,16 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { + int ret = 0; + + vcpu_load(vcpu); + trace_kvm_set_guest_debug(vcpu, dbg->control); - if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) - return -EINVAL; + if (dbg->control & ~KVM_GUESTDBG_VALID_MASK) { + ret = -EINVAL; + goto out; + } if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; @@ -378,7 +384,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, /* If not enabled clear all flags */ vcpu->guest_debug = 0; } - return 0; + +out: + vcpu_put(vcpu); + return ret; } int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu, diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 047651622cb8..234531d1bee1 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -755,7 +755,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { + vcpu_load(vcpu); vcpu->guest_debug = dbg->control; + vcpu_put(vcpu); return 0; } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index b5f46517c839..6038e2e7aee0 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -2016,12 +2016,15 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, { struct debug_reg *dbg_reg; int n, b = 0, w = 0; + int ret = 0; + + vcpu_load(vcpu); if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { vcpu->arch.dbg_reg.dbcr0 = 0; vcpu->guest_debug = 0; kvm_guest_protect_msr(vcpu, MSR_DE, false); - return 0; + goto out; } kvm_guest_protect_msr(vcpu, MSR_DE, true); @@ -2053,8 +2056,9 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, #endif if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) - return 0; + goto out; + ret = -EINVAL; for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { uint64_t addr = dbg->arch.bp[n].addr; uint32_t type = dbg->arch.bp[n].type; @@ -2065,21 +2069,24 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, if (type & ~(KVMPPC_DEBUG_WATCH_READ | KVMPPC_DEBUG_WATCH_WRITE | KVMPPC_DEBUG_BREAKPOINT)) - return -EINVAL; + goto out; if (type & KVMPPC_DEBUG_BREAKPOINT) { /* Setting H/W breakpoint */ if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) - return -EINVAL; + goto out; } else { /* Setting H/W watchpoint */ if (kvmppc_booke_add_watchpoint(dbg_reg, addr, type, w++)) - return -EINVAL; + goto out; } } - return 0; + ret = 0; +out: + vcpu_put(vcpu); + return ret; } void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4e6304d45ca5..1c82dda07bc5 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2801,13 +2801,19 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, { int rc = 0; + vcpu_load(vcpu); + vcpu->guest_debug = 0; 
kvm_s390_clear_bp_data(vcpu); - if (dbg->control & ~VALID_GUESTDBG_FLAGS) - return -EINVAL; - if (!sclp.has_gpere) - return -EINVAL; + if (dbg->control & ~VALID_GUESTDBG_FLAGS) { + rc = -EINVAL; + goto out; + } + if (!sclp.has_gpere) { + rc = -EINVAL; + goto out; + } if (dbg->control & KVM_GUESTDBG_ENABLE) { vcpu->guest_debug = dbg->control; @@ -2827,6 +2833,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, atomic_andnot(CPUSTAT_P, &vcpu->arch.sie_block->cpuflags); } +out: + vcpu_put(vcpu); return rc; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 080e00dff426..9ef0b9299e93 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7625,6 +7625,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, unsigned long rflags; int i, r; + vcpu_load(vcpu); + if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { r = -EBUSY; if (vcpu->arch.exception.pending) @@ -7670,7 +7672,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, r = 0; out: - + vcpu_put(vcpu); return r; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 37217ec647b4..b44762fcaf84 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2673,9 +2673,7 @@ out_free1: r = -EFAULT; if (copy_from_user(&dbg, argp, sizeof(dbg))) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, &dbg); - vcpu_put(vcpu); break; } case KVM_SET_SIGNAL_MASK: { -- cgit v1.2.3 From 1393123e1e24aba96413d351b9546086ea07504d Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:34 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_get_fpu Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_get_fpu(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 4 ++++ arch/x86/kvm/x86.c | 7 +++++-- virt/kvm/kvm_main.c | 2 -- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 1c82dda07bc5..39327888bb06 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2762,6 +2762,8 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { + vcpu_load(vcpu); + /* make sure we have the latest values */ save_fpu_regs(); if (MACHINE_HAS_VX) @@ -2770,6 +2772,8 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) else memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs)); fpu->fpc = vcpu->run->s.regs.fpc; + + vcpu_put(vcpu); return 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9ef0b9299e93..846d292ea33b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7702,9 +7702,11 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxregs_state *fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + struct fxregs_state *fxsave; + vcpu_load(vcpu); + + fxsave = &vcpu->arch.guest_fpu.state.fxsave; memcpy(fpu->fpr, fxsave->st_space, 128); fpu->fcw = fxsave->cwd; fpu->fsw = fxsave->swd; @@ -7714,6 +7716,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fpu->last_dp = fxsave->rdp; memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c 
b/virt/kvm/kvm_main.c index b44762fcaf84..5791aa687233 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2704,9 +2704,7 @@ out_free1: r = -ENOMEM; if (!fpu) goto out; - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu); - vcpu_put(vcpu); if (r) goto out; r = -EFAULT; -- cgit v1.2.3 From 6a96bc7fa0cdd96bac2b8298d708a94f8de6f6d4 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:35 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_fpu Move vcpu_load() and vcpu_put() into the architecture specific implementations of kvm_arch_vcpu_ioctl_set_fpu(). Reviewed-by: David Hildenbrand Signed-off-by: Christoffer Dall Reviewed-by: Cornelia Huck Signed-off-by: Paolo Bonzini --- arch/s390/kvm/kvm-s390.c | 15 ++++++++++++--- arch/x86/kvm/x86.c | 8 ++++++-- virt/kvm/kvm_main.c | 2 -- 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 39327888bb06..3bd2f83e1fa9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2749,15 +2749,24 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - if (test_fp_ctl(fpu->fpc)) - return -EINVAL; + int ret = 0; + + vcpu_load(vcpu); + + if (test_fp_ctl(fpu->fpc)) { + ret = -EINVAL; + goto out; + } vcpu->run->s.regs.fpc = fpu->fpc; if (MACHINE_HAS_VX) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); else memcpy(vcpu->run->s.regs.fprs, &fpu->fprs, sizeof(fpu->fprs)); - return 0; + +out: + vcpu_put(vcpu); + return ret; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 846d292ea33b..981b61531ab7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7722,8 +7722,11 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - struct fxregs_state *fxsave = - &vcpu->arch.guest_fpu.state.fxsave; + struct fxregs_state *fxsave; + + vcpu_load(vcpu); + + fxsave = &vcpu->arch.guest_fpu.state.fxsave; memcpy(fxsave->st_space, fpu->fpr, 128); fxsave->cwd = fpu->fcw; @@ -7734,6 +7737,7 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) fxsave->rdp = fpu->last_dp; memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space); + vcpu_put(vcpu); return 0; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5791aa687233..ca0ec9fb72ce 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2720,9 +2720,7 @@ out_free1: fpu = NULL; goto out; } - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu); - vcpu_put(vcpu); break; } default: -- cgit v1.2.3 From 9b062471e52a1692c5563ba1535c84d708e2ff6f Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Mon, 4 Dec 2017 21:35:36 +0100 Subject: KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl Move the calls to vcpu_load() and vcpu_put() in to the architecture specific implementations of kvm_arch_vcpu_ioctl() which dispatches further architecture-specific ioctls on to other functions. Some architectures support asynchronous vcpu ioctls which cannot call vcpu_load() or take the vcpu->mutex, because that would prevent concurrent execution with a running VCPU, which is the intended purpose of these ioctls, for example because they inject interrupts. 
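The resulting dispatch shape, sketched here after the MIPS hunk in the diff below (other architectures filter their own asynchronous ioctls the same way before taking the lock):

long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
{
	struct kvm_vcpu *vcpu = filp->private_data;
	void __user *argp = (void __user *)arg;
	long r;

	if (ioctl == KVM_INTERRUPT) {
		struct kvm_mips_interrupt irq;

		if (copy_from_user(&irq, argp, sizeof(irq)))
			return -EFAULT;
		/* Asynchronous: injected without vcpu->mutex or vcpu_load(). */
		return kvm_vcpu_ioctl_interrupt(vcpu, &irq);
	}

	vcpu_load(vcpu);
	switch (ioctl) {
	/* ... synchronous ioctls such as KVM_SET_ONE_REG handled here ... */
	default:
		r = -ENOIOCTLCMD;
	}
	vcpu_put(vcpu);
	return r;
}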
We repeat the separate checks for these specifics in the architecture code for MIPS, S390 and PPC, and avoid taking the vcpu->mutex and calling vcpu_load for these ioctls. Signed-off-by: Christoffer Dall Signed-off-by: Paolo Bonzini --- arch/mips/kvm/mips.c | 49 +++++++++++++++++++++++---------------- arch/powerpc/kvm/powerpc.c | 13 ++++++----- arch/s390/kvm/kvm-s390.c | 19 ++++++++------- arch/x86/kvm/x86.c | 22 +++++++++++++----- virt/kvm/arm/arm.c | 58 ++++++++++++++++++++++++++++++++-------------- virt/kvm/kvm_main.c | 2 -- 6 files changed, 103 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index dd5f463b0e72..9200b3def440 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -910,56 +910,65 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl, void __user *argp = (void __user *)arg; long r; + if (ioctl == KVM_INTERRUPT) { + struct kvm_mips_interrupt irq; + + if (copy_from_user(&irq, argp, sizeof(irq))) + return -EFAULT; + kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, + irq.irq); + + return kvm_vcpu_ioctl_interrupt(vcpu, &irq); + } + + vcpu_load(vcpu); + switch (ioctl) { case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) - return -EFAULT; + break; if (ioctl == KVM_SET_ONE_REG) - return kvm_mips_set_reg(vcpu, ®); + r = kvm_mips_set_reg(vcpu, ®); else - return kvm_mips_get_reg(vcpu, ®); + r = kvm_mips_get_reg(vcpu, ®); + break; } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; struct kvm_reg_list reg_list; unsigned n; + r = -EFAULT; if (copy_from_user(®_list, user_list, sizeof(reg_list))) - return -EFAULT; + break; n = reg_list.n; reg_list.n = kvm_mips_num_regs(vcpu); if (copy_to_user(user_list, ®_list, sizeof(reg_list))) - return -EFAULT; + break; + r = -E2BIG; if (n < reg_list.n) - return -E2BIG; - return kvm_mips_copy_reg_indices(vcpu, user_list->reg); - } - case KVM_INTERRUPT: - { - struct kvm_mips_interrupt irq; - - if (copy_from_user(&irq, argp, sizeof(irq))) - return -EFAULT; - kvm_debug("[%d] %s: irq: %d\n", vcpu->vcpu_id, __func__, - irq.irq); - - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); break; - } + r = kvm_mips_copy_reg_indices(vcpu, user_list->reg); + break; + } case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; + r = -EFAULT; if (copy_from_user(&cap, argp, sizeof(cap))) - return -EFAULT; + break; r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); break; } default: r = -ENOIOCTLCMD; } + + vcpu_put(vcpu); return r; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 4e2167a7ae19..ba8134a989c1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -1614,16 +1614,16 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void __user *argp = (void __user *)arg; long r; - switch (ioctl) { - case KVM_INTERRUPT: { + if (ioctl == KVM_INTERRUPT) { struct kvm_interrupt irq; - r = -EFAULT; if (copy_from_user(&irq, argp, sizeof(irq))) - goto out; - r = kvm_vcpu_ioctl_interrupt(vcpu, &irq); - goto out; + return -EFAULT; + return kvm_vcpu_ioctl_interrupt(vcpu, &irq); } + vcpu_load(vcpu); + + switch (ioctl) { case KVM_ENABLE_CAP: { struct kvm_enable_cap cap; @@ -1663,6 +1663,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: + vcpu_put(vcpu); return r; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 3bd2f83e1fa9..9700d71cb691 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -3737,24 +3737,25 @@ long 
kvm_arch_vcpu_ioctl(struct file *filp, case KVM_S390_IRQ: { struct kvm_s390_irq s390irq; - r = -EFAULT; if (copy_from_user(&s390irq, argp, sizeof(s390irq))) - break; - r = kvm_s390_inject_vcpu(vcpu, &s390irq); - break; + return -EFAULT; + return kvm_s390_inject_vcpu(vcpu, &s390irq); } case KVM_S390_INTERRUPT: { struct kvm_s390_interrupt s390int; struct kvm_s390_irq s390irq; - r = -EFAULT; if (copy_from_user(&s390int, argp, sizeof(s390int))) - break; + return -EFAULT; if (s390int_to_s390irq(&s390int, &s390irq)) return -EINVAL; - r = kvm_s390_inject_vcpu(vcpu, &s390irq); - break; + return kvm_s390_inject_vcpu(vcpu, &s390irq); } + } + + vcpu_load(vcpu); + + switch (ioctl) { case KVM_S390_STORE_STATUS: idx = srcu_read_lock(&vcpu->kvm->srcu); r = kvm_s390_vcpu_store_status(vcpu, arg); @@ -3879,6 +3880,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, default: r = -ENOTTY; } + + vcpu_put(vcpu); return r; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 981b61531ab7..fafaef072f1b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3484,6 +3484,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, void *buffer; } u; + vcpu_load(vcpu); + u.buffer = NULL; switch (ioctl) { case KVM_GET_LAPIC: { @@ -3509,8 +3511,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (!lapic_in_kernel(vcpu)) goto out; u.lapic = memdup_user(argp, sizeof(*u.lapic)); - if (IS_ERR(u.lapic)) - return PTR_ERR(u.lapic); + if (IS_ERR(u.lapic)) { + r = PTR_ERR(u.lapic); + goto out_nofree; + } r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic); break; @@ -3684,8 +3688,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XSAVE: { u.xsave = memdup_user(argp, sizeof(*u.xsave)); - if (IS_ERR(u.xsave)) - return PTR_ERR(u.xsave); + if (IS_ERR(u.xsave)) { + r = PTR_ERR(u.xsave); + goto out_nofree; + } r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave); break; @@ -3707,8 +3713,10 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } case KVM_SET_XCRS: { u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); - if (IS_ERR(u.xcrs)) - return PTR_ERR(u.xcrs); + if (IS_ERR(u.xcrs)) { + r = PTR_ERR(u.xcrs); + goto out_nofree; + } r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; @@ -3752,6 +3760,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, } out: kfree(u.buffer); +out_nofree: + vcpu_put(vcpu); return r; } diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index aa6167086211..cd7d90c9f644 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -1003,66 +1003,88 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; struct kvm_device_attr attr; + long r; + + vcpu_load(vcpu); switch (ioctl) { case KVM_ARM_VCPU_INIT: { struct kvm_vcpu_init init; + r = -EFAULT; if (copy_from_user(&init, argp, sizeof(init))) - return -EFAULT; + break; - return kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + r = kvm_arch_vcpu_ioctl_vcpu_init(vcpu, &init); + break; } case KVM_SET_ONE_REG: case KVM_GET_ONE_REG: { struct kvm_one_reg reg; + r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; + break; + r = -EFAULT; if (copy_from_user(®, argp, sizeof(reg))) - return -EFAULT; + break; + if (ioctl == KVM_SET_ONE_REG) - return kvm_arm_set_reg(vcpu, ®); + r = kvm_arm_set_reg(vcpu, ®); else - return kvm_arm_get_reg(vcpu, ®); + r = kvm_arm_get_reg(vcpu, ®); + break; } case KVM_GET_REG_LIST: { struct kvm_reg_list __user *user_list = argp; struct kvm_reg_list reg_list; unsigned n; + r = -ENOEXEC; if (unlikely(!kvm_vcpu_initialized(vcpu))) - return -ENOEXEC; + break; + r 
= -EFAULT; if (copy_from_user(®_list, user_list, sizeof(reg_list))) - return -EFAULT; + break; n = reg_list.n; reg_list.n = kvm_arm_num_regs(vcpu); if (copy_to_user(user_list, ®_list, sizeof(reg_list))) - return -EFAULT; + break; + r = -E2BIG; if (n < reg_list.n) - return -E2BIG; - return kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; + r = kvm_arm_copy_reg_indices(vcpu, user_list->reg); + break; } case KVM_SET_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_set_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_set_attr(vcpu, &attr); + break; } case KVM_GET_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_get_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_get_attr(vcpu, &attr); + break; } case KVM_HAS_DEVICE_ATTR: { + r = -EFAULT; if (copy_from_user(&attr, argp, sizeof(attr))) - return -EFAULT; - return kvm_arm_vcpu_has_attr(vcpu, &attr); + break; + r = kvm_arm_vcpu_has_attr(vcpu, &attr); + break; } default: - return -EINVAL; + r = -EINVAL; } + + vcpu_put(vcpu); + return r; } /** diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ca0ec9fb72ce..19c184fa1839 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2724,9 +2724,7 @@ out_free1: break; } default: - vcpu_load(vcpu); r = kvm_arch_vcpu_ioctl(filp, ioctl, arg); - vcpu_put(vcpu); } out: mutex_unlock(&vcpu->mutex); -- cgit v1.2.3 From fa55eedd6328d3072e82218a2346b8752253af2d Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:01 -0800 Subject: KVM: X86: Add KVM_VCPU_PREEMPTED MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The next patch will add another bit to the preempted field in kvm_steal_time. Define a constant for bit 0 (the only one that is currently used). 
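For readers following the series, a minimal stand-alone sketch of the idea may help: once the preempted field is treated as a bit mask rather than a boolean, later patches can add flags such as KVM_VCPU_FLUSH_TLB to the same byte, and the guest-side check simply tests bit 0, mirroring the change to __kvm_vcpu_is_preempted() in the diff below. The struct and helper names here are illustrative stand-ins, not the kernel's; only the flag values come from the patches.

    #include <stdint.h>
    #include <stdio.h>

    #define KVM_VCPU_PREEMPTED  (1 << 0)
    #define KVM_VCPU_FLUSH_TLB  (1 << 1)    /* added by a later patch in this series */

    struct steal_time_stub {                /* illustrative stand-in for struct kvm_steal_time */
            uint8_t preempted;              /* now a flag byte, no longer a plain boolean */
    };

    static int vcpu_is_preempted_stub(const struct steal_time_stub *st)
    {
            /* test the bit, as the patched __kvm_vcpu_is_preempted() does */
            return !!(st->preempted & KVM_VCPU_PREEMPTED);
    }

    int main(void)
    {
            struct steal_time_stub st = { .preempted = KVM_VCPU_PREEMPTED | KVM_VCPU_FLUSH_TLB };

            printf("preempted=%d\n", vcpu_is_preempted_stub(&st));
            return 0;
    }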
Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/uapi/asm/kvm_para.h | 2 ++ arch/x86/kernel/kvm.c | 2 +- arch/x86/kvm/x86.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 09cc06483bed..15685bd22faa 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -51,6 +51,8 @@ struct kvm_steal_time { __u32 pad[11]; }; +#define KVM_VCPU_PREEMPTED (1 << 0) + #define KVM_CLOCK_PAIRING_WALLCLOCK 0 struct kvm_clock_pairing { __s64 sec; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b40ffbf156c1..6610b92fc6a5 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -643,7 +643,7 @@ __visible bool __kvm_vcpu_is_preempted(long cpu) { struct kvm_steal_time *src = &per_cpu(steal_time, cpu); - return !!src->preempted; + return !!(src->preempted & KVM_VCPU_PREEMPTED); } PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fafaef072f1b..897f4795513f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2916,7 +2916,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - vcpu->arch.st.steal.preempted = 1; + vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED; kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, -- cgit v1.2.3 From 858a43aae23672d46fe802a41f4748f322965182 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:02 -0800 Subject: KVM: X86: use paravirtualized TLB Shootdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remote TLB flush does a busy wait which is fine in bare-metal scenario. But with-in the guest, the vcpus might have been pre-empted or blocked. In this scenario, the initator vcpu would end up busy-waiting for a long amount of time; it also consumes CPU unnecessarily to wake up the target of the shootdown. This patch set adds support for KVM's new paravirtualized TLB flush; remote TLB flush does not wait for vcpus that are sleeping, instead KVM will flush the TLB as soon as the vCPU starts running again. The improvement is clearly visible when the host is overcommitted; in this case, the PV TLB flush (in addition to avoiding the wait on the main CPU) prevents preempted vCPUs from stealing precious execution time from the running ones. Testing on a Xeon Gold 6142 2.6GHz 2 sockets, 32 cores, 64 threads, so 64 pCPUs, and each VM is 64 vCPUs. ebizzy -M vanilla optimized boost 1VM 46799 48670 4% 2VM 23962 42691 78% 3VM 16152 37539 132% Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- Documentation/virtual/kvm/cpuid.txt | 4 +++ arch/x86/include/uapi/asm/kvm_para.h | 2 ++ arch/x86/kernel/kvm.c | 47 ++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index 3c65feb83010..dcab6dc11e3b 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -54,6 +54,10 @@ KVM_FEATURE_PV_UNHALT || 7 || guest checks this feature bit || || before enabling paravirtualized || || spinlock support. 
------------------------------------------------------------------------------ +KVM_FEATURE_PV_TLB_FLUSH || 9 || guest checks this feature bit + || || before enabling paravirtualized + || || tlb flush. +------------------------------------------------------------------------------ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 15685bd22faa..7a2ade4aa235 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -25,6 +25,7 @@ #define KVM_FEATURE_STEAL_TIME 5 #define KVM_FEATURE_PV_EOI 6 #define KVM_FEATURE_PV_UNHALT 7 +#define KVM_FEATURE_PV_TLB_FLUSH 9 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. @@ -52,6 +53,7 @@ struct kvm_steal_time { }; #define KVM_VCPU_PREEMPTED (1 << 0) +#define KVM_VCPU_FLUSH_TLB (1 << 1) #define KVM_CLOCK_PAIRING_WALLCLOCK 0 struct kvm_clock_pairing { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 6610b92fc6a5..4e37d1a851a6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -498,6 +498,34 @@ static void __init kvm_apf_trap_init(void) update_intr_gate(X86_TRAP_PF, async_page_fault); } +static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask); + +static void kvm_flush_tlb_others(const struct cpumask *cpumask, + const struct flush_tlb_info *info) +{ + u8 state; + int cpu; + struct kvm_steal_time *src; + struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask); + + cpumask_copy(flushmask, cpumask); + /* + * We have to call flush only on online vCPUs. And + * queue flush_on_enter for pre-empted vCPUs + */ + for_each_cpu(cpu, flushmask) { + src = &per_cpu(steal_time, cpu); + state = READ_ONCE(src->preempted); + if ((state & KVM_VCPU_PREEMPTED)) { + if (try_cmpxchg(&src->preempted, &state, + state | KVM_VCPU_FLUSH_TLB)) + __cpumask_clear_cpu(cpu, flushmask); + } + } + + native_flush_tlb_others(flushmask, info); +} + static void __init kvm_guest_init(void) { int i; @@ -517,6 +545,9 @@ static void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) + pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; + if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); @@ -598,6 +629,22 @@ static __init int activate_jump_labels(void) } arch_initcall(activate_jump_labels); +static __init int kvm_setup_pv_tlb_flush(void) +{ + int cpu; + + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) { + for_each_possible_cpu(cpu) { + zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), + GFP_KERNEL, cpu_to_node(cpu)); + } + pr_info("KVM setup pv remote TLB flush\n"); + } + + return 0; +} +arch_initcall(kvm_setup_pv_tlb_flush); + #ifdef CONFIG_PARAVIRT_SPINLOCKS /* Kick a cpu by its apicid. Used to wake up a halted vcpu */ -- cgit v1.2.3 From c2ba05ccfde2f069a66c0462e5b5ef8a517dcc9c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:03 -0800 Subject: KVM: X86: introduce invalidate_gpa argument to tlb flush MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new bool invalidate_gpa argument to kvm_x86_ops->tlb_flush, it will be used by later patches to just flush guest tlb. 
For VMX, this will use INVVPID instead of INVEPT, which will invalidate combined mappings while keeping guest-physical mappings. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 14 +++++++------- arch/x86/kvm/vmx.c | 21 +++++++++++---------- arch/x86/kvm/x86.c | 6 +++--- 4 files changed, 22 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 340e3604dcc7..44de261e9223 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -966,7 +966,7 @@ struct kvm_x86_ops { unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); - void (*tlb_flush)(struct kvm_vcpu *vcpu); + void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa); void (*run)(struct kvm_vcpu *vcpu); int (*handle_exit)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1f3e7f210374..14cca8c601a9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -285,7 +285,7 @@ static int vgif = true; module_param(vgif, int, 0444); static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); -static void svm_flush_tlb(struct kvm_vcpu *vcpu); +static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); static int nested_svm_exit_handled(struct vcpu_svm *svm); @@ -2035,7 +2035,7 @@ static int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); vcpu->arch.cr4 = cr4; if (!npt_enabled) @@ -2385,7 +2385,7 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, svm->vmcb->control.nested_cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_NPT); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu, @@ -2989,7 +2989,7 @@ static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa, svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; svm->nested.intercept = nested_vmcb->control.intercept; - svm_flush_tlb(&svm->vcpu); + svm_flush_tlb(&svm->vcpu, true); svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) svm->vcpu.arch.hflags |= HF_VINTR_MASK; @@ -4785,7 +4785,7 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) return 0; } -static void svm_flush_tlb(struct kvm_vcpu *vcpu) +static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { struct vcpu_svm *svm = to_svm(vcpu); @@ -5076,7 +5076,7 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = __sme_set(root); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) @@ -5090,7 +5090,7 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) svm->vmcb->save.cr3 = kvm_read_cr3(vcpu); mark_dirty(svm->vmcb, VMCB_CR); - svm_flush_tlb(vcpu); + svm_flush_tlb(vcpu, true); } static int is_disabled(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ef7d13e536a4..c1791759f1e6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4140,9 +4140,10 @@ static void exit_lmode(struct kvm_vcpu *vcpu) #endif 
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) +static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid, + bool invalidate_gpa) { - if (enable_ept) { + if (enable_ept && (invalidate_gpa || !enable_vpid)) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu, vcpu->arch.mmu.root_hpa)); @@ -4151,15 +4152,15 @@ static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) } } -static void vmx_flush_tlb(struct kvm_vcpu *vcpu) +static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa); } static void vmx_flush_tlb_ept_only(struct kvm_vcpu *vcpu) { if (enable_ept) - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) @@ -4357,7 +4358,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) ept_load_pdptrs(vcpu); } - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); vmcs_writel(GUEST_CR3, guest_cr3); } @@ -7932,7 +7933,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } - __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + __vmx_flush_tlb(vcpu, vmx->nested.vpid02, true); nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); @@ -10614,11 +10615,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); + __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true); } } else { vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } } @@ -11323,7 +11324,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * L1's vpid. TODO: move to a more elaborate solution, giving * each L2 its own vpid and exposing the vpid feature to L1. */ - vmx_flush_tlb(vcpu); + vmx_flush_tlb(vcpu, true); } /* Restore posted intr vector. */ if (nested_cpu_has_posted_intr(vmcs12)) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 897f4795513f..1e6d9f2d1f0b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6781,10 +6781,10 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu) +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) { ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu); + kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); } void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, @@ -6855,7 +6855,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) kvm_mmu_sync_roots(vcpu); if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - kvm_vcpu_flush_tlb(vcpu); + kvm_vcpu_flush_tlb(vcpu, true); if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; r = 0; -- cgit v1.2.3 From f38a7b75267f1fb240a8178cbcb16d66dd37aac8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Dec 2017 17:33:04 -0800 Subject: KVM: X86: support paravirtualized help for TLB shootdowns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running on a virtual machine, IPIs are expensive when the target CPU is sleeping. 
Thus, it is nice to be able to avoid them for TLB shootdowns. KVM can just do the flush via INVVPID on the guest's behalf the next time the CPU is scheduled. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Peter Zijlstra Signed-off-by: Wanpeng Li [Use "&" to test the bit instead of "==". - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 3 ++- arch/x86/kvm/x86.c | 19 ++++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 3db9e1cd4f63..a0e6c975f3a8 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -602,7 +602,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | - (1 << KVM_FEATURE_PV_UNHALT); + (1 << KVM_FEATURE_PV_UNHALT) | + (1 << KVM_FEATURE_PV_TLB_FLUSH); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1e6d9f2d1f0b..35d0b3de697b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2122,6 +2122,12 @@ static void kvmclock_reset(struct kvm_vcpu *vcpu) vcpu->arch.pv_time_enabled = false; } +static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) +{ + ++vcpu->stat.tlb_flush; + kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); +} + static void record_steal_time(struct kvm_vcpu *vcpu) { if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) @@ -2131,7 +2137,12 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; - vcpu->arch.st.steal.preempted = 0; + /* + * Doing a TLB flush here, on the guest's behalf, can avoid + * expensive IPIs. + */ + if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB) + kvm_vcpu_flush_tlb(vcpu, false); if (vcpu->arch.st.steal.version & 1) vcpu->arch.st.steal.version += 1; /* first time write, random junk */ @@ -6781,12 +6792,6 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa) -{ - ++vcpu->stat.tlb_flush; - kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa); -} - void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, unsigned long start, unsigned long end) { -- cgit v1.2.3 From efdab992813fb2ed825745625b83c05032e9cda2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 13 Dec 2017 10:46:40 +0100 Subject: KVM: x86: fix escape of guest dr6 to the host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit syzkaller reported: WARNING: CPU: 0 PID: 12927 at arch/x86/kernel/traps.c:780 do_debug+0x222/0x250 CPU: 0 PID: 12927 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #16 RIP: 0010:do_debug+0x222/0x250 Call Trace: <#DB> debug+0x3e/0x70 RIP: 0010:copy_user_enhanced_fast_string+0x10/0x20 _copy_from_user+0x5b/0x90 SyS_timer_create+0x33/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The testcase sets a watchpoint (with perf_event_open) on a buffer that is passed to timer_create() as the struct sigevent argument. In timer_create(), copy_from_user()'s rep movsb triggers the BP. The testcase also sets the debug registers for the guest. However, KVM only restores host debug registers when the host has active watchpoints, which triggers a race condition when running the testcase with multiple threads. 
The guest's DR6.BS bit can escape to the host before another thread invokes timer_create(), and do_debug() complains. The fix is to respect do_debug()'s dr6 invariant when leaving KVM. Reported-by: Dmitry Vyukov Cc: Paolo Bonzini Cc: Radim Krčmář Cc: David Hildenbrand Cc: Dmitry Vyukov Reviewed-by: David Hildenbrand Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 35d0b3de697b..a3dd44bb6f1e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2961,6 +2961,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) pagefault_enable(); kvm_x86_ops->vcpu_put(vcpu); vcpu->arch.last_host_tsc = rdtsc(); + /* + * If userspace has set any breakpoints or watchpoints, dr6 is restored + * on every vmexit, but if not, we might have a stale dr6 from the + * guest. do_debug expects dr6 to be cleared after it runs, do the same. + */ + set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 476b7adaa3272557168b287175b1e9e943913404 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 13:51:32 +0100 Subject: KVM: x86: avoid unnecessary XSETBV on guest entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xsetbv can be expensive when running on nested virtualization, try to avoid it. Reviewed-by: Jim Mattson Reviewed-by: Wanpeng Li Reviewed-by: Quan Xu Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a3dd44bb6f1e..56d8a1e11e50 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -702,7 +702,8 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && !vcpu->guest_xcr0_loaded) { /* kvm_set_xcr() also depends on this */ - xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); + if (vcpu->arch.xcr0 != host_xcr0) + xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0); vcpu->guest_xcr0_loaded = 1; } } -- cgit v1.2.3 From 505c9e94d8329dc215617cf99505629e924f001e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 11 Jan 2018 13:10:38 +0100 Subject: KVM: x86: prefer "depends on" to "select" for SEV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid reverse dependencies. Instead, SEV will only be enabled if the PSP driver is available. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/Kconfig | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 148ea324b90c..92fd433c50b9 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -85,9 +85,7 @@ config KVM_AMD_SEV def_bool y bool "AMD Secure Encrypted Virtualization (SEV) support" depends on KVM_AMD && X86_64 - select CRYPTO_DEV_CCP - select CRYPTO_DEV_CCP_DD - select CRYPTO_DEV_SP_PSP + depends on CRYPTO_DEV_CCP && CRYPTO_DEV_CCP_DD && CRYPTO_DEV_SP_PSP ---help--- Provides support for launching Encrypted VMs on AMD processors. 
-- cgit v1.2.3 From b8d7044bcff7a955257b242515bcf1e5045edd9b Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 20 Dec 2017 15:29:28 +0800 Subject: x86/mm: add a function to check if a pfn is UC/UC-/WC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check whether the PAT memory type of a pfn cannot be overridden by MTRR UC memory type, i.e. the PAT memory type is UC, UC- or WC. This function will be used by KVM to distinguish MMIO pfns and give them UC memory type in the EPT page tables (on Intel processors, EPT memory types work like MTRRs). Signed-off-by: Haozhong Zhang Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/include/asm/pat.h | 2 ++ arch/x86/mm/pat.c | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h index 8a3ee355b422..92015c65fa2a 100644 --- a/arch/x86/include/asm/pat.h +++ b/arch/x86/include/asm/pat.h @@ -22,4 +22,6 @@ int io_reserve_memtype(resource_size_t start, resource_size_t end, void io_free_memtype(resource_size_t start, resource_size_t end); +bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn); + #endif /* _ASM_X86_PAT_H */ diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index fe7d57a8fb60..1555bd7d3449 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -677,6 +677,25 @@ static enum page_cache_mode lookup_memtype(u64 paddr) return rettype; } +/** + * pat_pfn_immune_to_uc_mtrr - Check whether the PAT memory type + * of @pfn cannot be overridden by UC MTRR memory type. + * + * Only to be called when PAT is enabled. + * + * Returns true, if the PAT memory type of @pfn is UC, UC-, or WC. + * Returns false in other cases. + */ +bool pat_pfn_immune_to_uc_mtrr(unsigned long pfn) +{ + enum page_cache_mode cm = lookup_memtype(PFN_PHYS(pfn)); + + return cm == _PAGE_CACHE_MODE_UC || + cm == _PAGE_CACHE_MODE_UC_MINUS || + cm == _PAGE_CACHE_MODE_WC; +} +EXPORT_SYMBOL_GPL(pat_pfn_immune_to_uc_mtrr); + /** * io_reserve_memtype - Request a memory type mapping for a region of memory * @start: start (physical address) of the region -- cgit v1.2.3 From aa2e063aea7961fb58eafecb924f8818e3d4ecfc Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Wed, 20 Dec 2017 15:29:29 +0800 Subject: KVM: MMU: consider host cache mode in MMIO page check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some reserved pages, such as those from NVDIMM DAX devices, are not for MMIO, and can be mapped with cached memory type for better performance. However, the above check misconceives those pages as MMIO. Because KVM maps MMIO pages with UC memory type, the performance of guest accesses to those pages would be harmed. Therefore, we check the host memory type in addition and only treat UC/UC-/WC pages as MMIO. 
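To make the new classification rule concrete, here is a small stand-alone sketch of the decision; the enum values and helper names are illustrative stand-ins, while the real code uses pat_enabled(), pat_pfn_immune_to_uc_mtrr() and PageReserved() as shown in the diff below.

    #include <stdbool.h>
    #include <stdio.h>

    /* illustrative stand-ins for the kernel's _PAGE_CACHE_MODE_* values */
    enum cache_mode { CM_WB, CM_WT, CM_WP, CM_WC, CM_UC_MINUS, CM_UC };

    /* mirrors the intent of pat_pfn_immune_to_uc_mtrr(): UC, UC- and WC mappings
     * cannot be overridden by an MTRR UC range, so such pages stay MMIO-like */
    static bool immune_to_uc_mtrr(enum cache_mode cm)
    {
            return cm == CM_UC || cm == CM_UC_MINUS || cm == CM_WC;
    }

    static bool treat_as_mmio(bool page_reserved, bool pat_enabled, enum cache_mode cm)
    {
            /* "reserved" alone no longer decides; the host mapping type is consulted too */
            return page_reserved && (!pat_enabled || immune_to_uc_mtrr(cm));
    }

    int main(void)
    {
            /* a reserved NVDIMM/DAX page mapped write-back is no longer treated as MMIO */
            printf("reserved WB page -> MMIO? %d\n", treat_as_mmio(true, true, CM_WB));
            printf("reserved UC page -> MMIO? %d\n", treat_as_mmio(true, true, CM_UC));
            return 0;
    }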
Signed-off-by: Haozhong Zhang Reported-by: Cuevas Escareno, Ivan D Reported-by: Kumar, Karthik Reviewed-by: Xiao Guangrong Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/mmu.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff1e9ee259cf..1f1da400fcde 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -42,6 +42,7 @@ #include #include +#include #include #include #include @@ -2708,7 +2709,18 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, static bool kvm_is_mmio_pfn(kvm_pfn_t pfn) { if (pfn_valid(pfn)) - return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) && + /* + * Some reserved pages, such as those from NVDIMM + * DAX devices, are not for MMIO, and can be mapped + * with cached memory type for better performance. + * However, the above check misconceives those pages + * as MMIO, and results in KVM mapping them with UC + * memory type, which would hurt the performance. + * Therefore, we check the host memory type in addition + * and only treat UC/UC-/WC pages as MMIO. + */ + (!pat_enabled() || pat_pfn_immune_to_uc_mtrr(pfn)); return true; } -- cgit v1.2.3 From a6cb099a43314ddb2b0214c122f61288d3d6c0ba Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 20 Dec 2017 12:50:28 +0100 Subject: kvm/vmx: Use local vmx variable in vmx_get_msr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... just like in vmx_set_msr(). No functionality change. Signed-off-by: Borislav Petkov Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7b1d71665bc2..14976cd4e0c1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3248,6 +3248,7 @@ static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu, */ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) { + struct vcpu_vmx *vmx = to_vmx(vcpu); struct shared_msr_entry *msr; switch (msr_info->index) { @@ -3259,8 +3260,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = vmcs_readl(GUEST_GS_BASE); break; case MSR_KERNEL_GS_BASE: - vmx_load_host_state(to_vmx(vcpu)); - msr_info->data = to_vmx(vcpu)->msr_guest_kernel_gs_base; + vmx_load_host_state(vmx); + msr_info->data = vmx->msr_guest_kernel_gs_base; break; #endif case MSR_EFER: @@ -3286,13 +3287,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_MCG_EXT_CTL: if (!msr_info->host_initiated && - !(to_vmx(vcpu)->msr_ia32_feature_control & + !(vmx->msr_ia32_feature_control & FEATURE_CONTROL_LMCE)) return 1; msr_info->data = vcpu->arch.mcg_ext_ctl; break; case MSR_IA32_FEATURE_CONTROL: - msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control; + msr_info->data = vmx->msr_ia32_feature_control; break; case MSR_IA32_VMX_BASIC ... 
MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) @@ -3309,7 +3310,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; /* Otherwise falls through */ default: - msr = find_msr_entry(to_vmx(vcpu), msr_info->index); + msr = find_msr_entry(vmx, msr_info->index); if (msr) { msr_info->data = msr->data; break; -- cgit v1.2.3 From 5c7d4f9ad39d980728b39752304ce10bb2960cbf Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 19 Nov 2017 18:25:43 +0200 Subject: KVM: nVMX: Fix bug of injecting L2 exception into L1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_clear_exception_queue() should clear pending exception. This also includes exceptions which were only marked pending but not yet injected. This is because exception.pending is used for both L1 and L2 to determine if an exception should be raised to guest. Note that an exception which is pending but not yet injected will be raised again once the guest will be resumed. Consider the following scenario: 1) L0 KVM with ignore_msrs=false. 2) L1 prepare vmcs12 with the following: a) No intercepts on MSR (MSR_BITMAP exist and is filled with 0). b) No intercept for #GP. c) vmx-preemption-timer is configured. 3) L1 enters into L2. 4) L2 reads an unhandled MSR that exists in MSR_BITMAP (such as 0x1fff). L2 RDMSR could be handled as described below: 1) L2 exits to L0 on RDMSR and calls handle_rdmsr(). 2) handle_rdmsr() calls kvm_inject_gp() which sets KVM_REQ_EVENT, exception.pending=true and exception.injected=false. 3) vcpu_enter_guest() consumes KVM_REQ_EVENT and calls inject_pending_event() which calls vmx_check_nested_events() which sees that exception.pending=true but nested_vmx_check_exception() returns 0 and therefore does nothing at this point. However let's assume it later sees vmx-preemption-timer expired and therefore exits from L2 to L1 by calling nested_vmx_vmexit(). 4) nested_vmx_vmexit() calls prepare_vmcs12() which calls vmcs12_save_pending_event() but it does nothing as exception.injected is false. Also prepare_vmcs12() calls kvm_clear_exception_queue() which does nothing as exception.injected is already false. 5) We now return from vmx_check_nested_events() with 0 while still having exception.pending=true! 6) Therefore inject_pending_event() continues and we inject L2 exception to L1!... This commit will fix above issue by changing step (4) to clear exception.pending in kvm_clear_exception_queue(). 
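As a toy model of the pending/injected distinction that the scenario above relies on (the field and function names only mirror the kernel's for illustration; this is not the actual KVM code), the point of the fix is that clearing the queue must reset both stages:

    #include <stdbool.h>
    #include <stdio.h>

    struct exc_state {
            bool pending;   /* exception queued, not yet written into VM-entry fields */
            bool injected;  /* exception already copied into the VM-entry fields */
    };

    static void clear_exception_queue(struct exc_state *e)
    {
            /* the fix: forgetting 'pending' here lets a stale L2 exception survive
             * an emulated L2->L1 exit and later be injected into L1 */
            e->pending = false;
            e->injected = false;
    }

    int main(void)
    {
            struct exc_state e = { .pending = true, .injected = false };

            clear_exception_queue(&e);      /* emulated vmexit from L2 to L1 */
            printf("pending=%d injected=%d\n", e.pending, e.injected);
            return 0;
    }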
Fixes: 664f8e26b00c ("KVM: X86: Fix loss of exception which has not yet been injected") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan Signed-off-by: Krish Sadhukhan Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 1 - arch/x86/kvm/x86.h | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 14976cd4e0c1..866c867b558a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11052,7 +11052,6 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) if (block_nested_events) return -EBUSY; nested_vmx_inject_exception_vmexit(vcpu, exit_qual); - vcpu->arch.exception.pending = false; return 0; } diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c69f973111cb..b91215d1fd80 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -12,6 +12,7 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) { + vcpu->arch.exception.pending = false; vcpu->arch.exception.injected = false; } -- cgit v1.2.3 From fa59cc003804f7606b3d0aa8e16509f9103f7377 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:53 +0200 Subject: KVM: x86: Optimization: Create SVM stubs for sync_pir_to_irr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sync_pir_to_irr() is only called if vcpu->arch.apicv_active == true. When it is false, the VMX code makes sure to set sync_pir_to_irr to NULL. Therefore, adding SVM stubs allows the sync_pir_to_irr != NULL check to be removed from all call sites. Signed-off-by: Liran Alon Suggested-by: Paolo Bonzini Reviewed-by: Nikita Leshenko Reviewed-by: Liam Merwick [Return highest IRR in the SVM case. 
- Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 2 +- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/x86.c | 10 ++++------ 3 files changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e2c1fb8d35ce..0928608750e3 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -581,7 +581,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr) { int highest_irr; - if (kvm_x86_ops->sync_pir_to_irr && apic->vcpu->arch.apicv_active) + if (apic->vcpu->arch.apicv_active) highest_irr = kvm_x86_ops->sync_pir_to_irr(apic->vcpu); else highest_irr = apic_find_highest_irr(apic); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5d83f0474020..b613d331d031 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6740,6 +6740,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, + .sync_pir_to_irr = kvm_lapic_find_highest_irr, .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6d2e1459adc9..0204b2b8a293 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2973,7 +2973,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s) { - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); return kvm_apic_get_state(vcpu, s); @@ -6820,7 +6820,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) if (irqchip_split(vcpu->kvm)) kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors); else { - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) + if (vcpu->arch.apicv_active) kvm_x86_ops->sync_pir_to_irr(vcpu); kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors); } @@ -7046,10 +7046,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) * This handles the case where a posted interrupt was * notified with kvm_vcpu_kick. */ - if (kvm_lapic_enabled(vcpu)) { - if (kvm_x86_ops->sync_pir_to_irr && vcpu->arch.apicv_active) - kvm_x86_ops->sync_pir_to_irr(vcpu); - } + if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active) + kvm_x86_ops->sync_pir_to_irr(vcpu); if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || need_resched() || signal_pending(current)) { -- cgit v1.2.3 From e7387b0e27ec3a203bda4910dc4a107f6c5f912f Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:54 +0200 Subject: KVM: x86: Change __kvm_apic_update_irr() to also return if max IRR updated MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit doesn't change semantics. It is done as a preparation for future commits. 
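A stand-alone sketch of the new calling convention may still be useful to readers of the series: the merge routine now reports the highest pending vector through an out-parameter and returns whether that vector was newly set by this merge. Only the 8x32-bit layout mirrors the APIC IRR/PIR; the names and main() are illustrative, not the kernel's.

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool merge_pir_into_irr(uint32_t *pir, uint32_t *irr, int *max_irr)
    {
            int max_updated = -1;

            *max_irr = -1;
            for (int i = 0, vec = 0; i < 8; i++, vec += 32) {
                    uint32_t prev = irr[i];

                    irr[i] |= pir[i];
                    pir[i] = 0;
                    if (irr[i] != prev)
                            max_updated = 31 - __builtin_clz(irr[i] ^ prev) + vec;
                    if (irr[i])
                            *max_irr = 31 - __builtin_clz(irr[i]) + vec;
            }
            /* true when the highest pending vector was set by this very merge */
            return max_updated != -1 && max_updated == *max_irr;
    }

    int main(void)
    {
            uint32_t pir[8] = { [1] = 1u << 5 };    /* vector 37 posted */
            uint32_t irr[8] = { 0 };
            int max_irr;

            bool raised = merge_pir_into_irr(pir, irr, &max_irr);
            printf("max_irr=%d raised=%d\n", max_irr, raised);
            return 0;
    }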
Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Liam Merwick Signed-off-by: Liam Merwick Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/lapic.c | 23 ++++++++++++++++------- arch/x86/kvm/lapic.h | 4 ++-- arch/x86/kvm/vmx.c | 5 +++-- 3 files changed, 21 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 0928608750e3..924ac8ce9d50 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -364,32 +364,41 @@ static u8 count_vectors(void *bitmap) return count; } -int __kvm_apic_update_irr(u32 *pir, void *regs) +bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr) { u32 i, vec; - u32 pir_val, irr_val; - int max_irr = -1; + u32 pir_val, irr_val, prev_irr_val; + int max_updated_irr; + + max_updated_irr = -1; + *max_irr = -1; for (i = vec = 0; i <= 7; i++, vec += 32) { pir_val = READ_ONCE(pir[i]); irr_val = *((u32 *)(regs + APIC_IRR + i * 0x10)); if (pir_val) { + prev_irr_val = irr_val; irr_val |= xchg(&pir[i], 0); *((u32 *)(regs + APIC_IRR + i * 0x10)) = irr_val; + if (prev_irr_val != irr_val) { + max_updated_irr = + __fls(irr_val ^ prev_irr_val) + vec; + } } if (irr_val) - max_irr = __fls(irr_val) + vec; + *max_irr = __fls(irr_val) + vec; } - return max_irr; + return ((max_updated_irr != -1) && + (max_updated_irr == *max_irr)); } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); -int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir) +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr) { struct kvm_lapic *apic = vcpu->arch.apic; - return __kvm_apic_update_irr(pir, apic->regs); + return __kvm_apic_update_irr(pir, apic->regs, max_irr); } EXPORT_SYMBOL_GPL(kvm_apic_update_irr); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 4b9935a38347..56c36014f7b7 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -75,8 +75,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, int short_hand, unsigned int dest, int dest_mode); -int __kvm_apic_update_irr(u32 *pir, void *regs); -int kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir); +bool __kvm_apic_update_irr(u32 *pir, void *regs, int *max_irr); +bool kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir, int *max_irr); void kvm_apic_update_ppr(struct kvm_vcpu *vcpu); int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, struct dest_map *dest_map); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 866c867b558a..5ea482bb1b9c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5085,7 +5085,8 @@ static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) max_irr = find_last_bit((unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr != 256) { vapic_page = kmap(vmx->nested.virtual_apic_page); - __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); + __kvm_apic_update_irr(vmx->nested.pi_desc->pir, + vapic_page, &max_irr); kunmap(vmx->nested.virtual_apic_page); status = vmcs_read16(GUEST_INTR_STATUS); @@ -8986,7 +8987,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. 
*/ smp_mb__after_atomic(); - max_irr = kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } -- cgit v1.2.3 From f27a85c4988d408a2918a3bcbc3d7fe4fb2dc2b3 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:55 +0200 Subject: KVM: nVMX: Re-evaluate L1 pending events when running L2 and L1 got posted-interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In case posted-interrupt was delivered to CPU while it is in host (outside guest), then posted-interrupt delivery will be done by calling sync_pir_to_irr() at vmentry after interrupts are disabled. sync_pir_to_irr() will check vmx->pi_desc.control ON bit and if set, it will sync vmx->pi_desc.pir to IRR and afterwards update RVI to ensure virtual-interrupt-delivery will dispatch interrupt to guest. However, it is possible that L1 will receive a posted-interrupt while CPU runs at host and is about to enter L2. In this case, the call to sync_pir_to_irr() will indeed update the L1's APIC IRR but vcpu_enter_guest() will then just resume into L2 guest without re-evaluating if it should exit from L2 to L1 as a result of this new pending L1 event. To address this case, if sync_pir_to_irr() has a new L1 injectable interrupt and CPU is running L2, we force exit GUEST_MODE which will result in another iteration of vcpu_run() run loop which will call kvm_vcpu_running() which will call check_nested_events() which will handle the pending L1 event properly. Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan Reviewed-by: Liam Merwick Signed-off-by: Liam Merwick Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5ea482bb1b9c..5fe94e375d2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8978,6 +8978,7 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; + bool max_irr_updated; WARN_ON(!vcpu->arch.apicv_active); if (pi_test_on(&vmx->pi_desc)) { @@ -8987,7 +8988,16 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * But on x86 this is just a compiler barrier anyway. */ smp_mb__after_atomic(); - kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + max_irr_updated = + kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr); + + /* + * If we are running L2 and L1 has a new pending interrupt + * which can be injected, we should re-evaluate + * what should be done with this new L1 interrupt. + */ + if (is_guest_mode(vcpu) && max_irr_updated) + kvm_vcpu_exiting_guest_mode(vcpu); } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } -- cgit v1.2.3 From 851c1a18c5412fd321e387cfe60739387cdbf37d Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 24 Dec 2017 18:12:56 +0200 Subject: KVM: nVMX: Fix injection to L2 when L1 don't intercept external-interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before each vmentry to guest, vcpu_enter_guest() calls sync_pir_to_irr() which calls vmx_hwapic_irr_update() to update RVI. Currently, vmx_hwapic_irr_update() contains a tweak in case it is called when CPU is running L2 and L1 don't intercept external-interrupts. In that case, code injects interrupt directly into L2 instead of updating RVI. 
Besides being hacky (wouldn't expect function updating RVI to also inject interrupt), it also doesn't handle this case correctly. The code contains several issues: 1. When code calls kvm_queue_interrupt() it just passes it max_irr which represents the highest IRR currently pending in L1 LAPIC. This is problematic as interrupt was injected to guest but it's bit is still set in LAPIC IRR instead of being cleared from IRR and set in ISR. 2. Code doesn't check if LAPIC PPR is set to accept an interrupt of max_irr priority. It just checks if interrupts are enabled in guest with vmx_interrupt_allowed(). To fix the above issues: 1. Simplify vmx_hwapic_irr_update() to just update RVI. Note that this shouldn't happen when CPU is running L2 (See comment in code). 2. Since now vmx_hwapic_irr_update() only does logic for L1 virtual-interrupt-delivery, inject_pending_event() should be the one responsible for injecting the interrupt directly into L2. Therefore, change kvm_cpu_has_injectable_intr() to check L1 LAPIC when CPU is running L2. 3. Change vmx_sync_pir_to_irr() to set KVM_REQ_EVENT when L1 has a pending injectable interrupt. Fixes: 963fee165660 ("KVM: nVMX: Fix virtual interrupt delivery injection") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan Reviewed-by: Liam Merwick Signed-off-by: Liam Merwick Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/vmx.c | 41 +++++++++++++++++------------------------ 2 files changed, 18 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 5c24811e8b0b..f171051eecf3 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -79,7 +79,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v) if (kvm_cpu_has_extint(v)) return 1; - if (kvm_vcpu_apicv_active(v)) + if (!is_guest_mode(v) && kvm_vcpu_apicv_active(v)) return 0; return kvm_apic_has_interrupt(v) != -1; /* LAPIC */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5fe94e375d2d..2a96b1abe77c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8948,30 +8948,16 @@ static void vmx_set_rvi(int vector) static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { - if (!is_guest_mode(vcpu)) { - vmx_set_rvi(max_irr); - return; - } - - if (max_irr == -1) - return; - /* - * In guest mode. If a vmexit is needed, vmx_check_nested_events - * handles it. + * When running L2, updating RVI is only relevant when + * vmcs12 virtual-interrupt-delivery enabled. + * However, it can be enabled only when L1 also + * intercepts external-interrupts and in that case + * we should not update vmcs02 RVI but instead intercept + * interrupt. Therefore, do nothing when running L2. */ - if (nested_exit_on_intr(vcpu)) - return; - - /* - * Else, fall back to pre-APICv interrupt injection since L2 - * is run without virtual interrupt delivery. - */ - if (!kvm_event_needs_reinjection(vcpu) && - vmx_interrupt_allowed(vcpu)) { - kvm_queue_interrupt(vcpu, max_irr, false); - vmx_inject_irq(vcpu); - } + if (!is_guest_mode(vcpu)) + vmx_set_rvi(max_irr); } static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) @@ -8995,9 +8981,16 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) * If we are running L2 and L1 has a new pending interrupt * which can be injected, we should re-evaluate * what should be done with this new L1 interrupt. + * If L1 intercepts external-interrupts, we should + * exit from L2 to L1. 
Otherwise, interrupt should be + * delivered directly to L2. */ - if (is_guest_mode(vcpu) && max_irr_updated) - kvm_vcpu_exiting_guest_mode(vcpu); + if (is_guest_mode(vcpu) && max_irr_updated) { + if (nested_exit_on_intr(vcpu)) + kvm_vcpu_exiting_guest_mode(vcpu); + else + kvm_make_request(KVM_REQ_EVENT, vcpu); + } } else { max_irr = kvm_lapic_find_highest_irr(vcpu); } -- cgit v1.2.3 From 6b6977117f50d60455ace86b2d256f6fb4f3de05 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Thu, 9 Nov 2017 20:27:20 +0200 Subject: KVM: nVMX: Fix races when sending nested PI while dest enters/leaves L2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Consider the following scenario: 1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI to CPU B via virtual posted-interrupt mechanism. 2. CPU B is currently executing L2 guest. 3. vmx_deliver_nested_posted_interrupt() calls kvm_vcpu_trigger_posted_interrupt() which will note that vcpu->mode == IN_GUEST_MODE. 4. Assume that before CPU A sends the physical POSTED_INTR_NESTED_VECTOR IPI, CPU B exits from L2 to L0 during event-delivery (valid IDT-vectoring-info). 5. CPU A now sends the physical IPI. The IPI is received in host and it's handler (smp_kvm_posted_intr_nested_ipi()) does nothing. 6. Assume that before CPU A sets pi_pending=true and KVM_REQ_EVENT, CPU B continues to run in L0 and reach vcpu_enter_guest(). As KVM_REQ_EVENT is not set yet, vcpu_enter_guest() will continue and resume L2 guest. 7. At this point, CPU A sets pi_pending=true and KVM_REQ_EVENT but it's too late! CPU B already entered L2 and KVM_REQ_EVENT will only be consumed at next L2 entry! Another scenario to consider: 1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI to CPU B via virtual posted-interrupt mechanism. 2. Assume that before CPU A calls kvm_vcpu_trigger_posted_interrupt(), CPU B is at L0 and is about to resume into L2. Further assume that it is in vcpu_enter_guest() after check for KVM_REQ_EVENT. 3. At this point, CPU A calls kvm_vcpu_trigger_posted_interrupt() which will note that vcpu->mode != IN_GUEST_MODE. Therefore, do nothing and return false. Then, will set pi_pending=true and KVM_REQ_EVENT. 4. Now CPU B continue and resumes into L2 guest without processing the posted-interrupt until next L2 entry! To fix both issues, we just need to change vmx_deliver_nested_posted_interrupt() to set pi_pending=true and KVM_REQ_EVENT before calling kvm_vcpu_trigger_posted_interrupt(). It will fix the first scenario by chaging step (6) to note that KVM_REQ_EVENT and pi_pending=true and therefore process nested posted-interrupt. It will fix the second scenario by two possible ways: 1. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B has changed vcpu->mode to IN_GUEST_MODE, physical IPI will be sent and will be received when CPU resumes into L2. 2. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B hasn't yet changed vcpu->mode to IN_GUEST_MODE, then after CPU B will change vcpu->mode it will call kvm_request_pending() which will return true and therefore force another round of vcpu_enter_guest() which will note that KVM_REQ_EVENT and pi_pending=true and therefore process nested posted-interrupt. Cc: stable@vger.kernel.org Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan [Add kvm_vcpu_kick to also handle the case where L1 doesn't intercept L2 HLT and L2 executes HLT instruction. 
- Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a96b1abe77c..2c8749ee3221 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5146,14 +5146,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { - /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu, true); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + /* the PIR and ON have been set by L1. */ + if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true)) + kvm_vcpu_kick(vcpu); return 0; } return -1; -- cgit v1.2.3 From c5d167b27e00026711ad19a33a23d5d3d562148a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 11:05:19 +0100 Subject: KVM: vmx: shadow more fields that are read/written on every vmexits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compared to when VMCS shadowing was added to KVM, we are reading/writing a few more fields: the PML index, the interrupt status and the preemption timer value. The first two are because we are exposing more features to nested guests, the preemption timer is simply because we have grown a new optimization. Adding them to the shadow VMCS field lists reduces the cost of a vmexit by about 1000 clock cycles for each field that exists on bare metal. On the other hand, the guest BNDCFGS and TSC offset are not written on fast paths, so remove them. Suggested-by: Jim Mattson Cc: Jim Mattson Cc: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2c8749ee3221..7b114a293c66 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -726,11 +726,12 @@ static unsigned long shadow_read_write_fields[] = { GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, - GUEST_BNDCFGS, + GUEST_PML_INDEX, + GUEST_INTR_STATUS, + VMX_PREEMPTION_TIMER_VALUE, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, - TSC_OFFSET, EXCEPTION_BITMAP, CPU_BASED_VM_EXEC_CONTROL, VM_ENTRY_EXCEPTION_ERROR_CODE, @@ -3903,9 +3904,22 @@ static void init_vmcs_shadow_fields(void) /* No checks for read only fields yet */ for (i = j = 0; i < max_shadow_read_write_fields; i++) { + /* + * PML and the preemption timer can be emulated, but the + * processor cannot vmwrite to fields that don't exist + * on bare metal. 
+ */ switch (shadow_read_write_fields[i]) { - case GUEST_BNDCFGS: - if (!kvm_mpx_supported()) + case GUEST_PML_INDEX: + if (!cpu_has_vmx_pml()) + continue; + break; + case VMX_PREEMPTION_TIMER_VALUE: + if (!cpu_has_vmx_preemption_timer()) + continue; + break; + case GUEST_INTR_STATUS: + if (!cpu_has_vmx_apicv()) continue; break; default: @@ -6802,11 +6816,6 @@ static __init int hardware_setup(void) !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; - if (!cpu_has_vmx_shadow_vmcs()) - enable_shadow_vmcs = 0; - if (enable_shadow_vmcs) - init_vmcs_shadow_fields(); - if (!cpu_has_vmx_ept() || !cpu_has_vmx_ept_4levels() || !cpu_has_vmx_ept_mt_wb() || @@ -6926,6 +6935,11 @@ static __init int hardware_setup(void) kvm_x86_ops->cancel_hv_timer = NULL; } + if (!cpu_has_vmx_shadow_vmcs()) + enable_shadow_vmcs = 0; + if (enable_shadow_vmcs) + init_vmcs_shadow_fields(); + kvm_set_posted_intr_wakeup_handler(wakeup_handler); kvm_mce_cap_supported |= MCG_LMCE_P; -- cgit v1.2.3 From 44900ba65e16ab3c6608e105654f38f54d030caa Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 12:58:02 +0100 Subject: KVM: VMX: optimize shadow VMCS copying MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because all fields can be read/written with a single vmread/vmwrite on 64-bit kernels, the switch statements in copy_vmcs12_to_shadow and copy_shadow_to_vmcs12 are unnecessary. What I did in this patch is to copy the two parts of 64-bit fields separately on 32-bit kernels, to keep all complicated #ifdef-ery in init_vmcs_shadow_fields. The disadvantage is that 64-bit fields have to be listed separately in shadow_read_only/read_write_fields, but those are few and we can validate the arrays when building the VMREAD and VMWRITE bitmaps. This saves a few hundred clock cycles per nested vmexit. However there is still a "switch" in vmcs_read_any and vmcs_write_any. So, while at it, this patch reorders the fields by type, hoping that the branch predictor appreciates it. Cc: Jim Mattson Cc: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 143 ++++++++++++++++++++++++----------------------------- 1 file changed, 65 insertions(+), 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7b114a293c66..bdd3f155efae 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -686,7 +686,7 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) [number##_HIGH] = VMCS12_OFFSET(name)+4 -static unsigned long shadow_read_only_fields[] = { +static u16 shadow_read_only_fields[] = { /* * We do NOT shadow fields that are modified when L0 * traps and emulates any vmx instruction (e.g. VMPTRLD, @@ -699,49 +699,59 @@ static unsigned long shadow_read_only_fields[] = { * (e.g. 
force a sync if VM_INSTRUCTION_ERROR is modified * by nested_vmx_failValid) */ + /* 32-bits */ VM_EXIT_REASON, VM_EXIT_INTR_INFO, VM_EXIT_INSTRUCTION_LEN, IDT_VECTORING_INFO_FIELD, IDT_VECTORING_ERROR_CODE, VM_EXIT_INTR_ERROR_CODE, + + /* Natural width */ EXIT_QUALIFICATION, GUEST_LINEAR_ADDRESS, - GUEST_PHYSICAL_ADDRESS + + /* 64-bit */ + GUEST_PHYSICAL_ADDRESS, + GUEST_PHYSICAL_ADDRESS_HIGH, }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); -static unsigned long shadow_read_write_fields[] = { +static u16 shadow_read_write_fields[] = { + /* 16-bits */ + GUEST_CS_SELECTOR, + GUEST_INTR_STATUS, + GUEST_PML_INDEX, + HOST_FS_SELECTOR, + HOST_GS_SELECTOR, + + /* 32-bits */ + CPU_BASED_VM_EXEC_CONTROL, + EXCEPTION_BITMAP, + VM_ENTRY_EXCEPTION_ERROR_CODE, + VM_ENTRY_INTR_INFO_FIELD, + VM_ENTRY_INSTRUCTION_LEN, TPR_THRESHOLD, + GUEST_CS_LIMIT, + GUEST_CS_AR_BYTES, + GUEST_INTERRUPTIBILITY_INFO, + VMX_PREEMPTION_TIMER_VALUE, + + /* Natural width */ GUEST_RIP, GUEST_RSP, GUEST_CR0, GUEST_CR3, GUEST_CR4, - GUEST_INTERRUPTIBILITY_INFO, GUEST_RFLAGS, - GUEST_CS_SELECTOR, - GUEST_CS_AR_BYTES, - GUEST_CS_LIMIT, GUEST_CS_BASE, GUEST_ES_BASE, - GUEST_PML_INDEX, - GUEST_INTR_STATUS, - VMX_PREEMPTION_TIMER_VALUE, CR0_GUEST_HOST_MASK, CR0_READ_SHADOW, CR4_READ_SHADOW, - EXCEPTION_BITMAP, - CPU_BASED_VM_EXEC_CONTROL, - VM_ENTRY_EXCEPTION_ERROR_CODE, - VM_ENTRY_INTR_INFO_FIELD, - VM_ENTRY_INSTRUCTION_LEN, - VM_ENTRY_EXCEPTION_ERROR_CODE, HOST_FS_BASE, HOST_GS_BASE, - HOST_FS_SELECTOR, - HOST_GS_SELECTOR }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); @@ -3901,15 +3911,39 @@ static void init_vmcs_shadow_fields(void) { int i, j; - /* No checks for read only fields yet */ + for (i = j = 0; i < max_shadow_read_only_fields; i++) { + u16 field = shadow_read_only_fields[i]; + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + (i + 1 == max_shadow_read_only_fields || + shadow_read_only_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_only_field %x\n", + field + 1); + + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif + if (j < i) + shadow_read_only_fields[j] = field; + j++; + } + max_shadow_read_only_fields = j; for (i = j = 0; i < max_shadow_read_write_fields; i++) { + u16 field = shadow_read_write_fields[i]; + if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + (i + 1 == max_shadow_read_write_fields || + shadow_read_write_fields[i + 1] != field + 1)) + pr_err("Missing field from shadow_read_write_field %x\n", + field + 1); + /* * PML and the preemption timer can be emulated, but the * processor cannot vmwrite to fields that don't exist * on bare metal. 
*/ - switch (shadow_read_write_fields[i]) { + switch (field) { case GUEST_PML_INDEX: if (!cpu_has_vmx_pml()) continue; @@ -3926,31 +3960,17 @@ static void init_vmcs_shadow_fields(void) break; } + clear_bit(field, vmx_vmwrite_bitmap); + clear_bit(field, vmx_vmread_bitmap); +#ifdef CONFIG_X86_64 + if (field & 1) + continue; +#endif if (j < i) - shadow_read_write_fields[j] = - shadow_read_write_fields[i]; + shadow_read_write_fields[j] = field; j++; } max_shadow_read_write_fields = j; - - /* shadowed fields guest access without vmexit */ - for (i = 0; i < max_shadow_read_write_fields; i++) { - unsigned long field = shadow_read_write_fields[i]; - - clear_bit(field, vmx_vmwrite_bitmap); - clear_bit(field, vmx_vmread_bitmap); - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) { - clear_bit(field + 1, vmx_vmwrite_bitmap); - clear_bit(field + 1, vmx_vmread_bitmap); - } - } - for (i = 0; i < max_shadow_read_only_fields; i++) { - unsigned long field = shadow_read_only_fields[i]; - - clear_bit(field, vmx_vmread_bitmap); - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64) - clear_bit(field + 1, vmx_vmread_bitmap); - } } static __init int alloc_kvm_area(void) @@ -7538,7 +7558,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) unsigned long field; u64 field_value; struct vmcs *shadow_vmcs = vmx->vmcs01.shadow_vmcs; - const unsigned long *fields = shadow_read_write_fields; + const u16 *fields = shadow_read_write_fields; const int num_fields = max_shadow_read_write_fields; preempt_disable(); @@ -7547,23 +7567,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) for (i = 0; i < num_fields; i++) { field = fields[i]; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - field_value = vmcs_read16(field); - break; - case VMCS_FIELD_TYPE_U32: - field_value = vmcs_read32(field); - break; - case VMCS_FIELD_TYPE_U64: - field_value = vmcs_read64(field); - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - field_value = vmcs_readl(field); - break; - default: - WARN_ON(1); - continue; - } + field_value = __vmcs_readl(field); vmcs12_write_any(&vmx->vcpu, field, field_value); } @@ -7575,7 +7579,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx) static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) { - const unsigned long *fields[] = { + const u16 *fields[] = { shadow_read_write_fields, shadow_read_only_fields }; @@ -7594,24 +7598,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx) for (i = 0; i < max_fields[q]; i++) { field = fields[q][i]; vmcs12_read_any(&vmx->vcpu, field, &field_value); - - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: - vmcs_write16(field, (u16)field_value); - break; - case VMCS_FIELD_TYPE_U32: - vmcs_write32(field, (u32)field_value); - break; - case VMCS_FIELD_TYPE_U64: - vmcs_write64(field, (u64)field_value); - break; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: - vmcs_writel(field, (long)field_value); - break; - default: - WARN_ON(1); - break; - } + __vmcs_writel(field, field_value); } } -- cgit v1.2.3 From 5b15706dbf5bdf0c9708ca9bb1f37e95e315daf2 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 22 Dec 2017 12:11:12 -0800 Subject: kvm: vmx: Introduce VMCS12_MAX_FIELD_INDEX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is the highest index value used in any supported VMCS12 field encoding. It is used to populate the IA32_VMX_VMCS_ENUM MSR. 
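For reference, the constant follows from the VMCS field encoding layout (bit 0 is the access type, bits 9:1 the index, bits 14:13 the width) and from the fact that VMX_PREEMPTION_TIMER_VALUE (encoding 0x482e) is the highest-index field handled here, as the old comment noted. A minimal standalone sketch of the arithmetic (not kernel code):

#include <stdio.h>

#define VMCS12_MAX_FIELD_INDEX	0x17

/* Bits 9:1 of a VMCS field encoding hold the field index. */
static unsigned int vmcs_field_index(unsigned int field)
{
	return (field >> 1) & 0x1ff;
}

int main(void)
{
	/* VMX_PREEMPTION_TIMER_VALUE is encoded as 0x482e -> index 0x17 */
	printf("max index = %#x\n", vmcs_field_index(0x482e));

	/* IA32_VMX_VMCS_ENUM reports the highest index in bits 9:1 */
	printf("vmcs_enum = %#x\n", VMCS12_MAX_FIELD_INDEX << 1);

	return 0;
}

This prints 0x17 and 0x2e, the latter matching the value that was previously hard-coded.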
Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bdd3f155efae..bbfbed714dec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -407,6 +407,12 @@ struct __packed vmcs12 { */ #define VMCS12_SIZE 0x1000 +/* + * VMCS12_MAX_FIELD_INDEX is the highest index value used in any + * supported VMCS12 field encoding. + */ +#define VMCS12_MAX_FIELD_INDEX 0x17 + /* * The nested_vmx structure is part of vcpu_vmx, and holds information we need * for correct emulation of VMX (i.e., nested VMX) on this vcpu. @@ -2923,7 +2929,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); /* highest index: VMX_PREEMPTION_TIMER_VALUE */ - vmx->nested.nested_vmx_vmcs_enum = 0x2e; + vmx->nested.nested_vmx_vmcs_enum = VMCS12_MAX_FIELD_INDEX << 1; } /* -- cgit v1.2.3 From d37f4267a79f7d423103cb9fdd67a4a3a8194335 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 22 Dec 2017 12:12:16 -0800 Subject: kvm: vmx: Change vmcs_field_type to vmcs_field_width MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per the SDM, "[VMCS] Fields are grouped by width (16-bit, 32-bit, etc.) and type (guest-state, host-state, etc.)." Previously, the width was indicated by vmcs_field_type. To avoid confusion when we start dealing with both field width and field type, change vmcs_field_type to vmcs_field_width. Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bbfbed714dec..f1d41dd3744c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3894,17 +3894,17 @@ static void free_kvm_area(void) } } -enum vmcs_field_type { - VMCS_FIELD_TYPE_U16 = 0, - VMCS_FIELD_TYPE_U64 = 1, - VMCS_FIELD_TYPE_U32 = 2, - VMCS_FIELD_TYPE_NATURAL_WIDTH = 3 +enum vmcs_field_width { + VMCS_FIELD_WIDTH_U16 = 0, + VMCS_FIELD_WIDTH_U64 = 1, + VMCS_FIELD_WIDTH_U32 = 2, + VMCS_FIELD_WIDTH_NATURAL_WIDTH = 3 }; -static inline int vmcs_field_type(unsigned long field) +static inline int vmcs_field_width(unsigned long field) { if (0x1 & field) /* the *_HIGH fields are all 32 bit */ - return VMCS_FIELD_TYPE_U32; + return VMCS_FIELD_WIDTH_U32; return (field >> 13) & 0x3 ; } @@ -3919,7 +3919,7 @@ static void init_vmcs_shadow_fields(void) for (i = j = 0; i < max_shadow_read_only_fields; i++) { u16 field = shadow_read_only_fields[i]; - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_only_fields || shadow_read_only_fields[i + 1] != field + 1)) pr_err("Missing field from shadow_read_only_field %x\n", @@ -3938,7 +3938,7 @@ static void init_vmcs_shadow_fields(void) for (i = j = 0; i < max_shadow_read_write_fields; i++) { u16 field = shadow_read_write_fields[i]; - if (vmcs_field_type(field) == VMCS_FIELD_TYPE_U64 && + if (vmcs_field_width(field) == VMCS_FIELD_WIDTH_U64 && (i + 1 == max_shadow_read_write_fields || shadow_read_write_fields[i + 1] != field + 1)) pr_err("Missing field from shadow_read_write_field %x\n", @@ -7511,17 +7511,17 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu, p = ((char *)(get_vmcs12(vcpu))) + offset; - 
switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_NATURAL_WIDTH: + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *ret = *((natural_width *)p); return 0; - case VMCS_FIELD_TYPE_U16: + case VMCS_FIELD_WIDTH_U16: *ret = *((u16 *)p); return 0; - case VMCS_FIELD_TYPE_U32: + case VMCS_FIELD_WIDTH_U32: *ret = *((u32 *)p); return 0; - case VMCS_FIELD_TYPE_U64: + case VMCS_FIELD_WIDTH_U64: *ret = *((u64 *)p); return 0; default: @@ -7538,17 +7538,17 @@ static inline int vmcs12_write_any(struct kvm_vcpu *vcpu, if (offset < 0) return offset; - switch (vmcs_field_type(field)) { - case VMCS_FIELD_TYPE_U16: + switch (vmcs_field_width(field)) { + case VMCS_FIELD_WIDTH_U16: *(u16 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_U32: + case VMCS_FIELD_WIDTH_U32: *(u32 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_U64: + case VMCS_FIELD_WIDTH_U64: *(u64 *)p = field_value; return 0; - case VMCS_FIELD_TYPE_NATURAL_WIDTH: + case VMCS_FIELD_WIDTH_NATURAL_WIDTH: *(natural_width *)p = field_value; return 0; default: -- cgit v1.2.3 From 58e9ffae5efd3ee9b655ee36702f02c36d51ead9 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 22 Dec 2017 12:13:13 -0800 Subject: kvm: vmx: Reduce size of vmcs_field_to_offset_table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The vmcs_field_to_offset_table was a rather sparse table of short integers with a maximum index of 0x6c16, amounting to 55342 bytes. Now that we are considering support for multiple VMCS12 formats, it would be unfortunate to replicate that large, sparse table. Rotating the field encoding (as a 16-bit integer) left by 6 reduces that table to 5926 bytes. Suggested-by: Paolo Bonzini Signed-off-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f1d41dd3744c..d49bb747ebea 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -686,10 +686,12 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) return &(to_vmx(vcpu)->pi_desc); } +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) -#define FIELD(number, name) [number] = VMCS12_OFFSET(name) -#define FIELD64(number, name) [number] = VMCS12_OFFSET(name), \ - [number##_HIGH] = VMCS12_OFFSET(name)+4 +#define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) +#define FIELD64(number, name) \ + FIELD(number, name), \ + [ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32) static u16 shadow_read_only_fields[] = { @@ -908,9 +910,13 @@ static const unsigned short vmcs_field_to_offset_table[] = { static inline short vmcs_field_to_offset(unsigned long field) { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + unsigned index; + + if (field >> 15) + return -ENOENT; - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + index = ROL16(field, 6); + if (index >= ARRAY_SIZE(vmcs_field_to_offset_table)) return -ENOENT; /* @@ -919,10 +925,10 @@ static inline short vmcs_field_to_offset(unsigned long field) */ asm("lfence"); - if (vmcs_field_to_offset_table[field] == 0) + if (vmcs_field_to_offset_table[index] == 0) return -ENOENT; - return vmcs_field_to_offset_table[field]; + return vmcs_field_to_offset_table[index]; } static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 
c9e9deae76b8fbbd48e7357247689016bb0d6242 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 13:16:29 +0100 Subject: KVM: VMX: split list of shadowed VMCS field to a separate file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prepare for multiple inclusions of the list. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 64 +++--------------------------------- arch/x86/kvm/vmx_shadow_fields.h | 71 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 60 deletions(-) create mode 100644 arch/x86/kvm/vmx_shadow_fields.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d49bb747ebea..4530b2ba63b9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -695,71 +695,15 @@ static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) static u16 shadow_read_only_fields[] = { - /* - * We do NOT shadow fields that are modified when L0 - * traps and emulates any vmx instruction (e.g. VMPTRLD, - * VMXON...) executed by L1. - * For example, VM_INSTRUCTION_ERROR is read - * by L1 if a vmx instruction fails (part of the error path). - * Note the code assumes this logic. If for some reason - * we start shadowing these fields then we need to - * force a shadow sync when L0 emulates vmx instructions - * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified - * by nested_vmx_failValid) - */ - /* 32-bits */ - VM_EXIT_REASON, - VM_EXIT_INTR_INFO, - VM_EXIT_INSTRUCTION_LEN, - IDT_VECTORING_INFO_FIELD, - IDT_VECTORING_ERROR_CODE, - VM_EXIT_INTR_ERROR_CODE, - - /* Natural width */ - EXIT_QUALIFICATION, - GUEST_LINEAR_ADDRESS, - - /* 64-bit */ - GUEST_PHYSICAL_ADDRESS, - GUEST_PHYSICAL_ADDRESS_HIGH, +#define SHADOW_FIELD_RO(x) x, +#include "vmx_shadow_fields.h" }; static int max_shadow_read_only_fields = ARRAY_SIZE(shadow_read_only_fields); static u16 shadow_read_write_fields[] = { - /* 16-bits */ - GUEST_CS_SELECTOR, - GUEST_INTR_STATUS, - GUEST_PML_INDEX, - HOST_FS_SELECTOR, - HOST_GS_SELECTOR, - - /* 32-bits */ - CPU_BASED_VM_EXEC_CONTROL, - EXCEPTION_BITMAP, - VM_ENTRY_EXCEPTION_ERROR_CODE, - VM_ENTRY_INTR_INFO_FIELD, - VM_ENTRY_INSTRUCTION_LEN, - TPR_THRESHOLD, - GUEST_CS_LIMIT, - GUEST_CS_AR_BYTES, - GUEST_INTERRUPTIBILITY_INFO, - VMX_PREEMPTION_TIMER_VALUE, - - /* Natural width */ - GUEST_RIP, - GUEST_RSP, - GUEST_CR0, - GUEST_CR3, - GUEST_CR4, - GUEST_RFLAGS, - GUEST_CS_BASE, - GUEST_ES_BASE, - CR0_GUEST_HOST_MASK, - CR0_READ_SHADOW, - CR4_READ_SHADOW, - HOST_FS_BASE, - HOST_GS_BASE, +#define SHADOW_FIELD_RW(x) x, +#include "vmx_shadow_fields.h" }; static int max_shadow_read_write_fields = ARRAY_SIZE(shadow_read_write_fields); diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h new file mode 100644 index 000000000000..31d7a15338ac --- /dev/null +++ b/arch/x86/kvm/vmx_shadow_fields.h @@ -0,0 +1,71 @@ +#ifndef SHADOW_FIELD_RO +#define SHADOW_FIELD_RO(x) +#endif +#ifndef SHADOW_FIELD_RW +#define SHADOW_FIELD_RW(x) +#endif + +/* + * We do NOT shadow fields that are modified when L0 + * traps and emulates any vmx instruction (e.g. VMPTRLD, + * VMXON...) executed by L1. + * For example, VM_INSTRUCTION_ERROR is read + * by L1 if a vmx instruction fails (part of the error path). + * Note the code assumes this logic. If for some reason + * we start shadowing these fields then we need to + * force a shadow sync when L0 emulates vmx instructions + * (e.g. 
force a sync if VM_INSTRUCTION_ERROR is modified + * by nested_vmx_failValid) + * + * Keeping the fields ordered by size is an attempt at improving + * branch prediction in vmcs_read_any and vmcs_write_any. + */ + +/* 16-bits */ +SHADOW_FIELD_RW(GUEST_CS_SELECTOR) +SHADOW_FIELD_RW(GUEST_INTR_STATUS) +SHADOW_FIELD_RW(GUEST_PML_INDEX) +SHADOW_FIELD_RW(HOST_FS_SELECTOR) +SHADOW_FIELD_RW(HOST_GS_SELECTOR) + +/* 32-bits */ +SHADOW_FIELD_RO(VM_EXIT_REASON) +SHADOW_FIELD_RO(VM_EXIT_INTR_INFO) +SHADOW_FIELD_RO(VM_EXIT_INSTRUCTION_LEN) +SHADOW_FIELD_RO(IDT_VECTORING_INFO_FIELD) +SHADOW_FIELD_RO(IDT_VECTORING_ERROR_CODE) +SHADOW_FIELD_RO(VM_EXIT_INTR_ERROR_CODE) +SHADOW_FIELD_RW(CPU_BASED_VM_EXEC_CONTROL) +SHADOW_FIELD_RW(EXCEPTION_BITMAP) +SHADOW_FIELD_RW(VM_ENTRY_EXCEPTION_ERROR_CODE) +SHADOW_FIELD_RW(VM_ENTRY_INTR_INFO_FIELD) +SHADOW_FIELD_RW(VM_ENTRY_INSTRUCTION_LEN) +SHADOW_FIELD_RW(TPR_THRESHOLD) +SHADOW_FIELD_RW(GUEST_CS_LIMIT) +SHADOW_FIELD_RW(GUEST_CS_AR_BYTES) +SHADOW_FIELD_RW(GUEST_INTERRUPTIBILITY_INFO) +SHADOW_FIELD_RW(VMX_PREEMPTION_TIMER_VALUE) + +/* Natural width */ +SHADOW_FIELD_RO(EXIT_QUALIFICATION) +SHADOW_FIELD_RO(GUEST_LINEAR_ADDRESS) +SHADOW_FIELD_RW(GUEST_RIP) +SHADOW_FIELD_RW(GUEST_RSP) +SHADOW_FIELD_RW(GUEST_CR0) +SHADOW_FIELD_RW(GUEST_CR3) +SHADOW_FIELD_RW(GUEST_CR4) +SHADOW_FIELD_RW(GUEST_RFLAGS) +SHADOW_FIELD_RW(GUEST_CS_BASE) +SHADOW_FIELD_RW(GUEST_ES_BASE) +SHADOW_FIELD_RW(CR0_GUEST_HOST_MASK) +SHADOW_FIELD_RW(CR0_READ_SHADOW) +SHADOW_FIELD_RW(CR4_READ_SHADOW) +SHADOW_FIELD_RW(HOST_FS_BASE) +SHADOW_FIELD_RW(HOST_GS_BASE) + +/* 64-bit */ +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS) +SHADOW_FIELD_RO(GUEST_PHYSICAL_ADDRESS_HIGH) + +#undef SHADOW_FIELD_RO +#undef SHADOW_FIELD_RW -- cgit v1.2.3 From 74a497fae754bd970ad0573c26d0a3c5e784fa0c Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 13:55:39 +0100 Subject: KVM: nVMX: track dirty state of non-shadowed VMCS fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit VMCS12 fields that are not handled through shadow VMCS are rarely written, and thus they are also almost constant in the vmcs02. We can thus optimize prepare_vmcs02 by skipping all the work for non-shadowed fields in the common case. This patch introduces the (pretty simple) tracking infrastructure; the next patches will move work to prepare_vmcs02_full and save a few hundred clock cycles per VMRESUME on a Haswell Xeon E5 system: before after cpuid 14159 13869 vmcall 15290 14951 inl_from_kernel 17703 17447 outl_to_kernel 16011 14692 self_ipi_sti_nop 16763 15825 self_ipi_tpr_sti_nop 17341 15935 wr_tsc_adjust_msr 14510 14264 rd_tsc_adjust_msr 15018 14311 mmio-wildcard-eventfd:pci-mem 16381 14947 mmio-datamatch-eventfd:pci-mem 18620 17858 portio-wildcard-eventfd:pci-io 15121 14769 portio-datamatch-eventfd:pci-io 15761 14831 (average savings 748, stdev 460). Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 29 ++++++++++++++++++++++++++++- arch/x86/kvm/vmx_shadow_fields.h | 6 ++++++ 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4530b2ba63b9..1cc787f63972 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -436,6 +436,7 @@ struct nested_vmx { * data hold by vmcs12 */ bool sync_shadow_vmcs; + bool dirty_vmcs12; bool change_vmcs01_virtual_x2apic_mode; /* L2 must run next, and mustn't decide to exit to L1. 
*/ @@ -7623,8 +7624,10 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) { unsigned long field; gva_t gva; + struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); + /* The value to write might be 32 or 64 bits, depending on L1's long * mode, and eventually we need to write that into a field of several * possible lengths. The code below first zero-extends the value to 64 @@ -7667,6 +7670,20 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return kvm_skip_emulated_instruction(vcpu); } + switch (field) { +#define SHADOW_FIELD_RW(x) case x: +#include "vmx_shadow_fields.h" + /* + * The fields that can be updated by L1 without a vmexit are + * always updated in the vmcs02, the others go down the slow + * path of prepare_vmcs02. + */ + break; + default: + vmx->nested.dirty_vmcs12 = true; + break; + } + nested_vmx_succeed(vcpu); return kvm_skip_emulated_instruction(vcpu); } @@ -7681,6 +7698,7 @@ static void set_current_vmptr(struct vcpu_vmx *vmx, gpa_t vmptr) __pa(vmx->vmcs01.shadow_vmcs)); vmx->nested.sync_shadow_vmcs = true; } + vmx->nested.dirty_vmcs12 = true; } /* Emulate the VMPTRLD instruction */ @@ -10297,6 +10315,11 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne return 0; } +static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry) +{ +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -10592,7 +10615,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx_flush_tlb(vcpu, true); } - } if (enable_pml) { @@ -10641,6 +10663,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ vmx_set_efer(vcpu, vcpu->arch.efer); + if (vmx->nested.dirty_vmcs12) { + prepare_vmcs02_full(vcpu, vmcs12, from_vmentry); + vmx->nested.dirty_vmcs12 = false; + } + /* Shadow page tables on either EPT or shadow page tables. */ if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_cpu_has_ept(vmcs12), entry_failure_code)) diff --git a/arch/x86/kvm/vmx_shadow_fields.h b/arch/x86/kvm/vmx_shadow_fields.h index 31d7a15338ac..cd0c75f6d037 100644 --- a/arch/x86/kvm/vmx_shadow_fields.h +++ b/arch/x86/kvm/vmx_shadow_fields.h @@ -17,6 +17,12 @@ * (e.g. force a sync if VM_INSTRUCTION_ERROR is modified * by nested_vmx_failValid) * + * When adding or removing fields here, note that shadowed + * fields must always be synced by prepare_vmcs02, not just + * prepare_vmcs02_full. + */ + +/* * Keeping the fields ordered by size is an attempt at improving * branch prediction in vmcs_read_any and vmcs_write_any. */ -- cgit v1.2.3 From 8665c3f973203269fdc799da833f28c5dc217523 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 13:56:53 +0100 Subject: KVM: nVMX: initialize descriptor cache fields in prepare_vmcs02_full MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This part is separate for ease of review, because git prefers to move prepare_vmcs02 below the initial long sequence of vmcs_write* operations. 
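The split relies on the dirty_vmcs12 flag added in the previous patch; condensed from the hunks above (not literal kernel code), the fast path it enables looks like this:

	/* in handle_vmwrite(): only non-shadowed fields force the slow path */
	switch (field) {
#define SHADOW_FIELD_RW(x) case x:
#include "vmx_shadow_fields.h"
		/* shadowed fields are always copied into vmcs02 by prepare_vmcs02() */
		break;
	default:
		vmx->nested.dirty_vmcs12 = true;
		break;
	}

	/* in prepare_vmcs02(): skip the rarely-written fields when clean */
	if (vmx->nested.dirty_vmcs12) {
		prepare_vmcs02_full(vcpu, vmcs12, from_vmentry);
		vmx->nested.dirty_vmcs12 = false;
	}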
Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 56 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1cc787f63972..f83f5e7bf87a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10317,28 +10317,10 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, bool from_vmentry) -{ -} - -/* - * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested - * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it - * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 - * guest in a way that will both be appropriate to L1's requests, and our - * needs. In addition to modifying the active vmcs (which is vmcs02), this - * function also has additional necessary side-effects, like setting various - * vcpu->arch fields. - * Returns 0 on success, 1 on failure. Invalid state exit qualification code - * is assigned to entry_failure_code on failure. - */ -static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, - bool from_vmentry, u32 *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 exec_control, vmcs12_exec_ctrl; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); - vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector); vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector); vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector); @@ -10346,7 +10328,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector); vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector); vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit); - vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit); vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit); vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit); @@ -10356,15 +10337,12 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit); vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit); vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes); - vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes); vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes); vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes); vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes); vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes); vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes); - vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); - vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base); vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base); vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base); @@ -10373,6 +10351,40 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); +} + +/* + * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested + * L2 guest. 
L1 has a vmcs for L2 (vmcs12), and this function "merges" it + * with L0's requirements for its guest (a.k.a. vmcs01), so we can run the L2 + * guest in a way that will both be appropriate to L1's requests, and our + * needs. In addition to modifying the active vmcs (which is vmcs02), this + * function also has additional necessary side-effects, like setting various + * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + bool from_vmentry, u32 *entry_failure_code) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + u32 exec_control, vmcs12_exec_ctrl; + + /* + * First, the fields that are shadowed. This must be kept in sync + * with vmx_shadow_fields.h. + */ + + vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); + vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit); + vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes); + vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base); + vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base); + + /* + * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR, + * HOST_FS_BASE, HOST_GS_BASE. + */ if (from_vmentry && (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) { -- cgit v1.2.3 From 25a2e4fe8ef719de564370349dc408ac7263f455 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 14:05:21 +0100 Subject: KVM: nVMX: initialize more non-shadowed fields in prepare_vmcs02_full MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fields are also simple copies of the data in the vmcs12 struct. For some of them, prepare_vmcs02 was skipping the copy when the field was unused. In prepare_vmcs02_full, we copy them always as long as the field exists on the host, because the corresponding execution control might be one of the shadowed fields. Optimization opportunities remain for MSRs that, depending on the entry/exit controls, have to be copied from either the vmcs01 or the vmcs12: EFER (whose value is partly stored in the entry controls too), PAT, DEBUGCTL (and also DR7). Before moving these three and the entry/exit controls to prepare_vmcs02_full, KVM would have to set dirty_vmcs12 on writes to the L1 MSRs. 
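For illustration, the control-dependent selection that keeps these MSRs out of prepare_vmcs02_full looks roughly like this (a sketch of the PAT case only, using the vmcs12 field names above; EFER and DEBUGCTL follow the same pattern):

	/*
	 * Sketch: the guest PAT comes from vmcs12 only when L1 requested
	 * VM_ENTRY_LOAD_IA32_PAT, otherwise the vmcs01 value is kept, so
	 * this write cannot move to prepare_vmcs02_full() until writes to
	 * the L1 MSR also set dirty_vmcs12.
	 */
	if (from_vmentry &&
	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT))
		vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
	else
		vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);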
Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 162 +++++++++++++++++++++++++++-------------------------- 1 file changed, 83 insertions(+), 79 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f83f5e7bf87a..9b9e02c0f25e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10351,6 +10351,88 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base); vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); + + vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); + vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, + vmcs12->guest_pending_dbg_exceptions); + vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); + vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); + + if (nested_cpu_has_xsaves(vmcs12)) + vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); + vmcs_write64(VMCS_LINK_POINTER, -1ull); + + if (cpu_has_vmx_posted_intr()) + vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); + + /* + * Whether page-faults are trapped is determined by a combination of + * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. + * If enable_ept, L0 doesn't care about page faults and we should + * set all of these to L1's desires. However, if !enable_ept, L0 does + * care about (at least some) page faults, and because it is not easy + * (if at all possible?) to merge L0 and L1's desires, we simply ask + * to exit on each and every L2 page fault. This is done by setting + * MASK=MATCH=0 and (see below) EB.PF=1. + * Note that below we don't need special code to set EB.PF beyond the + * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, + * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when + * !enable_ept, EB.PF is 1, so the "or" will always be 1. + */ + vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, + enable_ept ? vmcs12->page_fault_error_code_mask : 0); + vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, + enable_ept ? vmcs12->page_fault_error_code_match : 0); + + /* All VMFUNCs are currently emulated through L0 vmexits. */ + if (cpu_has_vmx_vmfunc()) + vmcs_write64(VM_FUNCTION_CONTROL, 0); + + if (cpu_has_vmx_apicv()) { + vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0); + vmcs_write64(EOI_EXIT_BITMAP1, vmcs12->eoi_exit_bitmap1); + vmcs_write64(EOI_EXIT_BITMAP2, vmcs12->eoi_exit_bitmap2); + vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3); + } + + /* + * Set host-state according to L0's settings (vmcs12 is irrelevant here) + * Some constant fields are set here by vmx_set_constant_host_state(). + * Other fields are different per CPU, and will be set later when + * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. + */ + vmx_set_constant_host_state(vmx); + + /* + * Set the MSR load/store lists to match L0's settings. 
+ */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + set_cr4_guest_host_mask(vmx); + + if (vmx_mpx_supported()) + vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); + + if (enable_vpid) { + if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); + else + vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); + } + + /* + * L1 may access the L2's PDPTR, so save them to construct vmcs12 + */ + if (enable_ept) { + vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); + vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); + vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); + vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); + } } /* @@ -10408,16 +10490,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } else { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); } - vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); vmx_set_rflags(vcpu, vmcs12->guest_rflags); - vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, - vmcs12->guest_pending_dbg_exceptions); - vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp); - vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip); - - if (nested_cpu_has_xsaves(vmcs12)) - vmcs_write64(XSS_EXIT_BITMAP, vmcs12->xss_exit_bitmap); - vmcs_write64(VMCS_LINK_POINTER, -1ull); exec_control = vmcs12->pin_based_vm_exec_control; @@ -10431,7 +10504,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_posted_intr(vmcs12)) { vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; vmx->nested.pi_pending = false; - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_NESTED_VECTOR); } else { exec_control &= ~PIN_BASED_POSTED_INTR; } @@ -10442,25 +10514,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (nested_cpu_has_preemption_timer(vmcs12)) vmx_start_preemption_timer(vcpu); - /* - * Whether page-faults are trapped is determined by a combination of - * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. - * If enable_ept, L0 doesn't care about page faults and we should - * set all of these to L1's desires. However, if !enable_ept, L0 does - * care about (at least some) page faults, and because it is not easy - * (if at all possible?) to merge L0 and L1's desires, we simply ask - * to exit on each and every L2 page fault. This is done by setting - * MASK=MATCH=0 and (see below) EB.PF=1. - * Note that below we don't need special code to set EB.PF beyond the - * "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept, - * vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when - * !enable_ept, EB.PF is 1, so the "or" will always be 1. - */ - vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, - enable_ept ? vmcs12->page_fault_error_code_mask : 0); - vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, - enable_ept ? vmcs12->page_fault_error_code_match : 0); - if (cpu_has_secondary_exec_ctrls()) { exec_control = vmx->secondary_exec_control; @@ -10479,22 +10532,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, exec_control |= vmcs12_exec_ctrl; } - /* All VMFUNCs are currently emulated through L0 vmexits. 
*/ - if (exec_control & SECONDARY_EXEC_ENABLE_VMFUNC) - vmcs_write64(VM_FUNCTION_CONTROL, 0); - - if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) { - vmcs_write64(EOI_EXIT_BITMAP0, - vmcs12->eoi_exit_bitmap0); - vmcs_write64(EOI_EXIT_BITMAP1, - vmcs12->eoi_exit_bitmap1); - vmcs_write64(EOI_EXIT_BITMAP2, - vmcs12->eoi_exit_bitmap2); - vmcs_write64(EOI_EXIT_BITMAP3, - vmcs12->eoi_exit_bitmap3); + if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) vmcs_write16(GUEST_INTR_STATUS, vmcs12->guest_intr_status); - } /* * Write an illegal value to APIC_ACCESS_ADDR. Later, @@ -10507,24 +10547,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } - - /* - * Set host-state according to L0's settings (vmcs12 is irrelevant here) - * Some constant fields are set here by vmx_set_constant_host_state(). - * Other fields are different per CPU, and will be set later when - * vmx_vcpu_load() is called, and when vmx_save_host_state() is called. - */ - vmx_set_constant_host_state(vmx); - - /* - * Set the MSR load/store lists to match L0's settings. - */ - vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); - vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); - vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); - vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); - /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value @@ -10594,12 +10616,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat); } - set_cr4_guest_host_mask(vmx); - - if (from_vmentry && - vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) - vmcs_write64(GUEST_BNDCFGS, vmcs12->guest_bndcfgs); - if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset + vmcs12->tsc_offset); @@ -10618,13 +10634,11 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, * even if spawn a lot of nested vCPUs. 
*/ if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) { vmx->nested.last_vpid = vmcs12->virtual_processor_id; __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02, true); } } else { - vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); vmx_flush_tlb(vcpu, true); } } @@ -10688,16 +10702,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested; - /* - * L1 may access the L2's PDPTR, so save them to construct vmcs12 - */ - if (enable_ept) { - vmcs_write64(GUEST_PDPTR0, vmcs12->guest_pdptr0); - vmcs_write64(GUEST_PDPTR1, vmcs12->guest_pdptr1); - vmcs_write64(GUEST_PDPTR2, vmcs12->guest_pdptr2); - vmcs_write64(GUEST_PDPTR3, vmcs12->guest_pdptr3); - } - kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); return 0; -- cgit v1.2.3 From 07f36616cde482a9fd65da9a64f504190c7a0edb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 1 Jan 2018 22:53:42 +0100 Subject: KVM: nVMX: remove unnecessary vmwrite from L2->L1 vmexit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The POSTED_INTR_NV field is constant (though it differs between the vmcs01 and vmcs02), there is no need to reload it on vmexit to L1. Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9b9e02c0f25e..bbadd8c7e592 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11339,9 +11339,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, */ vmx_flush_tlb(vcpu, true); } - /* Restore posted intr vector. */ - if (nested_cpu_has_posted_intr(vmcs12)) - vmcs_write16(POSTED_INTR_NV, POSTED_INTR_VECTOR); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs); vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp); -- cgit v1.2.3 From 1f6e5b25643e539e55a4c7553e4be6562c88fb76 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 20 Dec 2017 11:32:16 +0100 Subject: KVM: vmx: simplify MSR bitmap setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The APICv-enabled MSR bitmap is a superset of the APICv-disabled bitmap. Make that obvious in vmx_disable_intercept_msr_x2apic. Signed-off-by: Paolo Bonzini [Resolved rebase conflict after removing Intel PT. 
- Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bbadd8c7e592..1f4a3037b99e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5017,14 +5017,13 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) +static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only) { - if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, - msr, type); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, - msr, type); - } else { + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, + msr, type); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, + msr, type); + if (!apicv_only) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, msr, type); __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, @@ -6872,7 +6871,6 @@ static __init int hardware_setup(void) * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. */ - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); /* EOI */ -- cgit v1.2.3 From c992384bde84fb2f0318bdb4f6c710605dd7a217 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 13 Dec 2017 14:16:30 +0100 Subject: KVM: vmx: speed up MSR bitmap merge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bulk of the MSR bitmap is either immutable, or can be copied from the L1 bitmap. By initializing it at VMXON time, and copying the mutable parts one long at a time on vmentry (rather than one bit), about 4000 clock cycles (30%) can be saved on a nested VMLAUNCH/VMRESUME. The resulting for loop only has four iterations, so it is cheap enough to reinitialize the MSR write bitmaps on every iteration, and it makes the code simpler. Suggested-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 78 +++++++++++++++++++++++++++++------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1f4a3037b99e..71b2c9354423 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4972,11 +4972,6 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, { int f = sizeof(unsigned long); - if (!cpu_has_vmx_msr_bitmap()) { - WARN_ON(1); - return; - } - /* * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals * have the write-low and read-high bitmap offsets the wrong way round. 
@@ -7177,6 +7172,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu) (unsigned long *)__get_free_page(GFP_KERNEL); if (!vmx->nested.msr_bitmap) goto out_msr_bitmap; + memset(vmx->nested.msr_bitmap, 0xff, PAGE_SIZE); } vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL); @@ -9844,8 +9840,8 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu, } } -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12); +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12); static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) @@ -9934,11 +9930,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu, (unsigned long)(vmcs12->posted_intr_desc_addr & (PAGE_SIZE - 1))); } - if (cpu_has_vmx_msr_bitmap() && - nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS) && - nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) - ; - else + if (!nested_vmx_prepare_msr_bitmap(vcpu, vmcs12)) vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, CPU_BASED_USE_MSR_BITMAPS); } @@ -10006,14 +9998,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. */ -static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, - struct vmcs12 *vmcs12) +static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, + struct vmcs12 *vmcs12) { int msr; struct page *page; unsigned long *msr_bitmap_l1; unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap; + /* Nothing to do if the MSR bitmap is not in use. */ + if (!cpu_has_vmx_msr_bitmap() || + !nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS)) + return false; + /* This shortcut is ok because we support only x2APIC MSRs so far. */ if (!nested_cpu_has_virt_x2apic_mode(vmcs12)) return false; @@ -10021,32 +10018,41 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap); if (is_error_page(page)) return false; - msr_bitmap_l1 = (unsigned long *)kmap(page); - memset(msr_bitmap_l0, 0xff, PAGE_SIZE); + msr_bitmap_l1 = (unsigned long *)kmap(page); + if (nested_cpu_has_apic_reg_virt(vmcs12)) { + /* + * L0 need not intercept reads for MSRs between 0x800 and 0x8ff, it + * just lets the processor take the value from the virtual-APIC page; + * take those 256 bits directly from the L1 bitmap. 
+ */ + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = msr_bitmap_l1[word]; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } else { + for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) { + unsigned word = msr / BITS_PER_LONG; + msr_bitmap_l0[word] = ~0; + msr_bitmap_l0[word + (0x800 / sizeof(long))] = ~0; + } + } - if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { - if (nested_cpu_has_apic_reg_virt(vmcs12)) - for (msr = 0x800; msr <= 0x8ff; msr++) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - msr, MSR_TYPE_R); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_TASKPRI >> 4), + MSR_TYPE_W); + if (nested_cpu_has_vid(vmcs12)) { nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), - MSR_TYPE_R | MSR_TYPE_W); - - if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_EOI >> 4), - MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), - MSR_TYPE_W); - } + msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_EOI >> 4), + MSR_TYPE_W); + nested_vmx_disable_intercept_for_msr( + msr_bitmap_l1, msr_bitmap_l0, + APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + MSR_TYPE_W); } kunmap(page); kvm_release_page_clean(page); -- cgit v1.2.3 From d7231e75f73f424068ad205358bd8ba6a7a2e113 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Dec 2017 00:47:55 +0100 Subject: KVM: VMX: introduce X2APIC_MSR macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove duplicate expression in nested_vmx_prepare_msr_bitmap, and make the register names clearer in hardware_setup. Suggested-by: Jim Mattson Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini [Resolved rebase conflict after removing Intel PT. - Radim] Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71b2c9354423..1e2ca9e8662f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5012,6 +5012,8 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } +#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4)) + static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_only) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, @@ -6857,7 +6859,7 @@ static __init int hardware_setup(void) set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ for (msr = 0x800; msr <= 0x8ff; msr++) { - if (msr == 0x839 /* TMCCT */) + if (msr == X2APIC_MSR(APIC_TMCCT)) continue; vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); } @@ -6866,12 +6868,9 @@ static __init int hardware_setup(void) * TPR reads and writes can be virtualized even if virtual interrupt * delivery is not in use. 
*/ - vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - - /* EOI */ - vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); - /* SELF-IPI */ - vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W, false); + vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_EOI), MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W, true); if (enable_ept) vmx_enable_tdp(); @@ -10041,17 +10040,17 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_TASKPRI >> 4), + X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_EOI >> 4), + X2APIC_MSR(APIC_EOI), MSR_TYPE_W); nested_vmx_disable_intercept_for_msr( msr_bitmap_l1, msr_bitmap_l0, - APIC_BASE_MSR + (APIC_SELF_IPI >> 4), + X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } kunmap(page); -- cgit v1.2.3 From 89a8f6d4904c8cf3ff8fee9fdaff392a6bbb8bf6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:31 +0100 Subject: x86/hyperv: Check for required privileges in hyperv_init() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In hyperv_init() it's presumed that it always has access to VP index and hypercall MSRs, while according to the specification it should be checked whether it's allowed to access the corresponding MSRs before accessing them. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-2-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 189a398290db..21f9d53d9f00 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -110,12 +110,19 @@ static int hv_cpu_init(unsigned int cpu) */ void hyperv_init(void) { - u64 guest_id; + u64 guest_id, required_msrs; union hv_x64_msr_hypercall_contents hypercall_msr; if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; + /* Absolutely required MSRs */ + required_msrs = HV_X64_MSR_HYPERCALL_AVAILABLE | + HV_X64_MSR_VP_INDEX_AVAILABLE; + + if ((ms_hyperv.features & required_msrs) != required_msrs) + return; + /* Allocate percpu VP index */ hv_vp_index = kmalloc_array(num_possible_cpus(), sizeof(*hv_vp_index), GFP_KERNEL); -- cgit v1.2.3 From e2768eaa1ca4fbb7b778da5615cce3dd310352e6 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:32 +0100 Subject: x86/hyperv: Add a function to read both TSC and TSC page value simultaneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is going to be used from KVM code where both TSC and TSC page value are needed. Nothing is supposed to use the function when Hyper-V code is compiled out, just BUG().
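A sketch of the intended call pattern (the KVM consumer is not part of this patch; hv_get_tsc_page() and hv_read_tsc_page_tsc() are the interfaces provided here):

	struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
	u64 host_tsc, hv_time;

	if (tsc_pg) {
		/*
		 * One consistent snapshot: the Hyper-V reference time plus
		 * the rdtsc_ordered() value it was derived from.
		 */
		hv_time = hv_read_tsc_page_tsc(tsc_pg, &host_tsc);
	}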
Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-3-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 1 + arch/x86/include/asm/mshyperv.h | 23 +++++++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 21f9d53d9f00..1a6c63f721bc 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -37,6 +37,7 @@ struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { return tsc_pg; } +EXPORT_SYMBOL_GPL(hv_get_tsc_page); static u64 read_hv_clock_tsc(struct clocksource *arg) { diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 8bf450b13d9f..6b1d4ea78270 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -325,9 +325,10 @@ static inline void hyperv_setup_mmu_ops(void) {} #ifdef CONFIG_HYPERV_TSCPAGE struct ms_hyperv_tsc_page *hv_get_tsc_page(void); -static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc) { - u64 scale, offset, cur_tsc; + u64 scale, offset; u32 sequence; /* @@ -358,7 +359,7 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) scale = READ_ONCE(tsc_pg->tsc_scale); offset = READ_ONCE(tsc_pg->tsc_offset); - cur_tsc = rdtsc_ordered(); + *cur_tsc = rdtsc_ordered(); /* * Make sure we read sequence after we read all other values @@ -368,7 +369,14 @@ static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); - return mul_u64_u64_shr(cur_tsc, scale, 64) + offset; + return mul_u64_u64_shr(*cur_tsc, scale, 64) + offset; +} + +static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +{ + u64 cur_tsc; + + return hv_read_tsc_page_tsc(tsc_pg, &cur_tsc); } #else @@ -376,5 +384,12 @@ static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) { return NULL; } + +static inline u64 hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc) +{ + BUG(); + return U64_MAX; +} #endif #endif -- cgit v1.2.3 From 93286261de1b46339aa27cd4c639b21778f6cade Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:33 +0100 Subject: x86/hyperv: Reenlightenment notifications support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V supports Live Migration notification. This is supposed to be used in conjunction with TSC emulation: when a VM is migrated to a host with different TSC frequency for some short period the host emulates the accesses to TSC and sends an interrupt to notify about the event. When the guest is done updating everything it can disable TSC emulation and everything will start working fast again. These notifications weren't required until now as Hyper-V guests are not supposed to use TSC as a clocksource: in Linux the TSC is even marked as unstable on boot. Guests normally use 'tsc page' clocksource and host updates its values on migrations automatically. 
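The interfaces added below are meant to be consumed roughly as follows (a sketch; the callback body is hypothetical, only set_hv_tscchange_cb(), clear_hv_tscchange_cb() and hyperv_stop_tsc_emulation() are provided by this patch):

	/*
	 * Hypothetical consumer: refresh frequency-dependent clock state
	 * when the host signals that it is emulating TSC accesses, then
	 * turn the (slow) emulation back off.
	 */
	static void tsc_freq_change_notifier(void)
	{
		/* ... recompute clocks for the guests ... */
		hyperv_stop_tsc_emulation();	/* also re-reads tsc_khz */
	}

	/* registered at init time ... */
	set_hv_tscchange_cb(tsc_freq_change_notifier);
	/* ... and removed on teardown */
	clear_hv_tscchange_cb();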
Things change with nested virtualization: even when the PV clocksources (kvm-clock or tsc page) are passed through to the nested guests, the TSC frequency and frequency changes need to be known. The Hyper-V Top Level Functional Specification (as of v5.0b) wrongly specifies EAX:BIT(12) of CPUID:0x40000009 as the feature identification bit. The right one to check is EAX:BIT(13) of CPUID:0x40000003. I was assured that the fix is on the way. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-4-vkuznets@redhat.com --- arch/x86/entry/entry_32.S | 3 ++ arch/x86/entry/entry_64.S | 3 ++ arch/x86/hyperv/hv_init.c | 89 ++++++++++++++++++++++++++++++++++++++ arch/x86/include/asm/irq_vectors.h | 7 ++- arch/x86/include/asm/mshyperv.h | 9 ++++ arch/x86/include/uapi/asm/hyperv.h | 27 ++++++++++++ arch/x86/kernel/cpu/mshyperv.c | 6 +++ 7 files changed, 143 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2a35b1e0fb90..7a796eeddf99 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -895,6 +895,9 @@ BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR, BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, hyperv_vector_handler) +BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_intr) + #endif /* CONFIG_HYPERV */ ENTRY(page_fault) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a83570495162..553aa49909ce 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1245,6 +1245,9 @@ apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ #if IS_ENABLED(CONFIG_HYPERV) apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \ hyperv_callback_vector hyperv_vector_handler + +apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \ + hyperv_reenlightenment_vector hyperv_reenlightenment_intr #endif /* CONFIG_HYPERV */ idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 1a6c63f721bc..712ac40081f7 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -18,6 +18,8 @@ */ #include +#include +#include #include #include #include @@ -102,6 +104,93 @@ static int hv_cpu_init(unsigned int cpu) return 0; } +static void (*hv_reenlightenment_cb)(void); + +static void hv_reenlightenment_notify(struct work_struct *dummy) +{ + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + /* Don't issue the callback if TSC accesses are not emulated */ + if (hv_reenlightenment_cb && emu_status.inprogress) + hv_reenlightenment_cb(); +} +static DECLARE_DELAYED_WORK(hv_reenlightenment_work, hv_reenlightenment_notify); + +void hyperv_stop_tsc_emulation(void) +{ + u64 freq; + struct hv_tsc_emulation_status emu_status; + + rdmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + emu_status.inprogress = 0; + wrmsrl(HV_X64_MSR_TSC_EMULATION_STATUS, *(u64 *)&emu_status); + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + tsc_khz = div64_u64(freq, 1000); +} +EXPORT_SYMBOL_GPL(hyperv_stop_tsc_emulation); + +static inline bool
hv_reenlightenment_available(void) +{ + /* + * Check for required features and priviliges to make TSC frequency + * change notifications work. + */ + return ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE && + ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT; +} + +__visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) +{ + entering_ack_irq(); + + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); + + exiting_irq(); +} + +void set_hv_tscchange_cb(void (*cb)(void)) +{ + struct hv_reenlightenment_control re_ctrl = { + .vector = HYPERV_REENLIGHTENMENT_VECTOR, + .enabled = 1, + .target_vp = hv_vp_index[smp_processor_id()] + }; + struct hv_tsc_emulation_control emu_ctrl = {.enabled = 1}; + + if (!hv_reenlightenment_available()) { + pr_warn("Hyper-V: reenlightenment support is unavailable\n"); + return; + } + + hv_reenlightenment_cb = cb; + + /* Make sure callback is registered before we write to MSRs */ + wmb(); + + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + wrmsrl(HV_X64_MSR_TSC_EMULATION_CONTROL, *((u64 *)&emu_ctrl)); +} +EXPORT_SYMBOL_GPL(set_hv_tscchange_cb); + +void clear_hv_tscchange_cb(void) +{ + struct hv_reenlightenment_control re_ctrl; + + if (!hv_reenlightenment_available()) + return; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + re_ctrl.enabled = 0; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *(u64 *)&re_ctrl); + + hv_reenlightenment_cb = NULL; +} +EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 67421f649cfa..e71c1120426b 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -103,7 +103,12 @@ #endif #define MANAGED_IRQ_SHUTDOWN_VECTOR 0xef -#define LOCAL_TIMER_VECTOR 0xee + +#if IS_ENABLED(CONFIG_HYPERV) +#define HYPERV_REENLIGHTENMENT_VECTOR 0xee +#endif + +#define LOCAL_TIMER_VECTOR 0xed #define NR_VECTORS 256 diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 6b1d4ea78270..1790002a2052 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -160,6 +160,7 @@ static inline void vmbus_signal_eom(struct hv_message *msg, u32 old_msg_type) #define hv_set_synint_state(int_num, val) wrmsrl(int_num, val) void hyperv_callback_vector(void); +void hyperv_reenlightenment_vector(void); #ifdef CONFIG_TRACING #define trace_hyperv_callback_vector hyperv_callback_vector #endif @@ -316,11 +317,19 @@ void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs, long err); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); + +void hyperv_reenlightenment_intr(struct pt_regs *regs); +void set_hv_tscchange_cb(void (*cb)(void)); +void clear_hv_tscchange_cb(void); +void hyperv_stop_tsc_emulation(void); #else /* CONFIG_HYPERV */ static inline void hyperv_init(void) {} static inline bool hv_is_hypercall_page_setup(void) { return false; } static inline void hyperv_cleanup(void) {} static inline void hyperv_setup_mmu_ops(void) {} +static inline void set_hv_tscchange_cb(void (*cb)(void)) {} +static inline void clear_hv_tscchange_cb(void) {} +static inline void hyperv_stop_tsc_emulation(void) {}; #endif /* CONFIG_HYPERV */ #ifdef CONFIG_HYPERV_TSCPAGE diff --git a/arch/x86/include/uapi/asm/hyperv.h b/arch/x86/include/uapi/asm/hyperv.h index 
1a5bfead93b4..197c2e6c7376 100644 --- a/arch/x86/include/uapi/asm/hyperv.h +++ b/arch/x86/include/uapi/asm/hyperv.h @@ -40,6 +40,9 @@ */ #define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) +/* AccessReenlightenmentControls privilege */ +#define HV_X64_ACCESS_REENLIGHTENMENT BIT(13) + /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available @@ -234,6 +237,30 @@ #define HV_X64_MSR_CRASH_PARAMS \ (1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0)) +/* TSC emulation after migration */ +#define HV_X64_MSR_REENLIGHTENMENT_CONTROL 0x40000106 + +struct hv_reenlightenment_control { + u64 vector:8; + u64 reserved1:8; + u64 enabled:1; + u64 reserved2:15; + u64 target_vp:32; +}; + +#define HV_X64_MSR_TSC_EMULATION_CONTROL 0x40000107 +#define HV_X64_MSR_TSC_EMULATION_STATUS 0x40000108 + +struct hv_tsc_emulation_control { + u64 enabled:1; + u64 reserved:63; +}; + +struct hv_tsc_emulation_status { + u64 inprogress:1; + u64 reserved:63; +}; + #define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \ diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 85eb5fc180c8..9340f41ce8d3 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -251,6 +251,12 @@ static void __init ms_hyperv_init_platform(void) hyperv_setup_mmu_ops(); /* Setup the IDT for hypervisor callback */ alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector); + + /* Setup the IDT for reenlightenment notifications */ + if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) + alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR, + hyperv_reenlightenment_vector); + #endif } -- cgit v1.2.3 From e7c4e36c447daca2b7df49024f6bf230871cb155 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:34 +0100 Subject: x86/hyperv: Redirect reenlightment notifications on CPU offlining MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is very unlikely for CPUs to get offlined when running on Hyper-V as there is a protection in the vmbus module which prevents it when the guest has any VMBus devices assigned. This, however, may change in future if an option to reassign an already active channel will be added. It is also possible to run without any Hyper-V devices or to have a CPU with no assigned channels. Reassign reenlightenment notifications to some other active CPU when the CPU which is assigned to them goes offline. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. 
Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-5-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 712ac40081f7..e4377e2f2a10 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -191,6 +191,26 @@ void clear_hv_tscchange_cb(void) } EXPORT_SYMBOL_GPL(clear_hv_tscchange_cb); +static int hv_cpu_die(unsigned int cpu) +{ + struct hv_reenlightenment_control re_ctrl; + unsigned int new_cpu; + + if (hv_reenlightenment_cb == NULL) + return 0; + + rdmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + if (re_ctrl.target_vp == hv_vp_index[cpu]) { + /* Reassign to some other online CPU */ + new_cpu = cpumask_any_but(cpu_online_mask, cpu); + + re_ctrl.target_vp = hv_vp_index[new_cpu]; + wrmsrl(HV_X64_MSR_REENLIGHTENMENT_CONTROL, *((u64 *)&re_ctrl)); + } + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -220,7 +240,7 @@ void hyperv_init(void) return; if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", - hv_cpu_init, NULL) < 0) + hv_cpu_init, hv_cpu_die) < 0) goto free_vp_index; /* -- cgit v1.2.3 From 51d4e5daa32808df4d50db511d167fde19fa114e Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:35 +0100 Subject: x86/irq: Count Hyper-V reenlightenment interrupts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hyper-V reenlightenment interrupts arrive when the VM is migrated, While they are not interesting in general it's important when L2 nested guests are running. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: Paolo Bonzini Cc: "K. Y. 
Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-6-vkuznets@redhat.com --- arch/x86/hyperv/hv_init.c | 2 ++ arch/x86/include/asm/hardirq.h | 3 +++ arch/x86/kernel/irq.c | 9 +++++++++ 3 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index e4377e2f2a10..a3adece392f1 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -147,6 +147,8 @@ __visible void __irq_entry hyperv_reenlightenment_intr(struct pt_regs *regs) { entering_ack_irq(); + inc_irq_stat(irq_hv_reenlightenment_count); + schedule_delayed_work(&hv_reenlightenment_work, HZ/10); exiting_irq(); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 51cc979dd364..7c341a74ec8c 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -38,6 +38,9 @@ typedef struct { #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN) unsigned int irq_hv_callback_count; #endif +#if IS_ENABLED(CONFIG_HYPERV) + unsigned int irq_hv_reenlightenment_count; +#endif } ____cacheline_aligned irq_cpustat_t; DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 68e1867cca80..45fb4d2565f8 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -141,6 +141,15 @@ int arch_show_interrupts(struct seq_file *p, int prec) irq_stats(j)->irq_hv_callback_count); seq_puts(p, " Hypervisor callback interrupts\n"); } +#endif +#if IS_ENABLED(CONFIG_HYPERV) + if (test_bit(HYPERV_REENLIGHTENMENT_VECTOR, system_vectors)) { + seq_printf(p, "%*s: ", prec, "HRE"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", + irq_stats(j)->irq_hv_reenlightenment_count); + seq_puts(p, " Hyper-V reenlightenment interrupts\n"); + } #endif seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); #if defined(CONFIG_X86_IO_APIC) -- cgit v1.2.3 From b0c39dc68e3b3d22bf9d2984f62f6c86788a49e7 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:36 +0100 Subject: x86/kvm: Pass stable clocksource to guests when running nested on Hyper-V MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, KVM is able to work in 'masterclock' mode passing PVCLOCK_TSC_STABLE_BIT to guests when the clocksource which is used on the host is TSC. When running nested on Hyper-V the guest normally uses a different one: TSC page which is resistant to TSC frequency changes on events like L1 migration. Add support for it in KVM. The only non-trivial change is in vgettsc(): when updating the gtod copy both the clock readout and tsc value have to be updated now. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. 
Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-7-vkuznets@redhat.com --- arch/x86/kvm/x86.c | 93 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 68 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c53298dfbf50..b1ce368a07af 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -67,6 +67,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -1377,6 +1378,11 @@ static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) return tsc; } +static inline int gtod_is_based_on_tsc(int mode) +{ + return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK; +} + static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) { #ifdef CONFIG_X86_64 @@ -1396,7 +1402,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu) * perform request to enable masterclock. */ if (ka->use_master_clock || - (gtod->clock.vclock_mode == VCLOCK_TSC && vcpus_matched)) + (gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched)) kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc, @@ -1459,6 +1465,19 @@ static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) vcpu->arch.tsc_offset = offset; } +static inline bool kvm_check_tsc_unstable(void) +{ +#ifdef CONFIG_X86_64 + /* + * TSC is marked unstable when we're running on Hyper-V, + * 'TSC page' clocksource is good. + */ + if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK) + return false; +#endif + return check_tsc_unstable(); +} + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1504,7 +1523,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) */ if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { - if (!check_tsc_unstable()) { + if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { @@ -1604,18 +1623,43 @@ static u64 read_tsc(void) return last; } -static inline u64 vgettsc(u64 *cycle_now) +static inline u64 vgettsc(u64 *tsc_timestamp, int *mode) { long v; struct pvclock_gtod_data *gtod = &pvclock_gtod_data; + u64 tsc_pg_val; + + switch (gtod->clock.vclock_mode) { + case VCLOCK_HVCLOCK: + tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(), + tsc_timestamp); + if (tsc_pg_val != U64_MAX) { + /* TSC page valid */ + *mode = VCLOCK_HVCLOCK; + v = (tsc_pg_val - gtod->clock.cycle_last) & + gtod->clock.mask; + } else { + /* TSC page invalid */ + *mode = VCLOCK_NONE; + } + break; + case VCLOCK_TSC: + *mode = VCLOCK_TSC; + *tsc_timestamp = read_tsc(); + v = (*tsc_timestamp - gtod->clock.cycle_last) & + gtod->clock.mask; + break; + default: + *mode = VCLOCK_NONE; + } - *cycle_now = read_tsc(); + if (*mode == VCLOCK_NONE) + *tsc_timestamp = v = 0; - v = (*cycle_now - gtod->clock.cycle_last) & gtod->clock.mask; return v * gtod->clock.mult; } -static int do_monotonic_boot(s64 *t, u64 *cycle_now) +static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1624,9 +1668,8 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) do { seq = read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; ns = gtod->nsec_base; - ns += vgettsc(cycle_now); + ns += vgettsc(tsc_timestamp, &mode); ns >>= gtod->clock.shift; ns += gtod->boot_ns; } while (unlikely(read_seqcount_retry(>od->seq, 
seq))); @@ -1635,7 +1678,7 @@ static int do_monotonic_boot(s64 *t, u64 *cycle_now) return mode; } -static int do_realtime(struct timespec *ts, u64 *cycle_now) +static int do_realtime(struct timespec *ts, u64 *tsc_timestamp) { struct pvclock_gtod_data *gtod = &pvclock_gtod_data; unsigned long seq; @@ -1644,10 +1687,9 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now) do { seq = read_seqcount_begin(>od->seq); - mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ns = gtod->nsec_base; - ns += vgettsc(cycle_now); + ns += vgettsc(tsc_timestamp, &mode); ns >>= gtod->clock.shift; } while (unlikely(read_seqcount_retry(>od->seq, seq))); @@ -1657,25 +1699,26 @@ static int do_realtime(struct timespec *ts, u64 *cycle_now) return mode; } -/* returns true if host is using tsc clocksource */ -static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *cycle_now) +/* returns true if host is using TSC based clocksource */ +static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) { /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return do_monotonic_boot(kernel_ns, cycle_now) == VCLOCK_TSC; + return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns, + tsc_timestamp)); } -/* returns true if host is using tsc clocksource */ +/* returns true if host is using TSC based clocksource */ static bool kvm_get_walltime_and_clockread(struct timespec *ts, - u64 *cycle_now) + u64 *tsc_timestamp) { /* checked again under seqlock below */ - if (pvclock_gtod_data.clock.vclock_mode != VCLOCK_TSC) + if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode)) return false; - return do_realtime(ts, cycle_now) == VCLOCK_TSC; + return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp)); } #endif @@ -2869,13 +2912,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (check_tsc_unstable()) { + if (kvm_check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_vcpu_write_tsc_offset(vcpu, offset); @@ -6110,9 +6153,9 @@ static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, update_pvclock_gtod(tk); /* disable master clock if host does not trust, or does not - * use, TSC clocksource + * use, TSC based clocksource. 
*/ - if (gtod->clock.vclock_mode != VCLOCK_TSC && + if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) && atomic_read(&kvm_guest_has_master_clock) != 0) queue_work(system_long_wq, &pvclock_gtod_work); @@ -7767,7 +7810,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, { struct kvm_vcpu *vcpu; - if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) + if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0) printk_once(KERN_WARNING "kvm: SMP vm created on host with unstable TSC; " "guest TSC will not be reliable\n"); @@ -7924,7 +7967,7 @@ int kvm_arch_hardware_enable(void) return ret; local_tsc = rdtsc(); - stable = !check_tsc_unstable(); + stable = !kvm_check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { if (!stable && vcpu->cpu == smp_processor_id()) -- cgit v1.2.3 From 0092e4346f49558e5fe5a927c6d78d401dc4ed73 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Jan 2018 14:23:37 +0100 Subject: x86/kvm: Support Hyper-V reenlightenment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When running nested KVM on Hyper-V guests its required to update masterclocks for all guests when L1 migrates to a host with different TSC frequency. Implement the procedure in the following way: - Pause all guests. - Tell the host (Hyper-V) to stop emulating TSC accesses. - Update the gtod copy, recompute clocks. - Unpause all guests. This is somewhat similar to cpufreq but there are two important differences: - TSC emulation can only be disabled globally (on all CPUs) - The new TSC frequency is not known until emulation is turned off so there is no way to 'prepare' for the event upfront. Signed-off-by: Vitaly Kuznetsov Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. 
Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal Link: https://lkml.kernel.org/r/20180124132337.30138-8-vkuznets@redhat.com --- arch/x86/kvm/x86.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b1ce368a07af..879a99987401 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -68,6 +68,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -5932,6 +5933,43 @@ static void tsc_khz_changed(void *data) __this_cpu_write(cpu_tsc_khz, khz); } +static void kvm_hyperv_tsc_notifier(void) +{ +#ifdef CONFIG_X86_64 + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int cpu; + + spin_lock(&kvm_lock); + list_for_each_entry(kvm, &vm_list, vm_list) + kvm_make_mclock_inprogress_request(kvm); + + hyperv_stop_tsc_emulation(); + + /* TSC frequency always matches when on Hyper-V */ + for_each_present_cpu(cpu) + per_cpu(cpu_tsc_khz, cpu) = tsc_khz; + kvm_max_guest_tsc_khz = tsc_khz; + + list_for_each_entry(kvm, &vm_list, vm_list) { + struct kvm_arch *ka = &kvm->arch; + + spin_lock(&ka->pvclock_gtod_sync_lock); + + pvclock_update_vm_gtod_copy(kvm); + + kvm_for_each_vcpu(cpu, vcpu, kvm) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); + + kvm_for_each_vcpu(cpu, vcpu, kvm) + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); + + spin_unlock(&ka->pvclock_gtod_sync_lock); + } + spin_unlock(&kvm_lock); +#endif +} + static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { @@ -6217,6 +6255,9 @@ int kvm_arch_init(void *opaque) kvm_lapic_init(); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); + + if (x86_hyper_type == X86_HYPER_MS_HYPERV) + set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif return 0; @@ -6229,6 +6270,10 @@ out: void kvm_arch_exit(void) { +#ifdef CONFIG_X86_64 + if (x86_hyper_type == X86_HYPER_MS_HYPERV) + clear_hv_tscchange_cb(); +#endif kvm_lapic_exit(); perf_unregister_guest_info_callbacks(&kvm_guest_cbs); -- cgit v1.2.3 From 5fa4ec9cb2e6679e2f828033726f758ea314b9c5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 31 Jan 2018 09:41:40 +0100 Subject: x86/kvm: Make it compile on 32bit and with HYPYERVISOR_GUEST=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reenlightment support for hyperv slapped a direct reference to x86_hyper_type into the kvm code which results in the following build failure when CONFIG_HYPERVISOR_GUEST=n: arch/x86/kvm/x86.c:6259:6: error: ‘x86_hyper_type’ undeclared (first use in this function) arch/x86/kvm/x86.c:6259:6: note: each undeclared identifier is reported only once for each function it appears in Use the proper helper function to cure that. The 32bit compile fails because of: arch/x86/kvm/x86.c:5936:13: warning: ‘kvm_hyperv_tsc_notifier’ defined but not used [-Wunused-function] which is a real trainwreck engineering artwork. The callsite is wrapped into #ifdef CONFIG_X86_64, but the function itself has the #ifdef inside the function body. Make the function itself wrapped into the ifdef to cure that. Qualiteee.... Fixes: 0092e4346f49 ("x86/kvm: Support Hyper-V reenlightenment") Reported-by: Ingo Molnar Signed-off-by: Thomas Gleixner Cc: Vitaly Kuznetsov Cc: Paolo Bonzini Cc: Stephen Hemminger Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Haiyang Zhang Cc: "Michael Kelley (EOSG)" Cc: Roman Kagan Cc: Andy Lutomirski Cc: devel@linuxdriverproject.org Cc: "K. Y. 
Srinivasan" Cc: Cathy Avery Cc: Mohammed Gamal --- arch/x86/kvm/x86.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 879a99987401..cd3b3bc67c5a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5933,9 +5933,9 @@ static void tsc_khz_changed(void *data) __this_cpu_write(cpu_tsc_khz, khz); } +#ifdef CONFIG_X86_64 static void kvm_hyperv_tsc_notifier(void) { -#ifdef CONFIG_X86_64 struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; @@ -5967,8 +5967,8 @@ static void kvm_hyperv_tsc_notifier(void) spin_unlock(&ka->pvclock_gtod_sync_lock); } spin_unlock(&kvm_lock); -#endif } +#endif static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -6256,7 +6256,7 @@ int kvm_arch_init(void *opaque) #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier); - if (x86_hyper_type == X86_HYPER_MS_HYPERV) + if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); #endif @@ -6271,7 +6271,7 @@ out: void kvm_arch_exit(void) { #ifdef CONFIG_X86_64 - if (x86_hyper_type == X86_HYPER_MS_HYPERV) + if (hypervisor_is_type(X86_HYPER_MS_HYPERV)) clear_hv_tscchange_cb(); #endif kvm_lapic_exit(); -- cgit v1.2.3 From d391f1207067268261add0485f0f34503539c5b0 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Thu, 25 Jan 2018 16:37:07 +0100 Subject: x86/kvm/vmx: do not use vm-exit instruction length for fast MMIO when running nested MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I was investigating an issue with seabios >= 1.10 which stopped working for nested KVM on Hyper-V. The problem appears to be in handle_ept_violation() function: when we do fast mmio we need to skip the instruction so we do kvm_skip_emulated_instruction(). This, however, depends on VM_EXIT_INSTRUCTION_LEN field being set correctly in VMCS. However, this is not the case. Intel's manual doesn't mandate VM_EXIT_INSTRUCTION_LEN to be set when EPT MISCONFIG occurs. While on real hardware it was observed to be set, some hypervisors follow the spec and don't set it; we end up advancing IP with some random value. I checked with Microsoft and they confirmed they don't fill VM_EXIT_INSTRUCTION_LEN on EPT MISCONFIG. Fix the issue by doing instruction skip through emulator when running nested. Fixes: 68c3b4d1676d870f0453c31d5a52e7e65c7448ae Suggested-by: Radim Krčmář Suggested-by: Paolo Bonzini Signed-off-by: Vitaly Kuznetsov Acked-by: Michael S. Tsirkin Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 16 +++++++++++++++- arch/x86/kvm/x86.c | 3 ++- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1e2ca9e8662f..438802d0b01d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6577,7 +6577,21 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) if (!is_guest_mode(vcpu) && !kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { trace_kvm_fast_mmio(gpa); - return kvm_skip_emulated_instruction(vcpu); + /* + * Doing kvm_skip_emulated_instruction() depends on undefined + * behavior: Intel's manual doesn't mandate + * VM_EXIT_INSTRUCTION_LEN to be set in VMCS when EPT MISCONFIG + * occurs and while on real hardware it was observed to be set, + * other hypervisors (namely Hyper-V) don't set it, we end up + * advancing IP with some random value. 
Disable fast mmio when + * running nested and keep it for real hardware in hope that + * VM_EXIT_INSTRUCTION_LEN will always be set correctly. + */ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return kvm_skip_emulated_instruction(vcpu); + else + return x86_emulate_instruction(vcpu, gpa, EMULTYPE_SKIP, + NULL, 0) == EMULATE_DONE; } ret = kvm_mmu_page_fault(vcpu, gpa, PFERR_RSVD_MASK, NULL, 0); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0204b2b8a293..01571102b50c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5773,7 +5773,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, * handle watchpoints yet, those would be handled in * the emulate_ops. */ - if (kvm_vcpu_check_breakpoint(vcpu, &r)) + if (!(emulation_type & EMULTYPE_SKIP) && + kvm_vcpu_check_breakpoint(vcpu, &r)) return r; ctxt->interruptibility = 0; -- cgit v1.2.3 From 806793f5f76eb9c000ed05e307a8f7e9450b3291 Mon Sep 17 00:00:00 2001 From: Stanislav Lanci Date: Mon, 29 Jan 2018 11:39:44 -0500 Subject: KVM: x86: AMD Processor Topology Information MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch allows enabling the x86 feature TOPOEXT. This is needed to provide information about SMT on AMD Zen CPUs to the guest. Signed-off-by: Stanislav Lanci Tested-by: Nick Sarnie Reviewed-by: Paolo Bonzini Signed-off-by: Babu Moger Signed-off-by: Radim Krčmář --- arch/x86/kvm/cpuid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ac0041c2f5af..20e491b94f44 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -371,7 +371,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | - 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM) | + F(TOPOEXT); /* cpuid 0xC0000001.edx */ const u32 kvm_cpuid_C000_0001_edx_x86_features = -- cgit v1.2.3 From 87cedc6be55954c6efd6eca2e694132513f65a2a Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" Date: Fri, 26 Jan 2018 17:34:08 +0800 Subject: kvm: x86: remove efer_reload entry in kvm_vcpu_stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The efer_reload is never used since commit 26bb0981b3ff ("KVM: VMX: Use shared msr infrastructure"), so remove it. 
Signed-off-by: Longpeng(Mike) Signed-off-by: Radim Krčmář --- arch/x86/include/asm/kvm_host.h | 1 - arch/x86/kvm/x86.c | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index ea7e40e9c1f0..dd6f57a54a26 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -895,7 +895,6 @@ struct kvm_vcpu_stat { u64 request_irq_exits; u64 irq_exits; u64 host_state_reload; - u64 efer_reload; u64 fpu_reload; u64 insn_emulation; u64 insn_emulation_fail; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 01571102b50c..c13cd14c4780 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -177,7 +177,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "request_irq", VCPU_STAT(request_irq_exits) }, { "irq_exits", VCPU_STAT(irq_exits) }, { "host_state_reload", VCPU_STAT(host_state_reload) }, - { "efer_reload", VCPU_STAT(efer_reload) }, { "fpu_reload", VCPU_STAT(fpu_reload) }, { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, -- cgit v1.2.3 From 8dbfb2bf1bb3848a8069164e205635b2675c24fe Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 20 Dec 2017 16:24:27 -0800 Subject: KVM: x86: don't forget vcpu_put() in kvm_arch_vcpu_ioctl_set_sregs() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Due to a bad merge resolution between commit f29810335965 ("KVM/x86: Check input paging mode when cs.l is set") and commit b4ef9d4e8cb8 ("KVM: Move vcpu_load to arch-specific kvm_arch_vcpu_ioctl_set_sregs"), there is a case in kvm_arch_vcpu_ioctl_set_sregs() where vcpu_put() is not called after vcpu_load(). Fix it. Reported-by: syzbot Signed-off-by: Eric Biggers Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0e27ee573bd5..07d1c7f66d97 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7706,7 +7706,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, goto out; if (kvm_valid_sregs(vcpu, sregs)) - return -EINVAL; + goto out; apic_base_msr.data = sregs->apic_base; apic_base_msr.host_initiated = true; -- cgit v1.2.3
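As an illustration of the pattern behind that last fix, here is a minimal standalone C sketch of the vcpu_load()/vcpu_put() pairing: every early exit taken after vcpu_load() has to funnel through the shared out label so that vcpu_put() is never skipped. The struct vcpu, the empty load/put stubs and the kvm_valid_check() helper below are simplified, hypothetical stand-ins for the real KVM internals, not the actual kernel definitions.

#include <errno.h>

struct vcpu { int id; };

/* Hypothetical stand-ins: the real vcpu_load()/vcpu_put() pin and release the vCPU state. */
static void vcpu_load(struct vcpu *vcpu) { (void)vcpu; }
static void vcpu_put(struct vcpu *vcpu) { (void)vcpu; }

/* Hypothetical validity check: returns 0 when the requested state is acceptable. */
static int kvm_valid_check(const struct vcpu *vcpu) { (void)vcpu; return 0; }

static int vcpu_ioctl_example(struct vcpu *vcpu)
{
	int ret = -EINVAL;

	vcpu_load(vcpu);

	if (kvm_valid_check(vcpu))
		goto out;	/* a bare 'return -EINVAL;' here would skip vcpu_put() */

	/* ... apply the requested register state ... */
	ret = 0;

out:
	vcpu_put(vcpu);
	return ret;
}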