From 7fe864dc942c041cc4f56e287c4025d54a8e6c1e Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:03 +0300 Subject: KVM: x86: Mark VEX-prefix instructions emulation as unimplemented Currently the emulator does not recognize vex-prefix instructions. However, it may incorrectly decode lgdt/lidt instructions and try to execute them. This patch returns unhandlable error on their emulation. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e4e833d3d7d7..8ec4a3ebf403 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4314,6 +4314,13 @@ done_prefixes: if (ctxt->d & ModRM) ctxt->modrm = insn_fetch(u8, ctxt); + /* vex-prefix instructions are not implemented */ + if (ctxt->opcode_len == 1 && (ctxt->b == 0xc5 || ctxt->b == 0xc4) && + (mode == X86EMUL_MODE_PROT64 || + (mode >= X86EMUL_MODE_PROT16 && (ctxt->modrm & 0x80)))) { + ctxt->d = NotImpl; + } + while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: -- cgit v1.2.3 From e37a75a13cdae5deaa2ea2cbf8d55b5dd08638b6 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:04 +0300 Subject: KVM: x86: Emulator ignores LDTR/TR extended base on LLDT/LTR The current implementation ignores the LDTR/TR base high 32-bits on long-mode. As a result the loaded segment descriptor may be incorrect. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8ec4a3ebf403..136088fb038b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1422,6 +1422,7 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ulong desc_addr; int ret; u16 dummy; + u32 base3 = 0; memset(&seg_desc, 0, sizeof seg_desc); @@ -1538,9 +1539,14 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt, ret = write_segment_descriptor(ctxt, selector, &seg_desc); if (ret != X86EMUL_CONTINUE) return ret; + } else if (ctxt->mode == X86EMUL_MODE_PROT64) { + ret = ctxt->ops->read_std(ctxt, desc_addr+8, &base3, + sizeof(base3), &ctxt->exception); + if (ret != X86EMUL_CONTINUE) + return ret; } load: - ctxt->ops->set_segment(ctxt, selector, &seg_desc, 0, seg); + ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); -- cgit v1.2.3 From 2eedcac8a97cef43c9c5236398fc8c9d0fd9cc0c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:05 +0300 Subject: KVM: x86: Loading segments on 64-bit mode may be wrong The current emulator implementation ignores the high 32 bits of the base in long-mode. During segment load from the LDT, the base of the LDT is calculated incorrectly and may cause the wrong segment to be loaded. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 136088fb038b..7e4a45cab400 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1358,17 +1358,19 @@ static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { const struct x86_emulate_ops *ops = ctxt->ops; + u32 base3 = 0; if (selector & 1 << 2) { struct desc_struct desc; u16 sel; memset (dt, 0, sizeof *dt); - if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) + if (!ops->get_segment(ctxt, &sel, &desc, &base3, + VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ - dt->address = get_desc_base(&desc); + dt->address = get_desc_base(&desc) | ((u64)base3 << 32); } else ops->get_gdt(ctxt, dt); } -- cgit v1.2.3 From 606b1c3e87597c2d6c9f3eb833a7251262390295 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:06 +0300 Subject: KVM: x86: sgdt and sidt are not privilaged The SGDT and SIDT instructions are not privilaged, i.e. they can be executed with CPL>0. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 7e4a45cab400..a16bf225cab0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3642,8 +3642,8 @@ static const struct opcode group6[] = { }; static const struct group_dual group7 = { { - II(Mov | DstMem | Priv, em_sgdt, sgdt), - II(Mov | DstMem | Priv, em_sidt, sidt), + II(Mov | DstMem, em_sgdt, sgdt), + II(Mov | DstMem, em_sidt, sidt), II(SrcMem | Priv, em_lgdt, lgdt), II(SrcMem | Priv, em_lidt, lidt), II(SrcNone | DstMem | Mov, em_smsw, smsw), N, -- cgit v1.2.3 From 37c564f2854bf75969d0ac26e03f5cf2bb7d639f Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:07 +0300 Subject: KVM: x86: cmpxchg emulation should compare in reverse order The current implementation of cmpxchg does not update the flags correctly, since the accumulator should be compared with the destination and not the other way around. The current implementation does not update the flags correctly. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a16bf225cab0..4cb0da660283 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2052,8 +2052,10 @@ static int em_ret_far_imm(struct x86_emulate_ctxt *ctxt) static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) { /* Save real source value, then compare EAX against destination. */ + ctxt->dst.orig_val = ctxt->dst.val; + ctxt->dst.val = reg_read(ctxt, VCPU_REGS_RAX); ctxt->src.orig_val = ctxt->src.val; - ctxt->src.val = reg_read(ctxt, VCPU_REGS_RAX); + ctxt->src.val = ctxt->dst.orig_val; fastop(ctxt, em_cmp); if (ctxt->eflags & EFLG_ZF) { @@ -2063,6 +2065,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt) /* Failure: write the value we saw to EAX. */ ctxt->dst.type = OP_REG; ctxt->dst.addr.reg = reg_rmw(ctxt, VCPU_REGS_RAX); + ctxt->dst.val = ctxt->dst.orig_val; } return X86EMUL_CONTINUE; } -- cgit v1.2.3 From 3b32004a66e96e17d2a031c08d3304245c506dfc Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:08 +0300 Subject: KVM: x86: movnti minimum op size of 32-bit is not kept If the operand-size prefix (0x66) is used in 64-bit mode, the emulator would assume the destination operand is 64-bit, when it should be 32-bit. Reminder: movnti does not support 16-bit operands and its default operand size is 32-bit. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4cb0da660283..be3f7645e39d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4836,8 +4836,8 @@ twobyte_insn: break; case 0xc3: /* movnti */ ctxt->dst.bytes = ctxt->op_bytes; - ctxt->dst.val = (ctxt->op_bytes == 4) ? (u32) ctxt->src.val : - (u64) ctxt->src.val; + ctxt->dst.val = (ctxt->op_bytes == 8) ? (u64) ctxt->src.val : + (u32) ctxt->src.val; break; default: goto cannot_emulate; -- cgit v1.2.3 From 67f4d4288c353734d29c45f6725971c71af96791 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:09 +0300 Subject: KVM: x86: rdpmc emulation checks the counter incorrectly The rdpmc emulation checks that the counter (ECX) is not higher than 2, without taking into considerations bits 30:31 role (e.g., bit 30 marks whether the counter is fixed). The fix uses the pmu information for checking the validity of the pmu counter. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/emulate.c | 2 +- arch/x86/kvm/pmu.c | 9 +++++++++ arch/x86/kvm/x86.c | 7 +++++++ 5 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index a04fe4eb237d..ffa2671a7f2f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -194,6 +194,7 @@ struct x86_emulate_ops { int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + int (*check_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc); int (*read_pmc)(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata); void (*halt)(struct x86_emulate_ctxt *ctxt); void (*wbinvd)(struct x86_emulate_ctxt *ctxt); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49314155b66c..63e020be3da7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1070,6 +1070,7 @@ void kvm_pmu_cpuid_update(struct kvm_vcpu *vcpu); bool kvm_pmu_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); +int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc); int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); void kvm_handle_pmu_event(struct kvm_vcpu *vcpu); void kvm_deliver_pmi(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index be3f7645e39d..3da8d82acb31 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3507,7 +3507,7 @@ static int check_rdpmc(struct x86_emulate_ctxt *ctxt) u64 rcx = reg_read(ctxt, VCPU_REGS_RCX); if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || - (rcx > 3)) + ctxt->ops->check_pmc(ctxt, rcx)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index cbecaa90399c..3dd6accb64ec 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -428,6 +428,15 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; } +int kvm_pmu_check_pmc(struct kvm_vcpu *vcpu, unsigned pmc) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + bool fixed = pmc & (1u << 30); + pmc &= ~(3u << 30); + return (!fixed && pmc >= pmu->nr_arch_gp_counters) || + (fixed && pmc >= pmu->nr_arch_fixed_counters); +} + int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) { struct kvm_pmu *pmu = &vcpu->arch.pmu; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f32a02578c0d..451d6acea808 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4762,6 +4762,12 @@ static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, return kvm_set_msr(emul_to_vcpu(ctxt), &msr); } +static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, + u32 pmc) +{ + return kvm_pmu_check_pmc(emul_to_vcpu(ctxt), pmc); +} + static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc, u64 *pdata) { @@ -4838,6 +4844,7 @@ static const struct x86_emulate_ops emulate_ops = { .set_dr = emulator_set_dr, .set_msr = emulator_set_msr, .get_msr = emulator_get_msr, + .check_pmc = emulator_check_pmc, .read_pmc = emulator_read_pmc, .halt = emulator_halt, .wbinvd = emulator_wbinvd, -- cgit v1.2.3 From aaa05f2437b9450f30b301db962ec4d45ec90fbb Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:10 +0300 Subject: KVM: x86: Return error on cmpxchg16b emulation cmpxchg16b is currently unimplemented in the emulator. The least we can do is return error upon the emulation of this instruction. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3da8d82acb31..a151f8d24a1d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1999,6 +1999,9 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt) { u64 old = ctxt->dst.orig_val64; + if (ctxt->dst.bytes == 16) + return X86EMUL_UNHANDLEABLE; + if (((u32) (old >> 0) != (u32) reg_read(ctxt, VCPU_REGS_RAX)) || ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) { *reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0); @@ -4077,7 +4080,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->orig_val = op->val; break; case OpMem64: - ctxt->memop.bytes = 8; + ctxt->memop.bytes = (ctxt->op_bytes == 8) ? 16 : 8; goto mem_common; case OpAcc: op->type = OP_REG; -- cgit v1.2.3 From 32e94d0696c26c6ba4f3ff53e70f6e0e825979bc Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 2 Jun 2014 18:34:11 +0300 Subject: KVM: x86: smsw emulation is incorrect in 64-bit mode In 64-bit mode, when the destination is a register, the assignment is done according to the operand size. Otherwise (memory operand or no 64-bit mode), a 16-bit assignment is performed. Currently, 16-bit assignment is always done to the destination. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a151f8d24a1d..9b5d97db7631 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3235,7 +3235,8 @@ static int em_lidt(struct x86_emulate_ctxt *ctxt) static int em_smsw(struct x86_emulate_ctxt *ctxt) { - ctxt->dst.bytes = 2; + if (ctxt->dst.type == OP_MEM) + ctxt->dst.bytes = 2; ctxt->dst.val = ctxt->ops->get_cr(ctxt, 0); return X86EMUL_CONTINUE; } -- cgit v1.2.3 From a5457e7bcf9a76ec5c2de5d311d9b0d3b724edc6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 5 Jun 2014 17:29:34 +0200 Subject: KVM: emulate: POP SS triggers a MOV SS shadow too We did not do that when interruptibility was added to the emulator, because at the time pop to segment was not implemented. Now it is, add it. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 9b5d97db7631..bc670675223d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1762,6 +1762,9 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) if (rc != X86EMUL_CONTINUE) return rc; + if (ctxt->modrm_reg == VCPU_SREG_SS) + ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS; + rc = load_segment_descriptor(ctxt, (u16)selector, seg); return rc; } -- cgit v1.2.3 From 968889771749d8e730d794deed2bd2e363a98a54 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2014 14:54:19 +0200 Subject: KVM: emulate: simplify BitOp handling Memory is always the destination for BitOp instructions. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index bc670675223d..ea976e478dca 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4079,7 +4079,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, mem_common: *op = ctxt->memop; ctxt->memopp = op; - if ((ctxt->d & BitOp) && op == &ctxt->dst) + if (ctxt->d & BitOp) fetch_bit_operand(ctxt); op->orig_val = op->val; break; -- cgit v1.2.3 From bdc907222c5e4edd848da0c031deb55b59f1cf9a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 May 2014 14:03:29 +0200 Subject: KVM: emulate: fix harmless typo in MMX decoding It was using the wrong member of the union. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ea976e478dca..8a1796b1eef2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1093,7 +1093,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, if (ctxt->d & Mmx) { op->type = OP_MM; op->bytes = 8; - op->addr.xmm = ctxt->modrm_rm & 7; + op->addr.mm = ctxt->modrm_rm & 7; return rc; } fetch_register_operand(op); -- cgit v1.2.3 From bc39c4db7110f88f338cbbabe53d3e43c7400a59 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Sat, 14 Jun 2014 23:44:29 +0200 Subject: arch/x86/kvm/vmx.c: use PAGE_ALIGNED instead of IS_ALIGNED(PAGE_SIZE use mm.h definition Cc: Gleb Natapov Cc: Paolo Bonzini Signed-off-by: Fabian Frederick Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 801332edefc3..4f84be0d2d1e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5918,7 +5918,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, * which replaces physical address width with 32 * */ - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); skip_emulated_instruction(vcpu); return 1; @@ -5936,7 +5936,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, vmx->nested.vmxon_ptr = vmptr; break; case EXIT_REASON_VMCLEAR: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -5951,7 +5951,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, } break; case EXIT_REASON_VMPTRLD: - if (!IS_ALIGNED(vmptr, PAGE_SIZE) || (vmptr >> maxphyaddr)) { + if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); skip_emulated_instruction(vcpu); @@ -8113,14 +8113,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if ((vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_MSR_BITMAPS) && - !IS_ALIGNED(vmcs12->msr_bitmap, PAGE_SIZE)) { + !PAGE_ALIGNED(vmcs12->msr_bitmap)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) && - !IS_ALIGNED(vmcs12->apic_access_addr, PAGE_SIZE)) { + !PAGE_ALIGNED(vmcs12->apic_access_addr)) { /*TODO: Also verify bits beyond physical address width are 0*/ nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; -- cgit v1.2.3 From 7dec5603b6b8dc4c3e1c65d318bd2a5a8c62a424 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 15 Jun 2014 16:12:57 +0300 Subject: KVM: x86: bit-ops emulation ignores offset on 64-bit The current emulation of bit operations ignores the offset from the destination on 64-bit target memory operands. This patch fixes this behavior. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8a1796b1eef2..439a3286406a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1220,12 +1220,14 @@ static void fetch_bit_operand(struct x86_emulate_ctxt *ctxt) long sv = 0, mask; if (ctxt->dst.type == OP_MEM && ctxt->src.type == OP_REG) { - mask = ~(ctxt->dst.bytes * 8 - 1); + mask = ~((long)ctxt->dst.bytes * 8 - 1); if (ctxt->src.bytes == 2) sv = (s16)ctxt->src.val & (s16)mask; else if (ctxt->src.bytes == 4) sv = (s32)ctxt->src.val & (s32)mask; + else + sv = (s64)ctxt->src.val & (s64)mask; ctxt->dst.addr.mem.ea += (sv >> 3); } -- cgit v1.2.3 From ee212297cd425620867d4398d55d068c4203768c Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 15 Jun 2014 16:12:58 +0300 Subject: KVM: x86: Wrong emulation on 'xadd X, X' The emulator does not emulate the xadd instruction correctly if the two operands are the same. In this (unlikely) situation the result should be the sum of X and X (2X) when it is currently X. The solution is to first perform writeback to the source, before writing to the destination. The only instruction which should be affected is xadd, as the other instructions that perform writeback to the source use the extended accumlator (e.g., RAX:RDX). Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 439a3286406a..0d5ecbd45882 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4736,17 +4736,17 @@ special_insn: goto done; writeback: - if (!(ctxt->d & NoWrite)) { - rc = writeback(ctxt, &ctxt->dst); - if (rc != X86EMUL_CONTINUE) - goto done; - } if (ctxt->d & SrcWrite) { BUG_ON(ctxt->src.type == OP_MEM || ctxt->src.type == OP_MEM_STR); rc = writeback(ctxt, &ctxt->src); if (rc != X86EMUL_CONTINUE) goto done; } + if (!(ctxt->d & NoWrite)) { + rc = writeback(ctxt, &ctxt->dst); + if (rc != X86EMUL_CONTINUE) + goto done; + } /* * restore dst type in case the decoding will be reused -- cgit v1.2.3 From 9e8919ae793f4edfaa29694a70f71a515ae9942a Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 15 Jun 2014 16:12:59 +0300 Subject: KVM: x86: Inter-privilege level ret emulation is not implemeneted Return unhandlable error on inter-privilege level ret instruction. This is since the current emulation does not check the privilege level correctly when loading the CS, and does not pop RSP/SS as needed. Cc: stable@vger.kernel.org Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0d5ecbd45882..27ce38693f09 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2033,6 +2033,7 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) { int rc; unsigned long cs; + int cpl = ctxt->ops->cpl(ctxt); rc = emulate_pop(ctxt, &ctxt->_eip, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) @@ -2042,6 +2043,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) rc = emulate_pop(ctxt, &cs, ctxt->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; + /* Outer-privilege level return is not implemented */ + if (ctxt->mode >= X86EMUL_MODE_PROT16 && (cs & 3) > cpl) + return X86EMUL_UNHANDLEABLE; rc = load_segment_descriptor(ctxt, (u16)cs, VCPU_SREG_CS); return rc; } -- cgit v1.2.3 From 140bad89fd25db1aab60f80ed7874e9a9bdbae3b Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 15 Jun 2014 16:13:00 +0300 Subject: KVM: x86: emulation of dword cmov on long-mode should clear [63:32] Even if the condition of cmov is not satisfied, bits[63:32] should be cleared. This is clearly stated in Intel's CMOVcc documentation. The solution is to reassign the destination onto itself if the condition is unsatisfied. For that matter the original destination value needs to be read. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 27ce38693f09..6f09b2e555ef 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -3923,7 +3923,7 @@ static const struct opcode twobyte_table[256] = { N, N, N, N, N, N, N, N, N, N, /* 0x40 - 0x4F */ - X16(D(DstReg | SrcMem | ModRM | Mov)), + X16(D(DstReg | SrcMem | ModRM)), /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ @@ -4824,8 +4824,10 @@ twobyte_insn: ops->get_dr(ctxt, ctxt->modrm_reg, &ctxt->dst.val); break; case 0x40 ... 0x4f: /* cmov */ - ctxt->dst.val = ctxt->dst.orig_val = ctxt->src.val; - if (!test_cc(ctxt->b, ctxt->eflags)) + if (test_cc(ctxt->b, ctxt->eflags)) + ctxt->dst.val = ctxt->src.val; + else if (ctxt->mode != X86EMUL_MODE_PROT64 || + ctxt->op_bytes != 4) ctxt->dst.type = OP_NONE; /* no writeback */ break; case 0x80 ... 0x8f: /* jnz rel, etc*/ -- cgit v1.2.3 From a825f5cc4a8455663562809748240169cb9bc2c0 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 15 Jun 2014 16:13:01 +0300 Subject: KVM: x86: NOP emulation clears (incorrectly) the high 32-bits of RAX On long-mode the current NOP (0x90) emulation still writes back to RAX. As a result, EAX is zero-extended and the high 32-bits of RAX are cleared. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6f09b2e555ef..84dc4ba0364d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4688,8 +4688,9 @@ special_insn: break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (ctxt->dst.addr.reg == reg_rmw(ctxt, VCPU_REGS_RAX)) - break; - rc = em_xchg(ctxt); + ctxt->dst.type = OP_NONE; + else + rc = em_xchg(ctxt); break; case 0x98: /* cbw/cwde/cdqe */ switch (ctxt->op_bytes) { -- cgit v1.2.3 From e4aa5288ff07766d101751de9a8420d666c61735 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 12 Jun 2014 19:40:32 +0200 Subject: KVM: x86: Fix constant value of VM_{EXIT_SAVE,ENTRY_LOAD}_DEBUG_CONTROLS The spec says those controls are at bit position 2 - makes 4 as value. The impact of this mistake is effectively zero as we only use them to ensure that these features are set at position 2 (or, previously, 1) in MSR_IA32_VMX_{EXIT,ENTRY}_CTLS - which is and will be always true according to the spec. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 7004d21e6219..d989829d3304 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -76,7 +76,7 @@ #define PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x00000016 -#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002 +#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000004 #define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 #define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000 #define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 @@ -89,7 +89,7 @@ #define VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR 0x00036dff -#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002 +#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000004 #define VM_ENTRY_IA32E_MODE 0x00000200 #define VM_ENTRY_SMM 0x00000400 #define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 -- cgit v1.2.3 From 3dbcd8da7b564194f93271b003a1c46ef404cbdb Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 16 Jun 2014 13:59:40 +0200 Subject: KVM: nVMX: Advertise support for MSR_IA32_VMX_TRUE_*_CTLS We already implemented them but failed to advertise them. Currently they all return the identical values to the capability MSRs they are augmenting. So there is no change in exposed features yet. Drop related comments at this chance that are partially incorrect and redundant anyway. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/msr-index.h | 1 + arch/x86/kvm/vmx.c | 13 ++----------- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index fcf2b3ae1bf0..eaefcc66c855 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -558,6 +558,7 @@ /* VMX_BASIC bits and bitmasks */ #define VMX_BASIC_VMCS_SIZE_SHIFT 32 +#define VMX_BASIC_TRUE_CTLS (1ULL << 55) #define VMX_BASIC_64 0x0001000000000000LLU #define VMX_BASIC_MEM_TYPE_SHIFT 50 #define VMX_BASIC_MEM_TYPE_MASK 0x003c000000000000LLU diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4f84be0d2d1e..31379faf952e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2265,21 +2265,13 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* pin-based controls */ rdmsr(MSR_IA32_VMX_PINBASED_CTLS, nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high); - /* - * According to the Intel spec, if bit 55 of VMX_BASIC is off (as it is - * in our case), bits 1, 2 and 4 (i.e., 0x16) must be 1 in this MSR. - */ nested_vmx_pinbased_ctls_low |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_pinbased_ctls_high &= PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | PIN_BASED_VIRTUAL_NMIS; nested_vmx_pinbased_ctls_high |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR | PIN_BASED_VMX_PREEMPTION_TIMER; - /* - * Exit controls - * If bit 55 of VMX_BASIC is off, bits 0-8 and 10, 11, 13, 14, 16 and - * 17 must be 1. - */ + /* exit controls */ rdmsr(MSR_IA32_VMX_EXIT_CTLS, nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); nested_vmx_exit_ctls_low = VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; @@ -2299,7 +2291,6 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); - /* If bit 55 of VMX_BASIC is off, bits 0-8 and 12 must be 1. */ nested_vmx_entry_ctls_low = VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_entry_ctls_high &= #ifdef CONFIG_X86_64 @@ -2394,7 +2385,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) * guest, and the VMCS structure we give it - not about the * VMX support of the underlying hardware. */ - *pdata = VMCS12_REVISION | + *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); break; -- cgit v1.2.3 From 3dcdf3ec6e48d918741ea11349d4436d0c5aac93 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 16 Jun 2014 13:59:41 +0200 Subject: KVM: nVMX: Allow to disable CR3 access interception We already have this control enabled by exposing a broken MSR_IA32_VMX_PROCBASED_CTLS value. This will properly advertise our capability once the value is fixed by clearing the right bits in MSR_IA32_VMX_TRUE_PROCBASED_CTLS. We also have to ensure to test the right value on L2 entry. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 31379faf952e..e55e404b5dba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2239,6 +2239,7 @@ static inline bool nested_vmx_allowed(struct kvm_vcpu *vcpu) * or other means. */ static u32 nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high; +static u32 nested_vmx_true_procbased_ctls_low; static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; @@ -2328,6 +2329,10 @@ static __init void nested_vmx_setup_ctls_msrs(void) */ nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; + /* We support free control of CR3 access interception. */ + nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & + ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); + /* secondary cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high); @@ -2395,6 +2400,9 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_pinbased_ctls_high); break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_procbased_ctls_low, + nested_vmx_procbased_ctls_high); + break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr(nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); @@ -8127,7 +8135,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high) || + nested_vmx_true_procbased_ctls_low, + nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, -- cgit v1.2.3 From 560b7ee12ca5e1ebc1675d7eb4008bb22708277a Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 16 Jun 2014 13:59:42 +0200 Subject: KVM: nVMX: Fix returned value of MSR_IA32_VMX_PROCBASED_CTLS SDM says bits 1, 4-6, 8, 13-16, and 26 have to be set. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/vmx.h | 3 +++ arch/x86/kvm/vmx.c | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index d989829d3304..bcbfade26d8d 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -51,6 +51,9 @@ #define CPU_BASED_MONITOR_EXITING 0x20000000 #define CPU_BASED_PAUSE_EXITING 0x40000000 #define CPU_BASED_ACTIVATE_SECONDARY_CONTROLS 0x80000000 + +#define CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR 0x0401e172 + /* * Definitions of Secondary Processor-Based VM-Execution Controls. */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e55e404b5dba..66365a009cff 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2306,7 +2306,7 @@ static __init void nested_vmx_setup_ctls_msrs(void) /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); - nested_vmx_procbased_ctls_low = 0; + nested_vmx_procbased_ctls_low = CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; nested_vmx_procbased_ctls_high &= CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING | @@ -2327,7 +2327,8 @@ static __init void nested_vmx_setup_ctls_msrs(void) * can use it to avoid exits to L1 - even when L0 runs L2 * without MSR bitmaps. */ - nested_vmx_procbased_ctls_high |= CPU_BASED_USE_MSR_BITMAPS; + nested_vmx_procbased_ctls_high |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR | + CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. */ nested_vmx_true_procbased_ctls_low = nested_vmx_procbased_ctls_low & -- cgit v1.2.3 From 2996fca0690f03a5220203588f4a0d8c5acba2b0 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 16 Jun 2014 13:59:43 +0200 Subject: KVM: nVMX: Allow to disable VM_{ENTRY_LOAD,EXIT_SAVE}_DEBUG_CONTROLS Allow L1 to "leak" its debug controls into L2, i.e. permit cleared VM_{ENTRY_LOAD,EXIT_SAVE}_DEBUG_CONTROLS. This requires to manually transfer the state of DR7 and IA32_DEBUGCTLMSR from L1 into L2 as both run on different VMCS. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 44 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 66365a009cff..b93e2ae2bb62 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -383,6 +383,9 @@ struct nested_vmx { struct hrtimer preemption_timer; bool preemption_timer_expired; + + /* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */ + u64 vmcs01_debugctl; }; #define POSTED_INTR_ON 0 @@ -2243,7 +2246,9 @@ static u32 nested_vmx_true_procbased_ctls_low; static u32 nested_vmx_secondary_ctls_low, nested_vmx_secondary_ctls_high; static u32 nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high; static u32 nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high; +static u32 nested_vmx_true_exit_ctls_low; static u32 nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high; +static u32 nested_vmx_true_entry_ctls_low; static u32 nested_vmx_misc_low, nested_vmx_misc_high; static u32 nested_vmx_ept_caps; static __init void nested_vmx_setup_ctls_msrs(void) @@ -2289,6 +2294,10 @@ static __init void nested_vmx_setup_ctls_msrs(void) if (vmx_mpx_supported()) nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; + /* We support free control of debug control saving. */ + nested_vmx_true_exit_ctls_low = nested_vmx_exit_ctls_low & + ~VM_EXIT_SAVE_DEBUG_CONTROLS; + /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); @@ -2303,6 +2312,10 @@ static __init void nested_vmx_setup_ctls_msrs(void) if (vmx_mpx_supported()) nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; + /* We support free control of debug control loading. */ + nested_vmx_true_entry_ctls_low = nested_vmx_entry_ctls_low & + ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, nested_vmx_procbased_ctls_low, nested_vmx_procbased_ctls_high); @@ -2409,11 +2422,17 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) nested_vmx_procbased_ctls_high); break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_exit_ctls_low, + nested_vmx_exit_ctls_high); + break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr(nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high); break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + *pdata = vmx_control_msr(nested_vmx_true_entry_ctls_low, + nested_vmx_entry_ctls_high); + break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr(nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high); @@ -7836,7 +7855,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base); vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base); - vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) { + kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl); + } else { + kvm_set_dr(vcpu, 7, vcpu->arch.dr7); + vmcs_write64(GUEST_IA32_DEBUGCTL, vmx->nested.vmcs01_debugctl); + } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, vmcs12->vm_entry_intr_info_field); vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, @@ -7846,7 +7871,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, vmcs12->guest_interruptibility_info); vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs); - kvm_set_dr(vcpu, 7, vmcs12->guest_dr7); vmx_set_rflags(vcpu, vmcs12->guest_rflags); vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, vmcs12->guest_pending_dbg_exceptions); @@ -8143,9 +8167,11 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) !vmx_control_verify(vmcs12->pin_based_vm_exec_control, nested_vmx_pinbased_ctls_low, nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - nested_vmx_exit_ctls_low, nested_vmx_exit_ctls_high) || + nested_vmx_true_exit_ctls_low, + nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - nested_vmx_entry_ctls_low, nested_vmx_entry_ctls_high)) + nested_vmx_true_entry_ctls_low, + nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; @@ -8222,6 +8248,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET); + if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) + vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + cpu = get_cpu(); vmx->loaded_vmcs = vmcs02; vmx_vcpu_put(vcpu); @@ -8399,7 +8428,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, vmcs12->guest_cr0 = vmcs12_guest_cr0(vcpu, vmcs12); vmcs12->guest_cr4 = vmcs12_guest_cr4(vcpu, vmcs12); - kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); vmcs12->guest_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); vmcs12->guest_rip = kvm_register_read(vcpu, VCPU_REGS_RIP); vmcs12->guest_rflags = vmcs_readl(GUEST_RFLAGS); @@ -8478,9 +8506,13 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, (vmcs12->vm_entry_controls & ~VM_ENTRY_IA32E_MODE) | (vm_entry_controls_get(to_vmx(vcpu)) & VM_ENTRY_IA32E_MODE); + if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_DEBUG_CONTROLS) { + kvm_get_dr(vcpu, 7, (unsigned long *)&vmcs12->guest_dr7); + vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); + } + /* TODO: These cannot have changed unless we have MSR bitmaps and * the relevant bit asks not to trap the change */ - vmcs12->guest_ia32_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT) vmcs12->guest_ia32_pat = vmcs_read64(GUEST_IA32_PAT); if (vmcs12->vm_exit_controls & VM_EXIT_SAVE_IA32_EFER) -- cgit v1.2.3 From 5381417f6a51293e7b8af1eb18aefa5d47976a71 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 16 Jun 2014 13:59:44 +0200 Subject: KVM: nVMX: Fix returned value of MSR_IA32_VMX_VMCS_ENUM Many real CPUs get this wrong as well, but ours is totally off: bits 9:1 define the highest index value. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b93e2ae2bb62..a717c13b9466 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2461,7 +2461,7 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) *pdata = -1ULL; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x1f; + *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr(nested_vmx_secondary_ctls_low, -- cgit v1.2.3 From 5777392e83c96e3a0799dd2985598e0fc76cf4aa Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 18 Jun 2014 17:19:23 +0300 Subject: KVM: x86: check DR6/7 high-bits are clear only on long-mode When the guest sets DR6 and DR7, KVM asserts the high 32-bits are clear, and otherwise injects a #GP exception. This exception should only be injected only if running in long-mode. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.h | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a717c13b9466..b362a1a38e51 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5184,7 +5184,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 1; kvm_register_write(vcpu, reg, val); } else - if (kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg))) + if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; skip_emulated_instruction(vcpu); diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 8c97bac9a895..c5b61a7eb144 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -47,6 +47,16 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu) #endif } +static inline bool is_64_bit_mode(struct kvm_vcpu *vcpu) +{ + int cs_db, cs_l; + + if (!is_long_mode(vcpu)) + return false; + kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); + return cs_l; +} + static inline bool mmu_is_nested(struct kvm_vcpu *vcpu) { return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu; @@ -108,6 +118,14 @@ static inline bool vcpu_match_mmio_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) return false; } +static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + unsigned long val = kvm_register_read(vcpu, reg); + + return is_64_bit_mode(vcpu) ? val : (u32)val; +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -- cgit v1.2.3 From a449c7aa51e10c9bde0ea9bee4e682d6d067ebab Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 18 Jun 2014 17:19:24 +0300 Subject: KVM: x86: Hypercall handling does not considers opsize correctly Currently, the hypercall handling routine only considers LME as an indication to whether the guest uses 32/64-bit mode. This is incosistent with hyperv hypercalls handling and against the common sense of considering cs.l as well. This patch uses is_64_bit_mode instead of is_long_mode for that matter. In addition, the result is masked in respect to the guest execution mode. Last, it changes kvm_hv_hypercall to use is_64_bit_mode as well to simplify the code. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 451d6acea808..874607ae0583 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5669,7 +5669,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) u64 param, ingpa, outgpa, ret; uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0; bool fast, longmode; - int cs_db, cs_l; /* * hypercall generates UD from non zero cpl and real mode @@ -5680,8 +5679,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 0; } - kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - longmode = is_long_mode(vcpu) && cs_l == 1; + longmode = is_64_bit_mode(vcpu); if (!longmode) { param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) | @@ -5746,7 +5744,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int r = 1; + int op_64_bit, r = 1; if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -5759,7 +5757,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) trace_kvm_hypercall(nr, a0, a1, a2, a3); - if (!is_long_mode(vcpu)) { + op_64_bit = is_64_bit_mode(vcpu); + if (!op_64_bit) { nr &= 0xFFFFFFFF; a0 &= 0xFFFFFFFF; a1 &= 0xFFFFFFFF; @@ -5785,6 +5784,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) break; } out: + if (!op_64_bit) + ret = (u32)ret; kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; -- cgit v1.2.3 From 1e32c07955b43e7f827174bf320ed35971117275 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 18 Jun 2014 17:19:25 +0300 Subject: KVM: vmx: handle_cr ignores 32/64-bit mode On 32-bit mode only bits [31:0] of the CR should be used for setting the CR value. Otherwise, the host may incorrectly assume the value is invalid if bits [63:32] are not zero. Moreover, the CR is currently being read twice when CR8 is used. Last, nested mov-cr exiting is modified to handle the CR value correctly as well. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b362a1a38e51..45024bf0e229 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5058,7 +5058,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - val = kvm_register_read(vcpu, reg); + val = kvm_register_readl(vcpu, reg); trace_kvm_cr_write(cr, val); switch (cr) { case 0: @@ -5075,7 +5075,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return 1; case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); - u8 cr8 = kvm_register_read(vcpu, reg); + u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); kvm_complete_insn_gp(vcpu, err); if (irqchip_in_kernel(vcpu->kvm)) @@ -6770,7 +6770,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu, unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); int cr = exit_qualification & 15; int reg = (exit_qualification >> 8) & 15; - unsigned long val = kvm_register_read(vcpu, reg); + unsigned long val = kvm_register_readl(vcpu, reg); switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ -- cgit v1.2.3 From 27e6fb5dae2819d17f38dc9224692b771e989981 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 18 Jun 2014 17:19:26 +0300 Subject: KVM: vmx: vmx instructions handling does not consider cs.l VMX instructions use 32-bit operands in 32-bit mode, and 64-bit operands in 64-bit mode. The current implementation is broken since it does not use the register operands correctly, and always uses 64-bit for reads and writes. Moreover, write to memory in vmwrite only considers long-mode, so it ignores cs.l. This patch fixes this behavior. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 12 ++++++------ arch/x86/kvm/x86.h | 9 +++++++++ 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 45024bf0e229..8748c2e19ed6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6403,7 +6403,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) return 1; /* Decode instruction info and find the field to read */ - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (!vmcs12_read_any(vcpu, field, &field_value)) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); @@ -6416,7 +6416,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) * on the guest's mode (32 or 64 bit), not on the given field's length. */ if (vmx_instruction_info & (1u << 10)) { - kvm_register_write(vcpu, (((vmx_instruction_info) >> 3) & 0xf), + kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf), field_value); } else { if (get_vmx_mem_address(vcpu, exit_qualification, @@ -6453,21 +6453,21 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) return 1; if (vmx_instruction_info & (1u << 10)) - field_value = kvm_register_read(vcpu, + field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); else { if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, &gva)) return 1; if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, - &field_value, (is_long_mode(vcpu) ? 8 : 4), &e)) { + &field_value, (is_64_bit_mode(vcpu) ? 8 : 4), &e)) { kvm_inject_page_fault(vcpu, &e); return 1; } } - field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); + field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); @@ -6590,7 +6590,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); types = (nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c5b61a7eb144..306a1b77581f 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -126,6 +126,15 @@ static inline unsigned long kvm_register_readl(struct kvm_vcpu *vcpu, return is_64_bit_mode(vcpu) ? val : (u32)val; } +static inline void kvm_register_writel(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + if (!is_64_bit_mode(vcpu)) + val = (u32)val; + return kvm_register_write(vcpu, reg, val); +} + void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -- cgit v1.2.3 From 9f6226a762c7ae02f6a23a3d4fc552dafa57ea23 Mon Sep 17 00:00:00 2001 From: Rickard Strandqvist Date: Wed, 25 Jun 2014 14:25:58 +0200 Subject: arch: x86: kvm: x86.c: Cleaning up variable is set more than once A struct member variable is set to the same value more than once This was found using a static code analysis program called cppcheck. Signed-off-by: Rickard Strandqvist Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 874607ae0583..5a8691b0ed76 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4730,7 +4730,6 @@ static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, if (desc->g) var.limit = (var.limit << 12) | 0xfff; var.type = desc->type; - var.present = desc->p; var.dpl = desc->dpl; var.db = desc->d; var.s = desc->s; -- cgit v1.2.3 From 62baf44cad3bc6b37115cc21e4228fe53d4f3474 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 29 Jun 2014 21:55:53 +0200 Subject: KVM: nSVM: Do not report CLTS via SVM_EXIT_WRITE_CR0 to L1 CLTS only changes TS which is not monitored by selected CR0 interception. So skip any attempt to translate WRITE_CR0 to CR0_SEL_WRITE for this instruction. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b5e994ad0135..c79766e1f1e0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4205,7 +4205,8 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (info->intercept == x86_intercept_cr_write) icpt_info.exit_code += info->modrm_reg; - if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || + info->intercept == x86_intercept_clts) break; intercept = svm->nested.intercept; -- cgit v1.2.3 From 9bf418335e24da995ea682a028926d7e1036be6f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 30 Jun 2014 10:54:17 +0200 Subject: KVM: nSVM: Fix IOIO bitmap evaluation First, kvm_read_guest returns 0 on success. And then we need to take the access size into account when testing the bitmap: intercept if any of bits corresponding to the access is set. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c79766e1f1e0..3483ac978c76 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2116,22 +2116,27 @@ static void nested_svm_unmap(struct page *page) static int nested_svm_intercept_ioio(struct vcpu_svm *svm) { - unsigned port; - u8 val, bit; + unsigned port, size, iopm_len; + u16 val, mask; + u8 start_bit; u64 gpa; if (!(svm->nested.intercept & (1ULL << INTERCEPT_IOIO_PROT))) return NESTED_EXIT_HOST; port = svm->vmcb->control.exit_info_1 >> 16; + size = (svm->vmcb->control.exit_info_1 & SVM_IOIO_SIZE_MASK) >> + SVM_IOIO_SIZE_SHIFT; gpa = svm->nested.vmcb_iopm + (port / 8); - bit = port % 8; - val = 0; + start_bit = port % 8; + iopm_len = (start_bit + size > 8) ? 2 : 1; + mask = (0xf >> (4 - size)) << start_bit; + val = 0; - if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, 1)) - val &= (1 << bit); + if (kvm_read_guest(svm->vcpu.kvm, gpa, &val, iopm_len)) + return NESTED_EXIT_DONE; - return val ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; + return (val & mask) ? NESTED_EXIT_DONE : NESTED_EXIT_HOST; } static int nested_svm_exit_handled_msr(struct vcpu_svm *svm) -- cgit v1.2.3 From 6493f1574e898b46370e2b2315836d76a1980f2c Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 30 Jun 2014 11:07:05 +0200 Subject: KVM: nSVM: Fix IOIO size reported on emulation The access size of an in/ins is reported in dst_bytes, and that of out/outs in src_bytes. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3483ac978c76..1824949821f9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4261,9 +4261,9 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { exit_info |= SVM_IOIO_TYPE_MASK; - bytes = info->src_bytes; - } else { bytes = info->dst_bytes; + } else { + bytes = info->src_bytes; } if (info->intercept == x86_intercept_outs || -- cgit v1.2.3 From 6cbc5f5a80a9ae5a80bc81efc574b5a85bfd4a84 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 30 Jun 2014 12:52:55 +0200 Subject: KVM: nSVM: Set correct port for IOIO interception evaluation Obtaining the port number from DX is bogus as a) there are immediate port accesses and b) user space may have changed the register content while processing the PIO access. Forward the correct value from the instruction emulator instead. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/svm.c | 6 +++--- 3 files changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index ffa2671a7f2f..0e0151c13b2c 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -37,6 +37,7 @@ struct x86_instruction_info { u8 modrm_reg; /* index of register used */ u8 modrm_rm; /* rm part of modrm */ u64 src_val; /* value of source operand */ + u64 dst_val; /* value of destination operand */ u8 src_bytes; /* size of source operand */ u8 dst_bytes; /* size of destination operand */ u8 ad_bytes; /* size of src/dst address */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 84dc4ba0364d..15453e569f3d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -426,6 +426,7 @@ static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, .modrm_reg = ctxt->modrm_reg, .modrm_rm = ctxt->modrm_rm, .src_val = ctxt->src.val64, + .dst_val = ctxt->dst.val64, .src_bytes = ctxt->src.bytes, .dst_bytes = ctxt->dst.bytes, .ad_bytes = ctxt->ad_bytes, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1824949821f9..85d4458a0b35 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4256,13 +4256,13 @@ static int svm_check_intercept(struct kvm_vcpu *vcpu, u64 exit_info; u32 bytes; - exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; - if (info->intercept == x86_intercept_in || info->intercept == x86_intercept_ins) { - exit_info |= SVM_IOIO_TYPE_MASK; + exit_info = ((info->src_val & 0xffff) << 16) | + SVM_IOIO_TYPE_MASK; bytes = info->dst_bytes; } else { + exit_info = (info->dst_val & 0xffff) << 16; bytes = info->src_bytes; } -- cgit v1.2.3 From 0d3da0d26e3c3515997c99451ce3b0ad1a69a36c Mon Sep 17 00:00:00 2001 From: Tomasz Grabiec Date: Tue, 24 Jun 2014 09:42:43 +0200 Subject: KVM: x86: fix TSC matching I've observed kvmclock being marked as unstable on a modern single-socket system with a stable TSC and qemu-1.6.2 or qemu-2.0.0. The culprit was failure in TSC matching because of overflow of kvm_arch::nr_vcpus_matched_tsc in case there were multiple TSC writes in a single synchronization cycle. Turns out that qemu does multiple TSC writes during init, below is the evidence of that (qemu-2.0.0): The first one: 0xffffffffa08ff2b4 : vmx_write_tsc_offset+0xa4/0xb0 [kvm_intel] 0xffffffffa04c9c05 : kvm_write_tsc+0x1a5/0x360 [kvm] 0xffffffffa04cfd6b : kvm_arch_vcpu_postcreate+0x4b/0x80 [kvm] 0xffffffffa04b8188 : kvm_vm_ioctl+0x418/0x750 [kvm] The second one: 0xffffffffa08ff2b4 : vmx_write_tsc_offset+0xa4/0xb0 [kvm_intel] 0xffffffffa04c9c05 : kvm_write_tsc+0x1a5/0x360 [kvm] 0xffffffffa090610d : vmx_set_msr+0x29d/0x350 [kvm_intel] 0xffffffffa04be83b : do_set_msr+0x3b/0x60 [kvm] 0xffffffffa04c10a8 : msr_io+0xc8/0x160 [kvm] 0xffffffffa04caeb6 : kvm_arch_vcpu_ioctl+0xc86/0x1060 [kvm] 0xffffffffa04b6797 : kvm_vcpu_ioctl+0xc7/0x5a0 [kvm] #0 kvm_vcpu_ioctl at /build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1780 #1 kvm_put_msrs at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1270 #2 kvm_arch_put_registers at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1909 #3 kvm_cpu_synchronize_post_init at /build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1641 #4 cpu_synchronize_post_init at /build/buildd/qemu-2.0.0+dfsg/include/sysemu/kvm.h:330 #5 cpu_synchronize_all_post_init () at /build/buildd/qemu-2.0.0+dfsg/cpus.c:521 #6 main at /build/buildd/qemu-2.0.0+dfsg/vl.c:4390 The third one: 0xffffffffa08ff2b4 : vmx_write_tsc_offset+0xa4/0xb0 [kvm_intel] 0xffffffffa04c9c05 : kvm_write_tsc+0x1a5/0x360 [kvm] 0xffffffffa090610d : vmx_set_msr+0x29d/0x350 [kvm_intel] 0xffffffffa04be83b : do_set_msr+0x3b/0x60 [kvm] 0xffffffffa04c10a8 : msr_io+0xc8/0x160 [kvm] 0xffffffffa04caeb6 : kvm_arch_vcpu_ioctl+0xc86/0x1060 [kvm] 0xffffffffa04b6797 : kvm_vcpu_ioctl+0xc7/0x5a0 [kvm] #0 kvm_vcpu_ioctl at /build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1780 #1 kvm_put_msrs at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1270 #2 kvm_arch_put_registers at /build/buildd/qemu-2.0.0+dfsg/target-i386/kvm.c:1909 #3 kvm_cpu_synchronize_post_reset at /build/buildd/qemu-2.0.0+dfsg/kvm-all.c:1635 #4 cpu_synchronize_post_reset at /build/buildd/qemu-2.0.0+dfsg/include/sysemu/kvm.h:323 #5 cpu_synchronize_all_post_reset () at /build/buildd/qemu-2.0.0+dfsg/cpus.c:512 #6 main at /build/buildd/qemu-2.0.0+dfsg/vl.c:4482 The fix is to count each vCPU only once when matched, so that nr_vcpus_matched_tsc holds the size of the matched set. This is achieved by reusing generation counters. Every vCPU with this_tsc_generation == cur_tsc_generation is in the matched set. The match set is cleared by setting cur_tsc_generation to a value which no other vCPU is set to (by incrementing it). I needed to bump up the counter size form u8 to u64 to ensure it never overflows. Otherwise in cases TSC is not written the same number of times on each vCPU the counter could overflow and incorrectly indicate some vCPUs as being in the matched set. This scenario seems unlikely but I'm not sure if it can be disregarded. Signed-off-by: Tomasz Grabiec Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 ++-- arch/x86/kvm/x86.c | 11 +++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 63e020be3da7..af36f89fe67a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -448,7 +448,7 @@ struct kvm_vcpu_arch { u64 tsc_offset_adjustment; u64 this_tsc_nsec; u64 this_tsc_write; - u8 this_tsc_generation; + u64 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; @@ -591,7 +591,7 @@ struct kvm_arch { u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; - u8 cur_tsc_generation; + u64 cur_tsc_generation; int nr_vcpus_matched_tsc; spinlock_t pvclock_gtod_sync_lock; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5a8691b0ed76..f056f855f8e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1215,6 +1215,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) unsigned long flags; s64 usdiff; bool matched; + bool already_matched; u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); @@ -1279,6 +1280,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; + already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); } else { /* * We split periods of matched TSC writes into generations. @@ -1294,7 +1296,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %u, clock %llu\n", + pr_debug("kvm: new tsc generation %llu, clock %llu\n", kvm->arch.cur_tsc_generation, data); } @@ -1319,10 +1321,11 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); spin_lock(&kvm->arch.pvclock_gtod_sync_lock); - if (matched) - kvm->arch.nr_vcpus_matched_tsc++; - else + if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; + } else if (!already_matched) { + kvm->arch.nr_vcpus_matched_tsc++; + } kvm_track_tsc_matching(vcpu); spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); -- cgit v1.2.3 From 98eff52ab5c0ff5cb96940a93e99a1aeb2f11c89 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Sun, 29 Jun 2014 12:28:51 +0300 Subject: KVM: x86: Fix lapic.c debug prints In two cases lapic.c does not use the apic_debug macro correctly. This patch fixes them. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 006911858174..3855103f71fd 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1451,7 +1451,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr=" + apic_debug("%s: vcpu=%p, id=%d, base_msr=" "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, vcpu, kvm_apic_id(apic), vcpu->arch.apic_base, apic->base_address); @@ -1895,7 +1895,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - pr_debug("vcpu %d received sipi with vector # %x\n", + apic_debug("vcpu %d received sipi with vector # %x\n", vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -- cgit v1.2.3 From 80112c89ed872c725e7dc39ccf6c37d1a585e161 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 8 Jul 2014 09:47:41 +0530 Subject: KVM: Synthesize G bit for all segments. We have noticed that qemu-kvm hangs early in the BIOS when runnning nested under some versions of VMware ESXi. The problem we believe is because KVM assumes that the platform preserves the 'G' but for any segment register. The SVM specification itemizes the segment attribute bits that are observed by the CPU, but the (G)ranularity bit is not one of the bits itemized, for any segment. Though current AMD CPUs keep track of the (G)ranularity bit for all segment registers other than CS, the specification does not require it. VMware's virtual CPU may not track the (G)ranularity bit for any segment register. Since kvm already synthesizes the (G)ranularity bit for the CS segment. It should do so for all segments. The patch below does that, and helps get rid of the hangs. Patch applies on top of Linus' tree. Signed-off-by: Jim Mattson Signed-off-by: Alok N Kataria Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 85d4458a0b35..4925a94a970a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1415,7 +1415,16 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; - var->g = (s->attrib >> SVM_SELECTOR_G_SHIFT) & 1; + + /* + * AMD CPUs circa 2014 track the G bit for all segments except CS. + * However, the SVM spec states that the G bit is not observed by the + * CPU, and some VMware virtual CPUs drop the G bit for all segments. + * So let's synthesize a legal G bit for all segments, this helps + * running KVM nested. It also helps cross-vendor migration, because + * Intel's vmentry has a check on the 'G' bit. + */ + var->g = s->limit > 0xfffff; /* * AMD's VMCB does not have an explicit unusable field, so emulate it @@ -1424,14 +1433,6 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, var->unusable = !var->present || (var->type == 0); switch (seg) { - case VCPU_SREG_CS: - /* - * SVM always stores 0 for the 'G' bit in the CS selector in - * the VMCB on a VMEXIT. This hurts cross-vendor migration: - * Intel's VMENTRY has a check on the 'G' bit. - */ - var->g = s->limit > 0xfffff; - break; case VCPU_SREG_TR: /* * Work around a bug where the busy flag in the tr selector -- cgit v1.2.3 From 5f7552d4a56c21a882c9854ac63c6eb73ca7d7c8 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 30 Jun 2014 12:03:02 +0300 Subject: KVM: x86: Pending interrupt may be delivered after INIT We encountered a scenario in which after an INIT is delivered, a pending interrupt is delivered, although it was sent before the INIT. As the SDM states in section 10.4.7.1, the ISR and the IRR should be cleared after INIT as KVM does. This also means that pending interrupts should be cleared. This patch clears upon reset (and INIT) the pending interrupts; and at the same occassion clears the pending exceptions, since they may cause a similar issue. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f056f855f8e6..b13f3a39ebec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6845,6 +6845,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) atomic_set(&vcpu->arch.nmi_queued, 0); vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; + kvm_clear_interrupt_queue(vcpu); + kvm_clear_exception_queue(vcpu); memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); vcpu->arch.dr6 = DR6_FIXED_1; -- cgit v1.2.3 From 22d48b2d2aa0b078816eaa1e15e485811a2d03fa Mon Sep 17 00:00:00 2001 From: Matthias Lange Date: Thu, 26 Jun 2014 13:50:15 +0200 Subject: KVM: svm: writes to MSR_K7_HWCR generates GPE in guest Since commit 575203 the MCE subsystem in the Linux kernel for AMD sets bit 18 in MSR_K7_HWCR. Running such a kernel as a guest in KVM on an AMD host results in a GPE injected into the guest because kvm_set_msr_common returns 1. This patch fixes this by masking bit 18 from the MSR value desired by the guest. Signed-off-by: Matthias Lange Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b13f3a39ebec..7553530e3502 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2035,6 +2035,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ data &= ~(u64)0x8; /* ignore TLB cache disable */ + data &= ~(u64)0x40000; /* ignore Mc status write enable */ if (data != 0) { vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); -- cgit v1.2.3 From 98eb2f8b145cee711984d42eff5d6f19b6b1df69 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Mar 2014 09:51:52 +0100 Subject: KVM: vmx: speed up emulation of invalid guest state About 25% of the time spent in emulation of invalid guest state is wasted in checking whether emulation is required for the next instruction. However, this almost never changes except when a segment register (or TR or LDTR) changes, or when there is a mode transition (i.e. CR0 changes). In fact, vmx_set_segment and vmx_set_cr0 already modify vmx->emulation_required (except that the former for some reason uses |= instead of just an assignment). So there is no need to call guest_state_valid in the emulation loop. Emulation performance test results indicate 1650-2600 cycles for common instructions, versus 2300-3200 before this patch on a Sandy Bridge Xeon. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8748c2e19ed6..5c7bbde25bbf 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3672,7 +3672,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); out: - vmx->emulation_required |= emulation_required(vcpu); + vmx->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -5640,7 +5640,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING; - while (!guest_state_valid(vcpu) && count-- != 0) { + while (vmx->emulation_required && count-- != 0) { if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); @@ -5674,7 +5674,6 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) schedule(); } - vmx->emulation_required = emulation_required(vcpu); out: return ret; } -- cgit v1.2.3 From 37ccdcbe0757196ec98c0dcf9754bec8423807a5 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 20 May 2014 14:29:47 +0200 Subject: KVM: x86: return all bits from get_interrupt_shadow For the next patch we will need to know the full state of the interrupt shadow; we will then set KVM_REQ_EVENT when one bit is cleared. However, right now get_interrupt_shadow only returns the one corresponding to the emulated instruction, or an unconditional 0 if the emulated instruction does not have an interrupt shadow. This is confusing and does not allow us to check for cleared bits as mentioned above. Clean the callback up, and modify toggle_interruptibility to match the comment above the call. As a small result, the call to set_interrupt_shadow will be skipped in the common case where int_shadow == 0 && mask == 0. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/svm.c | 6 +++--- arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 10 +++++----- 4 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af36f89fe67a..b8a4480176b9 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -717,7 +717,7 @@ struct kvm_x86_ops { int (*handle_exit)(struct kvm_vcpu *vcpu); void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); - u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); + u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); void (*set_irq)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4925a94a970a..ddf742768ecf 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -486,14 +486,14 @@ static int is_external_interrupt(u32 info) return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR); } -static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); u32 ret = 0; if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) - ret |= KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS; + return ret; } static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c7bbde25bbf..0c9569b994f9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1943,7 +1943,7 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) vmcs_writel(GUEST_RFLAGS, rflags); } -static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) +static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu) { u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); int ret = 0; @@ -1953,7 +1953,7 @@ static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) if (interruptibility & GUEST_INTR_STATE_MOV_SS) ret |= KVM_X86_SHADOW_INT_MOV_SS; - return ret & mask; + return ret; } static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7553530e3502..a56126e6bd75 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2978,9 +2978,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft; events->interrupt.nr = vcpu->arch.interrupt.nr; events->interrupt.soft = 0; - events->interrupt.shadow = - kvm_x86_ops->get_interrupt_shadow(vcpu, - KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); + events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); events->nmi.injected = vcpu->arch.nmi_injected; events->nmi.pending = vcpu->arch.nmi_pending != 0; @@ -4860,7 +4858,7 @@ static const struct x86_emulate_ops emulate_ops = { static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) { - u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask); + u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu); /* * an sti; sti; sequence only disable interrupts for the first * instruction. So, if the last instruction, be it emulated or @@ -4868,7 +4866,9 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) * means that the last instruction is an sti. We should not * leave the flag on in this case. The same goes for mov ss */ - if (!(int_shadow & mask)) + if (int_shadow & mask) + mask = 0; + if (unlikely(int_shadow || mask)) kvm_x86_ops->set_interrupt_shadow(vcpu, mask); } -- cgit v1.2.3 From 6addfc42992be4b073c39137ecfdf4b2aa2d487f Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Mar 2014 11:29:28 +0100 Subject: KVM: x86: avoid useless set of KVM_REQ_EVENT after emulation Despite the provisions to emulate up to 130 consecutive instructions, in practice KVM will emulate just one before exiting handle_invalid_guest_state, because x86_emulate_instruction always sets KVM_REQ_EVENT. However, we only need to do this if an interrupt could be injected, which happens a) if an interrupt shadow bit (STI or MOV SS) has gone away; b) if the interrupt flag has just been set (other instructions than STI can set it without enabling an interrupt shadow). This cuts another 700-900 cycles from the cost of emulating an instruction (measured on a Sandy Bridge Xeon: 1650-2600 cycles before the patch on kvm-unit-tests, 925-1700 afterwards). Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a56126e6bd75..cd9316786dca 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -87,6 +87,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -4868,8 +4869,11 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) */ if (int_shadow & mask) mask = 0; - if (unlikely(int_shadow || mask)) + if (unlikely(int_shadow || mask)) { kvm_x86_ops->set_interrupt_shadow(vcpu, mask); + if (!mask) + kvm_make_request(KVM_REQ_EVENT, vcpu); + } } static void inject_emulated_exception(struct kvm_vcpu *vcpu) @@ -5095,20 +5099,18 @@ static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, return dr6; } -static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, int *r) +static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflags, int *r) { struct kvm_run *kvm_run = vcpu->run; /* - * Use the "raw" value to see if TF was passed to the processor. - * Note that the new value of the flags has not been saved yet. + * rflags is the old, "raw" value of the flags. The new value has + * not been saved yet. * * This is correct even for TF set by the guest, because "the * processor will not generate this exception after the instruction * that sets the TF flag". */ - unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); - if (unlikely(rflags & X86_EFLAGS_TF)) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; @@ -5275,13 +5277,22 @@ restart: r = EMULATE_DONE; if (writeback) { + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); - kvm_make_request(KVM_REQ_EVENT, vcpu); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) - kvm_vcpu_check_singlestep(vcpu, &r); - kvm_set_rflags(vcpu, ctxt->eflags); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + __kvm_set_rflags(vcpu, ctxt->eflags); + + /* + * For STI, interrupts are shadowed; so KVM_REQ_EVENT will + * do nothing, and it will be requested again as soon as + * the shadow expires. But we still need to check here, + * because POPF has no interrupt shadow. + */ + if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) + kvm_make_request(KVM_REQ_EVENT, vcpu); } else vcpu->arch.emulate_regs_need_sync_to_vcpu = true; @@ -7406,12 +7417,17 @@ unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_get_rflags); -void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP && kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) rflags |= X86_EFLAGS_TF; kvm_x86_ops->set_rflags(vcpu, rflags); +} + +void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) +{ + __kvm_set_rflags(vcpu, rflags); kvm_make_request(KVM_REQ_EVENT, vcpu); } EXPORT_SYMBOL_GPL(kvm_set_rflags); -- cgit v1.2.3 From e24186e097b80c5995ff75e1bbcd541d09c9e42b Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Mar 2014 12:00:57 +0100 Subject: KVM: emulate: move around some checks The only purpose of this patch is to make the next patch simpler to review. No semantic change. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 15453e569f3d..d79677c6056d 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4388,12 +4388,15 @@ done_prefixes: ctxt->d |= opcode.flags; } + /* Unrecognised? */ + if (ctxt->d == 0) + return EMULATION_FAILED; + ctxt->execute = opcode.u.execute; ctxt->check_perm = opcode.check_perm; ctxt->intercept = opcode.intercept; - /* Unrecognised? */ - if (ctxt->d == 0 || (ctxt->d & NotImpl)) + if (ctxt->d & NotImpl) return EMULATION_FAILED; if (!(ctxt->d & EmulateOnUD) && ctxt->ud) @@ -4535,19 +4538,19 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) ctxt->mem_read.pos = 0; - if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || - (ctxt->d & Undefined)) { + /* LOCK prefix is allowed only with some instructions */ + if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { rc = emulate_ud(ctxt); goto done; } - /* LOCK prefix is allowed only with some instructions */ - if (ctxt->lock_prefix && (!(ctxt->d & Lock) || ctxt->dst.type != OP_MEM)) { + if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) { rc = emulate_ud(ctxt); goto done; } - if ((ctxt->d & SrcMask) == SrcMemFAddr && ctxt->src.type != OP_MEM) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { rc = emulate_ud(ctxt); goto done; } -- cgit v1.2.3 From d40a6898e50c2589ca3d345ef5ca6671e2b35b1a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Mar 2014 11:58:02 +0100 Subject: KVM: emulate: protect checks on ctxt->d by a common "if (unlikely())" There are several checks for "peculiar" aspects of instructions in both x86_decode_insn and x86_emulate_insn. Group them together, and guard them with a single "if" that lets the processor quickly skip them all. Make this more effective by adding two more flag bits that say whether the .intercept and .check_perm fields are valid. We will reuse these flags later to avoid initializing fields of the emulate_ctxt struct. This skims about 30 cycles for each emulated instructions, which is approximately a 3% improvement. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 175 ++++++++++++++++++++++++++----------------------- 1 file changed, 94 insertions(+), 81 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index d79677c6056d..ea56dae3e67c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -162,6 +162,8 @@ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ +#define Intercept ((u64)1 << 48) /* Has valid intercept field */ +#define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -3546,9 +3548,9 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) } #define D(_y) { .flags = (_y) } -#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } -#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } +#define DI(_y, _i) { .flags = (_y)|Intercept, .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y)|Intercept|CheckPerm, \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define N D(NotImpl) #define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group | ModRM), .u.group = (_g) } @@ -3557,10 +3559,10 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt) #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } #define F(_f, _e) { .flags = (_f) | Fastop, .u.fastop = (_e) } #define II(_f, _e, _i) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } + { .flags = (_f)|Intercept, .u.execute = (_e), .intercept = x86_intercept_##_i } #define IIP(_f, _e, _i, _p) \ - { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ - .check_perm = (_p) } + { .flags = (_f)|Intercept|CheckPerm, .u.execute = (_e), \ + .intercept = x86_intercept_##_i, .check_perm = (_p) } #define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) @@ -4393,29 +4395,37 @@ done_prefixes: return EMULATION_FAILED; ctxt->execute = opcode.u.execute; - ctxt->check_perm = opcode.check_perm; - ctxt->intercept = opcode.intercept; - if (ctxt->d & NotImpl) - return EMULATION_FAILED; + if (unlikely(ctxt->d & + (NotImpl|EmulateOnUD|Stack|Op3264|Sse|Mmx|Intercept|CheckPerm))) { + /* + * These are copied unconditionally here, and checked unconditionally + * in x86_emulate_insn. + */ + ctxt->check_perm = opcode.check_perm; + ctxt->intercept = opcode.intercept; - if (!(ctxt->d & EmulateOnUD) && ctxt->ud) - return EMULATION_FAILED; + if (ctxt->d & NotImpl) + return EMULATION_FAILED; - if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) - ctxt->op_bytes = 8; + if (!(ctxt->d & EmulateOnUD) && ctxt->ud) + return EMULATION_FAILED; - if (ctxt->d & Op3264) { - if (mode == X86EMUL_MODE_PROT64) + if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8; - else - ctxt->op_bytes = 4; - } - if (ctxt->d & Sse) - ctxt->op_bytes = 16; - else if (ctxt->d & Mmx) - ctxt->op_bytes = 8; + if (ctxt->d & Op3264) { + if (mode == X86EMUL_MODE_PROT64) + ctxt->op_bytes = 8; + else + ctxt->op_bytes = 4; + } + + if (ctxt->d & Sse) + ctxt->op_bytes = 16; + else if (ctxt->d & Mmx) + ctxt->op_bytes = 8; + } /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { @@ -4549,75 +4559,78 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || - (ctxt->d & Undefined)) { - rc = emulate_ud(ctxt); - goto done; - } - - if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) - || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { - rc = emulate_ud(ctxt); - goto done; - } - - if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { - rc = emulate_nm(ctxt); - goto done; - } + if (unlikely(ctxt->d & + (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { + if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || + (ctxt->d & Undefined)) { + rc = emulate_ud(ctxt); + goto done; + } - if (ctxt->d & Mmx) { - rc = flush_pending_x87_faults(ctxt); - if (rc != X86EMUL_CONTINUE) + if (((ctxt->d & (Sse|Mmx)) && ((ops->get_cr(ctxt, 0) & X86_CR0_EM))) + || ((ctxt->d & Sse) && !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); goto done; - /* - * Now that we know the fpu is exception safe, we can fetch - * operands from it. - */ - fetch_possible_mmx_operand(ctxt, &ctxt->src); - fetch_possible_mmx_operand(ctxt, &ctxt->src2); - if (!(ctxt->d & Mov)) - fetch_possible_mmx_operand(ctxt, &ctxt->dst); - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_PRE_EXCEPT); - if (rc != X86EMUL_CONTINUE) + if ((ctxt->d & (Sse|Mmx)) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); goto done; - } + } - /* Privileged instruction can be executed only in CPL=0 */ - if ((ctxt->d & Priv) && ops->cpl(ctxt)) { - rc = emulate_gp(ctxt, 0); - goto done; - } + if (ctxt->d & Mmx) { + rc = flush_pending_x87_faults(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + /* + * Now that we know the fpu is exception safe, we can fetch + * operands from it. + */ + fetch_possible_mmx_operand(ctxt, &ctxt->src); + fetch_possible_mmx_operand(ctxt, &ctxt->src2); + if (!(ctxt->d & Mov)) + fetch_possible_mmx_operand(ctxt, &ctxt->dst); + } - /* Instruction can only be executed in protected mode */ - if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { - rc = emulate_ud(ctxt); - goto done; - } + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } - /* Do instruction specific permission checks */ - if (ctxt->check_perm) { - rc = ctxt->check_perm(ctxt); - if (rc != X86EMUL_CONTINUE) + /* Privileged instruction can be executed only in CPL=0 */ + if ((ctxt->d & Priv) && ops->cpl(ctxt)) { + rc = emulate_gp(ctxt, 0); goto done; - } + } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { - rc = emulator_check_intercept(ctxt, ctxt->intercept, - X86_ICPT_POST_EXCEPT); - if (rc != X86EMUL_CONTINUE) + /* Instruction can only be executed in protected mode */ + if ((ctxt->d & Prot) && ctxt->mode < X86EMUL_MODE_PROT16) { + rc = emulate_ud(ctxt); goto done; - } + } - if (ctxt->rep_prefix && (ctxt->d & String)) { - /* All REP prefixes have the same first termination condition */ - if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { - ctxt->eip = ctxt->_eip; - goto done; + /* Do instruction specific permission checks */ + if (ctxt->check_perm) { + rc = ctxt->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + rc = emulator_check_intercept(ctxt, ctxt->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (ctxt->rep_prefix && (ctxt->d & String)) { + /* All REP prefixes have the same first termination condition */ + if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { + ctxt->eip = ctxt->_eip; + goto done; + } } } -- cgit v1.2.3 From 54cfdb3e95d4f70409a7d3432a42cffc9a232be7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 27 Mar 2014 11:36:25 +0100 Subject: KVM: emulate: speed up emulated moves We can just blindly move all 16 bytes of ctxt->src's value to ctxt->dst. write_register_operand will take care of writing only the lower bytes. Avoiding a call to memcpy (the compiler optimizes it out) gains about 200 cycles on kvm-unit-tests for register-to-register moves, and makes them about as fast as arithmetic instructions. We could perhaps get a larger speedup by moving all instructions _except_ moves out of x86_emulate_insn, removing opcode_len, and replacing the switch statement with an inlined em_mov. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 2 +- arch/x86/kvm/emulate.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0e0151c13b2c..432447370044 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -233,7 +233,7 @@ struct operand { union { unsigned long val; u64 val64; - char valptr[sizeof(unsigned long) + 2]; + char valptr[sizeof(sse128_t)]; sse128_t vec_val; u64 mm_val; void *data; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ea56dae3e67c..27f677ef703e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2990,7 +2990,7 @@ static int em_rdpmc(struct x86_emulate_ctxt *ctxt) static int em_mov(struct x86_emulate_ctxt *ctxt) { - memcpy(ctxt->dst.valptr, ctxt->src.valptr, ctxt->op_bytes); + memcpy(ctxt->dst.valptr, ctxt->src.valptr, sizeof(ctxt->src.valptr)); return X86EMUL_CONTINUE; } -- cgit v1.2.3 From f5f87dfbc777f89148c3c66438741139845d3ac6 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 1 Apr 2014 13:23:24 +0200 Subject: KVM: emulate: simplify writeback The "if/return" checks are useless, because we return X86EMUL_CONTINUE anyway if we do not return. Reviewed-by: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 27f677ef703e..32d3da82da2e 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1588,34 +1588,28 @@ static void write_register_operand(struct operand *op) static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op) { - int rc; - switch (op->type) { case OP_REG: write_register_operand(op); break; case OP_MEM: if (ctxt->lock_prefix) - rc = segmented_cmpxchg(ctxt, + return segmented_cmpxchg(ctxt, + op->addr.mem, + &op->orig_val, + &op->val, + op->bytes); + else + return segmented_write(ctxt, op->addr.mem, - &op->orig_val, &op->val, op->bytes); - else - rc = segmented_write(ctxt, - op->addr.mem, - &op->val, - op->bytes); - if (rc != X86EMUL_CONTINUE) - return rc; break; case OP_MEM_STR: - rc = segmented_write(ctxt, - op->addr.mem, - op->data, - op->bytes * op->count); - if (rc != X86EMUL_CONTINUE) - return rc; + return segmented_write(ctxt, + op->addr.mem, + op->data, + op->bytes * op->count); break; case OP_XMM: write_sse_reg(ctxt, &op->vec_val, op->addr.xmm); -- cgit v1.2.3 From 1498507a47867596de158d4db8728e92385a4919 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 16 Apr 2014 12:46:09 -0400 Subject: KVM: emulate: move init_decode_cache to emulate.c Core emulator functions all belong in emulator.c, x86 should have no knowledge of emulator internals Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 1 + arch/x86/kvm/emulate.c | 13 +++++++++++++ arch/x86/kvm/x86.c | 13 ------------- 3 files changed, 14 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 432447370044..c22bd9af4311 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -409,6 +409,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_OK 0 #define EMULATION_RESTART 1 #define EMULATION_INTERCEPTED 2 +void init_decode_cache(struct x86_emulate_ctxt *ctxt); int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int idt_index, int reason, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 32d3da82da2e..851315d93bf7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4534,6 +4534,19 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) return X86EMUL_CONTINUE; } +void init_decode_cache(struct x86_emulate_ctxt *ctxt) +{ + memset(&ctxt->opcode_len, 0, + (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); + + ctxt->fetch.start = 0; + ctxt->fetch.end = 0; + ctxt->io_read.pos = 0; + ctxt->io_read.end = 0; + ctxt->mem_read.pos = 0; + ctxt->mem_read.end = 0; +} + int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) { const struct x86_emulate_ops *ops = ctxt->ops; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd9316786dca..905edf8557e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4888,19 +4888,6 @@ static void inject_emulated_exception(struct kvm_vcpu *vcpu) kvm_queue_exception(vcpu, ctxt->exception.vector); } -static void init_decode_cache(struct x86_emulate_ctxt *ctxt) -{ - memset(&ctxt->opcode_len, 0, - (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); - - ctxt->fetch.start = 0; - ctxt->fetch.end = 0; - ctxt->io_read.pos = 0; - ctxt->io_read.end = 0; - ctxt->mem_read.pos = 0; - ctxt->mem_read.end = 0; -} - static void init_emulate_ctxt(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; -- cgit v1.2.3 From 685bbf4ac406364a84a1d4237b4970dc570fd4cb Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 16 Apr 2014 12:46:10 -0400 Subject: KVM: emulate: Remove ctxt->intercept and ctxt->check_perm checks The same information can be gleaned from ctxt->d and avoids having to zero/NULL initialize intercept and check_perm Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 851315d93bf7..000fc7398832 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4599,7 +4599,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4619,13 +4619,13 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } /* Do instruction specific permission checks */ - if (ctxt->check_perm) { + if (ctxt->d & CheckPerm) { rc = ctxt->check_perm(ctxt); if (rc != X86EMUL_CONTINUE) goto done; } - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -4671,7 +4671,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->guest_mode) && ctxt->intercept) { + if (unlikely(ctxt->guest_mode) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) -- cgit v1.2.3 From 02357bdc8c30a60cd33dd438f851c1306c34f435 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 16 Apr 2014 12:46:11 -0400 Subject: KVM: emulate: cleanup decode_modrm Remove the if conditional - that will help us avoid an "else initialize to 0" Also, rearrange operators for slightly better code. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 000fc7398832..94f5f8b94ce9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1066,19 +1066,17 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, struct operand *op) { u8 sib; - int index_reg = 0, base_reg = 0, scale; + int index_reg, base_reg, scale; int rc = X86EMUL_CONTINUE; ulong modrm_ea = 0; - if (ctxt->rex_prefix) { - ctxt->modrm_reg = (ctxt->rex_prefix & 4) << 1; /* REX.R */ - index_reg = (ctxt->rex_prefix & 2) << 2; /* REX.X */ - ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ - } + ctxt->modrm_reg = ((ctxt->rex_prefix << 1) & 8); /* REX.R */ + index_reg = (ctxt->rex_prefix << 2) & 8; /* REX.X */ + base_reg = (ctxt->rex_prefix << 3) & 8; /* REX.B */ - ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; + ctxt->modrm_mod = (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; - ctxt->modrm_rm |= (ctxt->modrm & 0x07); + ctxt->modrm_rm = base_reg | (ctxt->modrm & 0x07); ctxt->modrm_seg = VCPU_SREG_DS; if (ctxt->modrm_mod == 3 || (ctxt->d & NoMod)) { -- cgit v1.2.3 From c44b4c6ab80eef3a9c52c7b3f0c632942e6489aa Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 16 Apr 2014 12:46:12 -0400 Subject: KVM: emulate: clean up initializations in init_decode_cache A lot of initializations are unnecessary as they get set to appropriate values before actually being used. Optimize placement of fields in x86_emulate_ctxt Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 20 +++++++++++--------- arch/x86/kvm/emulate.c | 7 ++----- 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index c22bd9af4311..d40a10a38a80 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -288,30 +288,32 @@ struct x86_emulate_ctxt { u8 opcode_len; u8 b; u8 intercept; - u8 lock_prefix; - u8 rep_prefix; u8 op_bytes; u8 ad_bytes; - u8 rex_prefix; struct operand src; struct operand src2; struct operand dst; - bool has_seg_override; - u8 seg_override; - u64 d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); + bool has_seg_override; + bool rip_relative; + u8 rex_prefix; + u8 lock_prefix; + u8 rep_prefix; + u8 seg_override; + /* bitmaps of registers in _regs[] that can be read */ + u32 regs_valid; + /* bitmaps of registers in _regs[] that have been written */ + u32 regs_dirty; /* modrm */ u8 modrm; u8 modrm_mod; u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; - bool rip_relative; + u64 d; unsigned long _eip; struct operand memop; - u32 regs_valid; /* bitmaps of registers in _regs[] that can be read */ - u32 regs_dirty; /* bitmaps of registers in _regs[] that have been written */ /* Fields above regs are cleared together. */ unsigned long _regs[NR_VCPU_REGS]; struct operand *memopp; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 94f5f8b94ce9..3e9bbdc4c76a 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4534,14 +4534,11 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) void init_decode_cache(struct x86_emulate_ctxt *ctxt) { - memset(&ctxt->opcode_len, 0, - (void *)&ctxt->_regs - (void *)&ctxt->opcode_len); + memset(&ctxt->has_seg_override, 0, + (void *)&ctxt->modrm - (void *)&ctxt->has_seg_override); - ctxt->fetch.start = 0; - ctxt->fetch.end = 0; ctxt->io_read.pos = 0; ctxt->io_read.end = 0; - ctxt->mem_read.pos = 0; ctxt->mem_read.end = 0; } -- cgit v1.2.3 From 573e80fe04db1aa44e8303037f65716ba5c3a343 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 16 Apr 2014 12:46:13 -0400 Subject: KVM: emulate: rework seg_override x86_decode_insn already sets a default for seg_override, so remove it from the zeroed area. Also replace set/get functions with direct access to the field. Signed-off-by: Bandan Das Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 3 +-- arch/x86/kvm/emulate.c | 41 +++++++++++++++----------------------- 2 files changed, 17 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index d40a10a38a80..6c808408326f 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -295,12 +295,10 @@ struct x86_emulate_ctxt { struct operand dst; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); - bool has_seg_override; bool rip_relative; u8 rex_prefix; u8 lock_prefix; u8 rep_prefix; - u8 seg_override; /* bitmaps of registers in _regs[] that can be read */ u32 regs_valid; /* bitmaps of registers in _regs[] that have been written */ @@ -311,6 +309,7 @@ struct x86_emulate_ctxt { u8 modrm_reg; u8 modrm_rm; u8 modrm_seg; + u8 seg_override; u64 d; unsigned long _eip; struct operand memop; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 3e9bbdc4c76a..08badf638fb0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -514,12 +514,6 @@ static u32 desc_limit_scaled(struct desc_struct *desc) return desc->g ? (limit << 12) | 0xfff : limit; } -static void set_seg_override(struct x86_emulate_ctxt *ctxt, int seg) -{ - ctxt->has_seg_override = true; - ctxt->seg_override = seg; -} - static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) { if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) @@ -528,14 +522,6 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) return ctxt->ops->get_cached_segment_base(ctxt, seg); } -static unsigned seg_override(struct x86_emulate_ctxt *ctxt) -{ - if (!ctxt->has_seg_override) - return 0; - - return ctxt->seg_override; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { @@ -4169,7 +4155,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; op->addr.mem.ea = register_address(ctxt, reg_read(ctxt, VCPU_REGS_RSI)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; op->count = 1; break; @@ -4180,7 +4166,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, register_address(ctxt, reg_read(ctxt, VCPU_REGS_RBX) + (reg_read(ctxt, VCPU_REGS_RAX) & 0xff)); - op->addr.mem.seg = seg_override(ctxt); + op->addr.mem.seg = ctxt->seg_override; op->val = 0; break; case OpImmFAddr: @@ -4227,6 +4213,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int mode = ctxt->mode; int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; + bool has_seg_override = false; struct opcode opcode; ctxt->memop.type = OP_NONE; @@ -4280,11 +4267,13 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) case 0x2e: /* CS override */ case 0x36: /* SS override */ case 0x3e: /* DS override */ - set_seg_override(ctxt, (ctxt->b >> 3) & 3); + has_seg_override = true; + ctxt->seg_override = (ctxt->b >> 3) & 3; break; case 0x64: /* FS override */ case 0x65: /* GS override */ - set_seg_override(ctxt, ctxt->b & 7); + has_seg_override = true; + ctxt->seg_override = ctxt->b & 7; break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -4422,17 +4411,19 @@ done_prefixes: /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { rc = decode_modrm(ctxt, &ctxt->memop); - if (!ctxt->has_seg_override) - set_seg_override(ctxt, ctxt->modrm_seg); + if (!has_seg_override) { + has_seg_override = true; + ctxt->seg_override = ctxt->modrm_seg; + } } else if (ctxt->d & MemAbs) rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; - if (!ctxt->has_seg_override) - set_seg_override(ctxt, VCPU_SREG_DS); + if (!has_seg_override) + ctxt->seg_override = VCPU_SREG_DS; - ctxt->memop.addr.mem.seg = seg_override(ctxt); + ctxt->memop.addr.mem.seg = ctxt->seg_override; if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; @@ -4534,8 +4525,8 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) void init_decode_cache(struct x86_emulate_ctxt *ctxt) { - memset(&ctxt->has_seg_override, 0, - (void *)&ctxt->modrm - (void *)&ctxt->has_seg_override); + memset(&ctxt->rip_relative, 0, + (void *)&ctxt->modrm - (void *)&ctxt->rip_relative); ctxt->io_read.pos = 0; ctxt->io_read.end = 0; -- cgit v1.2.3 From 41061cdb98a0bec464278b4db8e894a3121671f5 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Wed, 16 Apr 2014 12:46:14 -0400 Subject: KVM: emulate: do not initialize memopp rip_relative is only set if decode_modrm runs, and if you have ModRM you will also have a memopp. We can then access memopp unconditionally. Note that rip_relative cannot be hoisted up to decode_modrm, or you break "mov $0, xyz(%rip)". Also, move typecast on "out of range value" of mem.ea to decode_modrm. Together, all these optimizations save about 50 cycles on each emulated instructions (4-6%). Signed-off-by: Bandan Das [Fix immediate operands with rip-relative addressing. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 5 +++++ arch/x86/kvm/emulate.c | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6c808408326f..fcf58cd25ebd 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -295,6 +295,11 @@ struct x86_emulate_ctxt { struct operand dst; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); + /* + * The following six fields are cleared together, + * the rest are initialized unconditionally in x86_decode_insn + * or elsewhere + */ bool rip_relative; u8 rex_prefix; u8 lock_prefix; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 08badf638fb0..390400a54a9c 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1177,6 +1177,9 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } } op->addr.mem.ea = modrm_ea; + if (ctxt->ad_bytes != 8) + ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; + done: return rc; } @@ -4425,9 +4428,6 @@ done_prefixes: ctxt->memop.addr.mem.seg = ctxt->seg_override; - if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) - ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; - /* * Decode and fetch the source operand: register, memory * or immediate. @@ -4448,7 +4448,7 @@ done_prefixes: rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); done: - if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) + if (ctxt->rip_relative) ctxt->memopp->addr.mem.ea += ctxt->_eip; return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; -- cgit v1.2.3 From 285ca9e948fa047e51fe47082528034de5369e8d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 May 2014 12:24:32 +0200 Subject: KVM: emulate: speed up do_insn_fetch Hoist the common case up from do_insn_fetch_byte to do_insn_fetch, and prime the fetch_cache in x86_decode_insn. This helps a bit the compiler and the branch predictor, but above all it lays the ground for further changes in the next few patches. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 67 +++++++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 390400a54a9c..ea188a338afb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -705,51 +705,51 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, } /* - * Fetch the next byte of the instruction being emulated which is pointed to - * by ctxt->_eip, then increment ctxt->_eip. - * - * Also prefetch the remaining bytes of the instruction without crossing page + * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) +static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt) { struct fetch_cache *fc = &ctxt->fetch; int rc; int size, cur_size; - - if (ctxt->_eip == fc->end) { - unsigned long linear; - struct segmented_address addr = { .seg = VCPU_SREG_CS, - .ea = ctxt->_eip }; - cur_size = fc->end - fc->start; - size = min(15UL - cur_size, - PAGE_SIZE - offset_in_page(ctxt->_eip)); - rc = __linearize(ctxt, addr, size, false, true, &linear); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, - size, &ctxt->exception); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; - fc->end += size; - } - *dest = fc->data[ctxt->_eip - fc->start]; - ctxt->_eip++; + unsigned long linear; + + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = fc->end }; + cur_size = fc->end - fc->start; + size = min(15UL - cur_size, + PAGE_SIZE - offset_in_page(fc->end)); + if (unlikely(size == 0)) + return X86EMUL_UNHANDLEABLE; + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; + rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, + size, &ctxt->exception); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; + fc->end += size; return X86EMUL_CONTINUE; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - void *dest, unsigned size) + void *__dest, unsigned size) { int rc; + struct fetch_cache *fc = &ctxt->fetch; + u8 *dest = __dest; + u8 *src = &fc->data[ctxt->_eip - fc->start]; - /* x86 instructions are limited to 15 bytes. */ - if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) - return X86EMUL_UNHANDLEABLE; while (size--) { - rc = do_insn_fetch_byte(ctxt, dest++); - if (rc != X86EMUL_CONTINUE) - return rc; + if (unlikely(ctxt->_eip == fc->end)) { + rc = do_insn_fetch_bytes(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + } + *dest++ = *src++; + ctxt->_eip++; + continue; } return X86EMUL_CONTINUE; } @@ -4227,6 +4227,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); + else { + rc = do_insn_fetch_bytes(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + } switch (mode) { case X86EMUL_MODE_REAL: -- cgit v1.2.3 From 5cfc7e0f5e5e1adf998df94f8e36edaf5d30d38e Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 May 2014 13:05:25 +0200 Subject: KVM: emulate: avoid repeated calls to do_insn_fetch_bytes do_insn_fetch_bytes will only be called once in a given insn_fetch and insn_fetch_arr, because in fact it will only be called at most twice for any instruction and the first call is explicit in x86_decode_insn. This observation lets us hoist the call out of the memory copying loop. It does not buy performance, because most fetches are one byte long anyway, but it prepares for the next patch. The overflow check is tricky, but correct. Because do_insn_fetch_bytes has already been called once, we know that fc->end is at least 15. So it is okay to subtract the number of bytes we want to read. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ea188a338afb..ca82ec9c5ffe 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -708,7 +708,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ -static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt) +static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { struct fetch_cache *fc = &ctxt->fetch; int rc; @@ -720,7 +720,14 @@ static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt) cur_size = fc->end - fc->start; size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(fc->end)); - if (unlikely(size == 0)) + + /* + * One instruction can only straddle two pages, + * and one has been loaded at the beginning of + * x86_decode_insn. So, if not enough bytes + * still, we must have hit the 15-byte boundary. + */ + if (unlikely(size < op_size)) return X86EMUL_UNHANDLEABLE; rc = __linearize(ctxt, addr, size, false, true, &linear); if (unlikely(rc != X86EMUL_CONTINUE)) @@ -736,17 +743,18 @@ static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt) static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, void *__dest, unsigned size) { - int rc; struct fetch_cache *fc = &ctxt->fetch; u8 *dest = __dest; u8 *src = &fc->data[ctxt->_eip - fc->start]; + /* We have to be careful about overflow! */ + if (unlikely(ctxt->_eip > fc->end - size)) { + int rc = do_insn_fetch_bytes(ctxt, size); + if (rc != X86EMUL_CONTINUE) + return rc; + } + while (size--) { - if (unlikely(ctxt->_eip == fc->end)) { - rc = do_insn_fetch_bytes(ctxt); - if (rc != X86EMUL_CONTINUE) - return rc; - } *dest++ = *src++; ctxt->_eip++; continue; @@ -4228,7 +4236,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); else { - rc = do_insn_fetch_bytes(ctxt); + rc = do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) return rc; } -- cgit v1.2.3 From 9506d57de3bc8277a4e306e0d439976862f68c6d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 May 2014 13:05:25 +0200 Subject: KVM: emulate: avoid per-byte copying in instruction fetches We do not need a memory copying loop anymore in insn_fetch; we can use a byte-aligned pointer to access instruction fields directly from the fetch_cache. This eliminates 50-150 cycles (corresponding to a 5-10% improvement in performance) from each instruction. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index ca82ec9c5ffe..02c668aca2b6 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -708,7 +708,7 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, * Prefetch the remaining bytes of the instruction without crossing page * boundary if they are not in fetch_cache yet. */ -static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) +static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { struct fetch_cache *fc = &ctxt->fetch; int rc; @@ -740,41 +740,39 @@ static int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) return X86EMUL_CONTINUE; } -static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - void *__dest, unsigned size) +static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, + unsigned size) { - struct fetch_cache *fc = &ctxt->fetch; - u8 *dest = __dest; - u8 *src = &fc->data[ctxt->_eip - fc->start]; - /* We have to be careful about overflow! */ - if (unlikely(ctxt->_eip > fc->end - size)) { - int rc = do_insn_fetch_bytes(ctxt, size); - if (rc != X86EMUL_CONTINUE) - return rc; - } - - while (size--) { - *dest++ = *src++; - ctxt->_eip++; - continue; - } - return X86EMUL_CONTINUE; + if (unlikely(ctxt->_eip > ctxt->fetch.end - size)) + return __do_insn_fetch_bytes(ctxt, size); + else + return X86EMUL_CONTINUE; } /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _ctxt) \ -({ unsigned long _x; \ - rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ +({ _type _x; \ + struct fetch_cache *_fc; \ + \ + rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_type)_x; \ + _fc = &ctxt->fetch; \ + _x = *(_type __aligned(1) *) &_fc->data[ctxt->_eip - _fc->start]; \ + ctxt->_eip += sizeof(_type); \ + _x; \ }) #define insn_fetch_arr(_arr, _size, _ctxt) \ -({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ +({ \ + struct fetch_cache *_fc; \ + rc = do_insn_fetch_bytes(_ctxt, _size); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ + _fc = &ctxt->fetch; \ + memcpy(_arr, &_fc->data[ctxt->_eip - _fc->start], _size); \ + ctxt->_eip += (_size); \ }) /* @@ -4236,7 +4234,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); else { - rc = do_insn_fetch_bytes(ctxt, 1); + rc = __do_insn_fetch_bytes(ctxt, 1); if (rc != X86EMUL_CONTINUE) return rc; } -- cgit v1.2.3 From 17052f16a51af6d8f4b7eee0631af675ac204f65 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 6 May 2014 16:33:01 +0200 Subject: KVM: emulate: put pointers in the fetch_cache This simplifies the code a bit, especially the overflow checks. Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_emulate.h | 4 ++-- arch/x86/kvm/emulate.c | 34 +++++++++++++++------------------- arch/x86/kvm/trace.h | 6 +++--- 3 files changed, 20 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index fcf58cd25ebd..eb181178fe0b 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -242,8 +242,8 @@ struct operand { struct fetch_cache { u8 data[15]; - unsigned long start; - unsigned long end; + u8 *ptr; + u8 *end; }; struct read_cache { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 02c668aca2b6..c16314807756 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -710,16 +710,15 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, */ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { - struct fetch_cache *fc = &ctxt->fetch; int rc; - int size, cur_size; + int size; unsigned long linear; - + int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, - .ea = fc->end }; - cur_size = fc->end - fc->start; - size = min(15UL - cur_size, - PAGE_SIZE - offset_in_page(fc->end)); + .ea = ctxt->eip + cur_size }; + + size = min(15UL ^ cur_size, + PAGE_SIZE - offset_in_page(addr.ea)); /* * One instruction can only straddle two pages, @@ -732,19 +731,18 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) rc = __linearize(ctxt, addr, size, false, true, &linear); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; - rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, + rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception); if (unlikely(rc != X86EMUL_CONTINUE)) return rc; - fc->end += size; + ctxt->fetch.end += size; return X86EMUL_CONTINUE; } static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, unsigned size) { - /* We have to be careful about overflow! */ - if (unlikely(ctxt->_eip > ctxt->fetch.end - size)) + if (unlikely(ctxt->fetch.end - ctxt->fetch.ptr < size)) return __do_insn_fetch_bytes(ctxt, size); else return X86EMUL_CONTINUE; @@ -753,26 +751,24 @@ static __always_inline int do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, /* Fetch next part of the instruction being emulated. */ #define insn_fetch(_type, _ctxt) \ ({ _type _x; \ - struct fetch_cache *_fc; \ \ rc = do_insn_fetch_bytes(_ctxt, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - _fc = &ctxt->fetch; \ - _x = *(_type __aligned(1) *) &_fc->data[ctxt->_eip - _fc->start]; \ ctxt->_eip += sizeof(_type); \ + _x = *(_type __aligned(1) *) ctxt->fetch.ptr; \ + ctxt->fetch.ptr += sizeof(_type); \ _x; \ }) #define insn_fetch_arr(_arr, _size, _ctxt) \ ({ \ - struct fetch_cache *_fc; \ rc = do_insn_fetch_bytes(_ctxt, _size); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - _fc = &ctxt->fetch; \ - memcpy(_arr, &_fc->data[ctxt->_eip - _fc->start], _size); \ ctxt->_eip += (_size); \ + memcpy(_arr, ctxt->fetch.ptr, _size); \ + ctxt->fetch.ptr += (_size); \ }) /* @@ -4228,8 +4224,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; - ctxt->fetch.start = ctxt->_eip; - ctxt->fetch.end = ctxt->fetch.start + insn_len; + ctxt->fetch.ptr = ctxt->fetch.data; + ctxt->fetch.end = ctxt->fetch.data + insn_len; ctxt->opcode_len = 1; if (insn_len > 0) memcpy(ctxt->fetch.data, insn, insn_len); diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 33574c95220d..e850a7d332be 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -721,10 +721,10 @@ TRACE_EVENT(kvm_emulate_insn, ), TP_fast_assign( - __entry->rip = vcpu->arch.emulate_ctxt.fetch.start; __entry->csbase = kvm_x86_ops->get_segment_base(vcpu, VCPU_SREG_CS); - __entry->len = vcpu->arch.emulate_ctxt._eip - - vcpu->arch.emulate_ctxt.fetch.start; + __entry->len = vcpu->arch.emulate_ctxt.fetch.ptr + - vcpu->arch.emulate_ctxt.fetch.data; + __entry->rip = vcpu->arch.emulate_ctxt._eip - __entry->len; memcpy(__entry->insn, vcpu->arch.emulate_ctxt.fetch.data, 15); -- cgit v1.2.3 From 719d5a9b2487e0562f178f61e323c3dc18a8b200 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Jun 2014 11:37:06 +0200 Subject: KVM: x86: ensure emulator fetches do not span multiple pages When the CS base is not page-aligned, the linear address of the code could get close to the page boundary (e.g. 0x...ffe) even if the EIP value is not. So we need to first linearize the address, and only then compute the number of valid bytes that can be fetched. This happens relatively often when executing real mode code. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c16314807756..6a1d60956d63 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -711,14 +711,18 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) { int rc; - int size; + unsigned size; unsigned long linear; int cur_size = ctxt->fetch.end - ctxt->fetch.data; struct segmented_address addr = { .seg = VCPU_SREG_CS, .ea = ctxt->eip + cur_size }; - size = min(15UL ^ cur_size, - PAGE_SIZE - offset_in_page(addr.ea)); + size = 15UL ^ cur_size; + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (unlikely(rc != X86EMUL_CONTINUE)) + return rc; + + size = min_t(unsigned, size, PAGE_SIZE - offset_in_page(linear)); /* * One instruction can only straddle two pages, @@ -728,9 +732,6 @@ static int __do_insn_fetch_bytes(struct x86_emulate_ctxt *ctxt, int op_size) */ if (unlikely(size < op_size)) return X86EMUL_UNHANDLEABLE; - rc = __linearize(ctxt, addr, size, false, true, &linear); - if (unlikely(rc != X86EMUL_CONTINUE)) - return rc; rc = ctxt->ops->fetch(ctxt, linear, ctxt->fetch.end, size, &ctxt->exception); if (unlikely(rc != X86EMUL_CONTINUE)) -- cgit v1.2.3 From 44583cba9188b29b20ceeefe8ae23ad19e26d9a4 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 13 May 2014 14:02:13 +0200 Subject: KVM: x86: use kvm_read_guest_page for emulator accesses Emulator accesses are always done a page at a time, either by the emulator itself (for fetches) or because we need to query the MMU for address translations. Speed up these accesses by using kvm_read_guest_page and, in the case of fetches, by inlining kvm_read_guest_virt_helper and dropping the loop around kvm_read_guest_page. This final tweak saves 30-100 more clock cycles (4-10%), bringing the count (as measured by kvm-unit-tests) down to 720-1100 clock cycles on a Sandy Bridge Xeon host, compared to 2300-3200 before the whole series and 925-1700 after the first two low-hanging fruit changes. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 905edf8557e7..f750b69ca443 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4085,7 +4085,8 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, if (gpa == UNMAPPED_GVA) return X86EMUL_PROPAGATE_FAULT; - ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, data, + offset, toread); if (ret < 0) { r = X86EMUL_IO_NEEDED; goto out; @@ -4106,10 +4107,24 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + unsigned offset; + int ret; - return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, - access | PFERR_FETCH_MASK, - exception); + /* Inline kvm_read_guest_virt_helper for speed. */ + gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access|PFERR_FETCH_MASK, + exception); + if (unlikely(gpa == UNMAPPED_GVA)) + return X86EMUL_PROPAGATE_FAULT; + + offset = addr & (PAGE_SIZE-1); + if (WARN_ON(offset + bytes > PAGE_SIZE)) + bytes = (unsigned)PAGE_SIZE - offset; + ret = kvm_read_guest_page(vcpu->kvm, gpa >> PAGE_SHIFT, val, + offset, bytes); + if (unlikely(ret < 0)) + return X86EMUL_IO_NEEDED; + + return X86EMUL_CONTINUE; } int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, -- cgit v1.2.3 From 10e38fc7cabc668738e6a7b7b57cbcddb2234440 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 18 Jun 2014 17:19:34 +0300 Subject: KVM: x86: Emulator flag for instruction that only support 16-bit addresses in real mode Certain instructions, such as monitor and xsave do not support big real mode and cause a #GP exception if any of the accessed bytes effective address are not within [0, 0xffff]. This patch introduces a flag to mark these instructions, including the necassary checks. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 6a1d60956d63..b61ffe9d86a7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -164,6 +164,7 @@ #define NoMod ((u64)1 << 47) /* Mod field is ignored */ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ +#define NoBigReal ((u64)1 << 50) /* No big real mode */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -640,7 +641,12 @@ static int __linearize(struct x86_emulate_ctxt *ctxt, if (!fetch && (desc.type & 8) && !(desc.type & 2)) goto bad; lim = desc_limit_scaled(&desc); - if ((desc.type & 8) || !(desc.type & 4)) { + if ((ctxt->mode == X86EMUL_MODE_REAL) && !fetch && + (ctxt->d & NoBigReal)) { + /* la is between zero and 0xffff */ + if (la > 0xffff || (u32)(la + size - 1) > 0xffff) + goto bad; + } else if ((desc.type & 8) || !(desc.type & 4)) { /* expand-up segment */ if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) goto bad; -- cgit v1.2.3 From 68efa764f3429f2bd71f431e91c04b0bcb7d34f1 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 18 Jun 2014 17:19:35 +0300 Subject: KVM: x86: Emulator support for #UD on CPL>0 Certain instructions (e.g., mwait and monitor) cause a #UD exception when they are executed in user mode. This is in contrast to the regular privileged instructions which cause #GP. In order not to mess with SVM interception of mwait and monitor which assumes privilege level assertions take place before interception, a flag has been added. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index b61ffe9d86a7..dd074106d0c9 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -165,6 +165,7 @@ #define Intercept ((u64)1 << 48) /* Has valid intercept field */ #define CheckPerm ((u64)1 << 49) /* Has valid check_perm field */ #define NoBigReal ((u64)1 << 50) /* No big real mode */ +#define PrivUD ((u64)1 << 51) /* #UD instead of #GP on CPL > 0 */ #define DstXacc (DstAccLo | SrcAccHi | SrcWrite) @@ -4608,7 +4609,10 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* Privileged instruction can be executed only in CPL=0 */ if ((ctxt->d & Priv) && ops->cpl(ctxt)) { - rc = emulate_gp(ctxt, 0); + if (ctxt->d & PrivUD) + rc = emulate_ud(ctxt); + else + rc = emulate_gp(ctxt, 0); goto done; } -- cgit v1.2.3 From 963fee1656603ce2e91ebb988cd5a92f2af41369 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 17 Jul 2014 19:03:00 +0800 Subject: KVM: nVMX: Fix virtual interrupt delivery injection This patch fix bug reported in https://bugzilla.kernel.org/show_bug.cgi?id=73331, after the patch http://www.spinics.net/lists/kvm/msg105230.html applied, there is some progress and the L2 can boot up, however, slowly. The original idea of this fix vid injection patch is from "Zhang, Yang Z" . Interrupt which delivered by vid should be injected to L1 by L0 if current is in L1, or should be injected to L2 by L0 through the old injection way if L1 doesn't have set External-interrupt exiting bit. The current logic doen't consider these cases. This patch fix it by vid intr to L1 if current is L1 or L2 through old injection way if L1 doen't have External-interrupt exiting bit set. Signed-off-by: Wanpeng Li Signed-off-by: "Zhang, Yang Z" Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0c9569b994f9..7534a9f67cc8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7130,7 +7130,26 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) if (max_irr == -1) return; - vmx_set_rvi(max_irr); + /* + * If a vmexit is needed, vmx_check_nested_events handles it. + */ + if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu)) + return; + + if (!is_guest_mode(vcpu)) { + vmx_set_rvi(max_irr); + return; + } + + /* + * Fall back to pre-APICv interrupt injection since L2 + * is run without virtual interrupt delivery. + */ + if (!kvm_event_needs_reinjection(vcpu) && + vmx_interrupt_allowed(vcpu)) { + kvm_queue_interrupt(vcpu, max_irr, false); + vmx_inject_irq(vcpu); + } } static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) -- cgit v1.2.3 From bb663c7ada380f3c89c2f83fdbe2b3626621385d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 21 Jul 2014 14:37:26 +0300 Subject: KVM: x86: Clearing rflags.rf upon skipped emulated instruction When skipping an emulated instruction, rflags.rf should be cleared as it would be on real x86 CPU. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f750b69ca443..1fd806cb96d4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5229,6 +5229,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, if (emulation_type & EMULTYPE_SKIP) { kvm_rip_write(vcpu, ctxt->_eip); + if (ctxt->eflags & X86_EFLAGS_RF) + kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF); return EMULATE_DONE; } -- cgit v1.2.3 From 163b135e7b09e9158f7eb0aa74e716865e3005d2 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 21 Jul 2014 14:37:28 +0300 Subject: KVM: x86: popf emulation should not change RF RFLAGS.RF is always zero after popf. Therefore, popf should not updated RF, as anyhow emulating popf, just as any other instruction should clear RFLAGS.RF. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index dd074106d0c9..cf117bfe9521 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1675,7 +1675,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF - | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID; + | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID; switch(ctxt->mode) { case X86EMUL_MODE_PROT64: -- cgit v1.2.3 From 4467c3f1ad16e3640e2b61e1a5e0bd55281a925d Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 21 Jul 2014 14:37:29 +0300 Subject: KVM: x86: Clear rflags.rf on emulated instructions When an instruction is emulated RFLAGS.RF should be cleared. KVM previously did not do so. This patch clears RFLAGS.RF after interception is done. If a fault occurs during the instruction, RFLAGS.RF will be set by a previous patch. This patch does not handle the case of traps/interrupts during rep-strings. Traps are only expected to occur on debug watchpoints, and those are anyhow not handled by the emulator. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index cf117bfe9521..189b8bd86e31 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4640,6 +4640,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) /* All REP prefixes have the same first termination condition */ if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) { ctxt->eip = ctxt->_eip; + ctxt->eflags &= ~EFLG_RF; goto done; } } @@ -4682,6 +4683,8 @@ special_insn: goto done; } + ctxt->eflags &= ~EFLG_RF; + if (ctxt->execute) { if (ctxt->d & Fastop) { void (*fop)(struct fastop *) = (void *)ctxt->execute; -- cgit v1.2.3 From 6c6cb69b8e974049cca2cc4480052fb9e7df767b Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 21 Jul 2014 14:37:30 +0300 Subject: KVM: x86: Cleanup of rflags.rf cleaning RFLAGS.RF was cleaned in several functions (e.g., syscall) in the x86 emulator. Now that we clear it before the execution of an instruction in the emulator, we can remove the specific cleanup of RFLAGS.RF. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 189b8bd86e31..8d415563e05b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2211,7 +2211,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) *reg_write(ctxt, VCPU_REGS_RCX) = ctxt->_eip; if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 - *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags & ~EFLG_RF; + *reg_write(ctxt, VCPU_REGS_R11) = ctxt->eflags; ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? @@ -2219,14 +2219,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt) ctxt->_eip = msr_data; ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); - ctxt->eflags &= ~(msr_data | EFLG_RF); + ctxt->eflags &= ~msr_data; #endif } else { /* legacy mode */ ops->get_msr(ctxt, MSR_STAR, &msr_data); ctxt->_eip = (u32)msr_data; - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); } return X86EMUL_CONTINUE; @@ -2275,7 +2275,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) break; } - ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); + ctxt->eflags &= ~(EFLG_VM | EFLG_IF); cs_sel = (u16)msr_data; cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; -- cgit v1.2.3 From 4161a569065b17954848069d5209182083ce876b Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 17 Jul 2014 01:19:31 +0300 Subject: KVM: x86: emulator injects #DB when RFLAGS.RF is set If the RFLAGS.RF is set, then no #DB should occur on instruction breakpoints. However, the KVM emulator injects #DB regardless to RFLAGS.RF. This patch fixes this behavior. KVM, however, still appears not to update RFLAGS.RF correctly, regardless of this patch. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1fd806cb96d4..cc4f65f39b87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5158,7 +5158,8 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) } } - if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK)) { + if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && + !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) { dr6 = kvm_vcpu_check_hw_bp(eip, 0, vcpu->arch.dr7, vcpu->arch.db); -- cgit v1.2.3 From c9cdd085bb75226879fd468b88e2e7eb467325b7 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Mon, 21 Jul 2014 14:37:24 +0300 Subject: KVM: x86: Defining missing x86 vectors Defining XE, XM and VE vector numbers. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/uapi/asm/kvm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index d3a87780c70b..d7dcef58aefa 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -23,7 +23,10 @@ #define GP_VECTOR 13 #define PF_VECTOR 14 #define MF_VECTOR 16 +#define AC_VECTOR 17 #define MC_VECTOR 18 +#define XM_VECTOR 19 +#define VE_VECTOR 20 /* Select x86 specific features in */ #define __KVM_HAVE_PIT -- cgit v1.2.3 From 4fa7734c62cdd8c07edd54fa5a5e91482273071a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 17 Jul 2014 12:25:16 +0200 Subject: KVM: nVMX: fix lifetime issues for vmcs02 free_nested needs the loaded_vmcs to be valid if it is a vmcs02, in order to detach it from the shadow vmcs. However, this is not available anymore after commit 26a865f4aa8e (KVM: VMX: fix use after free of vmx->loaded_vmcs, 2014-01-03). Revert that patch, and fix its problem by forcing a vmcs01 as the active VMCS before freeing all the nested VMX state. Reported-by: Wanpeng Li Tested-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7534a9f67cc8..462334eaa3c0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -5772,22 +5772,27 @@ static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr) /* * Free all VMCSs saved for this vcpu, except the one pointed by - * vmx->loaded_vmcs. These include the VMCSs in vmcs02_pool (except the one - * currently used, if running L2), and vmcs01 when running L2. + * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs + * must be &vmx->vmcs01. */ static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx) { struct vmcs02_list *item, *n; + + WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01); list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) { - if (vmx->loaded_vmcs != &item->vmcs02) - free_loaded_vmcs(&item->vmcs02); + /* + * Something will leak if the above WARN triggers. Better than + * a use-after-free. + */ + if (vmx->loaded_vmcs == &item->vmcs02) + continue; + + free_loaded_vmcs(&item->vmcs02); list_del(&item->list); kfree(item); + vmx->nested.vmcs02_num--; } - vmx->nested.vmcs02_num = 0; - - if (vmx->loaded_vmcs != &vmx->vmcs01) - free_loaded_vmcs(&vmx->vmcs01); } /* @@ -7557,13 +7562,31 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } +static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int cpu; + + if (vmx->loaded_vmcs == &vmx->vmcs01) + return; + + cpu = get_cpu(); + vmx->loaded_vmcs = &vmx->vmcs01; + vmx_vcpu_put(vcpu); + vmx_vcpu_load(vcpu, cpu); + vcpu->cpu = cpu; + put_cpu(); +} + static void vmx_free_vcpu(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); free_vpid(vmx); - free_loaded_vmcs(vmx->loaded_vmcs); + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); free_nested(vmx); + free_loaded_vmcs(vmx->loaded_vmcs); kfree(vmx->guest_msrs); kvm_vcpu_uninit(vcpu); kmem_cache_free(kvm_vcpu_cache, vmx); @@ -8721,7 +8744,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, unsigned long exit_qualification) { struct vcpu_vmx *vmx = to_vmx(vcpu); - int cpu; struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* trying to cancel vmlaunch/vmresume is a bug */ @@ -8746,12 +8768,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_intr_error_code, KVM_ISA_VMX); - cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); + vmx_load_vmcs01(vcpu); vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS)); vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS)); -- cgit v1.2.3 From 9a2a05b9ed618b1bb6d4cbec0c2e1f80d6636609 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 17 Jul 2014 11:55:46 +0200 Subject: KVM: nVMX: clean up nested_release_vmcs12 and code around it Make nested_release_vmcs12 idempotent. Tested-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 462334eaa3c0..3300f4f2da48 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6109,20 +6109,27 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) static inline void nested_release_vmcs12(struct vcpu_vmx *vmx) { u32 exec_control; + if (vmx->nested.current_vmptr == -1ull) + return; + + /* current_vmptr and current_vmcs12 are always set/reset together */ + if (WARN_ON(vmx->nested.current_vmcs12 == NULL)) + return; + if (enable_shadow_vmcs) { - if (vmx->nested.current_vmcs12 != NULL) { - /* copy to memory all shadowed fields in case - they were modified */ - copy_shadow_to_vmcs12(vmx); - vmx->nested.sync_shadow_vmcs = false; - exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); - exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; - vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); - vmcs_write64(VMCS_LINK_POINTER, -1ull); - } + /* copy to memory all shadowed fields in case + they were modified */ + copy_shadow_to_vmcs12(vmx); + vmx->nested.sync_shadow_vmcs = false; + exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS; + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); + vmcs_write64(VMCS_LINK_POINTER, -1ull); } kunmap(vmx->nested.current_vmcs12_page); nested_release_page(vmx->nested.current_vmcs12_page); + vmx->nested.current_vmptr = -1ull; + vmx->nested.current_vmcs12 = NULL; } /* @@ -6133,12 +6140,9 @@ static void free_nested(struct vcpu_vmx *vmx) { if (!vmx->nested.vmxon) return; + vmx->nested.vmxon = false; - if (vmx->nested.current_vmptr != -1ull) { - nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } + nested_release_vmcs12(vmx); if (enable_shadow_vmcs) free_vmcs(vmx->nested.current_shadow_vmcs); /* Unpin physical memory we referred to in current vmcs02 */ @@ -6175,11 +6179,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) if (nested_vmx_check_vmptr(vcpu, EXIT_REASON_VMCLEAR, &vmptr)) return 1; - if (vmptr == vmx->nested.current_vmptr) { + if (vmptr == vmx->nested.current_vmptr) nested_release_vmcs12(vmx); - vmx->nested.current_vmptr = -1ull; - vmx->nested.current_vmcs12 = NULL; - } page = nested_get_page(vcpu, vmptr); if (page == NULL) { @@ -6521,9 +6522,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) skip_emulated_instruction(vcpu); return 1; } - if (vmx->nested.current_vmptr != -1ull) - nested_release_vmcs12(vmx); + nested_release_vmcs12(vmx); vmx->nested.current_vmptr = vmptr; vmx->nested.current_vmcs12 = new_vmcs12; vmx->nested.current_vmcs12_page = page; -- cgit v1.2.3 From 6f43ed01e87c8a8dbd8c826eaf0f714c1342c039 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Tue, 15 Jul 2014 17:37:46 +0300 Subject: KVM: x86: DR6/7.RTM cannot be written Haswell and newer Intel CPUs have support for RTM, and in that case DR6.RTM is not fixed to 1 and DR7.RTM is not fixed to zero. That is not the case in the current KVM implementation. This bug is apparent only if the MOV-DR instruction is emulated or the host also debugs the guest. This patch is a partial fix which enables DR6.RTM and DR7.RTM to be cleared and set respectively. It also sets DR6.RTM upon every debug exception. Obviously, it is not a complete fix, as debugging of RTM is still unsupported. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 8 +++++--- arch/x86/kvm/cpuid.h | 8 ++++++++ arch/x86/kvm/vmx.c | 4 ++-- arch/x86/kvm/x86.c | 22 ++++++++++++++++------ 4 files changed, 31 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b8a4480176b9..a84eaf7ba33f 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -152,14 +152,16 @@ enum { #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) -#define DR6_FIXED_1 0xffff0ff0 -#define DR6_VOLATILE 0x0000e00f +#define DR6_RTM (1 << 16) +#define DR6_FIXED_1 0xfffe0ff0 +#define DR6_INIT 0xffff0ff0 +#define DR6_VOLATILE 0x0001e00f #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) #define DR7_FIXED_1 0x00000400 -#define DR7_VOLATILE 0xffff23ff +#define DR7_VOLATILE 0xffff2bff /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index f9087315e0cd..a5380590ab0e 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -95,4 +95,12 @@ static inline bool guest_cpuid_has_gbpages(struct kvm_vcpu *vcpu) best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); return best && (best->edx & bit(X86_FEATURE_GBPAGES)); } + +static inline bool guest_cpuid_has_rtm(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 7, 0); + return best && (best->ebx & bit(X86_FEATURE_RTM)); +} #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3300f4f2da48..fd24f68378a7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4892,7 +4892,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; if (!(dr6 & ~DR6_RESERVED)) /* icebp */ skip_emulated_instruction(vcpu); @@ -5151,7 +5151,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) return 0; } else { vcpu->arch.dr7 &= ~DR7_GD; - vcpu->arch.dr6 |= DR6_BD; + vcpu->arch.dr6 |= DR6_BD | DR6_RTM; vmcs_writel(GUEST_DR7, vcpu->arch.dr7); kvm_queue_exception(vcpu, DB_VECTOR); return 1; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cc4f65f39b87..08c48a3c7c3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -759,6 +759,15 @@ static void kvm_update_dr7(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED; } +static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu) +{ + u64 fixed = DR6_FIXED_1; + + if (!guest_cpuid_has_rtm(vcpu)) + fixed |= DR6_RTM; + return fixed; +} + static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) { switch (dr) { @@ -774,7 +783,7 @@ static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val) case 6: if (val & 0xffffffff00000000ULL) return -1; /* #GP */ - vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; + vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu); kvm_update_dr6(vcpu); break; case 5: @@ -5115,7 +5124,8 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag */ if (unlikely(rflags & X86_EFLAGS_TF)) { if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { - kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | + DR6_RTM; kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip; kvm_run->debug.arch.exception = DB_VECTOR; kvm_run->exit_reason = KVM_EXIT_DEBUG; @@ -5128,7 +5138,7 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag * cleared by the processor". */ vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= DR6_BS; + vcpu->arch.dr6 |= DR6_BS | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); } } @@ -5147,7 +5157,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) vcpu->arch.eff_db); if (dr6 != 0) { - kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1; + kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM; kvm_run->debug.arch.pc = kvm_rip_read(vcpu) + get_segment_base(vcpu, VCPU_SREG_CS); @@ -5166,7 +5176,7 @@ static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) if (dr6 != 0) { vcpu->arch.dr6 &= ~15; - vcpu->arch.dr6 |= dr6; + vcpu->arch.dr6 |= dr6 | DR6_RTM; kvm_queue_exception(vcpu, DB_VECTOR); *r = EMULATE_DONE; return true; @@ -6866,7 +6876,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu) kvm_clear_exception_queue(vcpu); memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); - vcpu->arch.dr6 = DR6_FIXED_1; + vcpu->arch.dr6 = DR6_INIT; kvm_update_dr6(vcpu); vcpu->arch.dr7 = DR7_FIXED_1; kvm_update_dr7(vcpu); -- cgit v1.2.3 From b9a1ecb909e8f772934cc4bf1f164124c9fbb0d0 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 24 Jul 2014 14:51:23 +0300 Subject: KVM: x86: Setting rflags.rf during rep-string emulation This patch updates RF for rep-string emulation. The flag is set upon the first iteration, and cleared after the last (if emulated). It is intended to make sure that if a trap (in future data/io #DB emulation) or interrupt is delivered to the guest during the rep-string instruction, RF will be set correctly. RF affects whether instruction breakpoint in the guest is masked. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8d415563e05b..57743edfb4e2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4683,7 +4683,10 @@ special_insn: goto done; } - ctxt->eflags &= ~EFLG_RF; + if (ctxt->rep_prefix && (ctxt->d & String)) + ctxt->eflags |= EFLG_RF; + else + ctxt->eflags &= ~EFLG_RF; if (ctxt->execute) { if (ctxt->d & Fastop) { @@ -4824,6 +4827,7 @@ writeback: } goto done; /* skip rip writeback */ } + ctxt->eflags &= ~EFLG_RF; } ctxt->eip = ctxt->_eip; -- cgit v1.2.3 From d6e8c8545651b05a86c5b9d29d2fe11ad4cbb9aa Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 24 Jul 2014 14:51:24 +0300 Subject: KVM: x86: set rflags.rf during fault injection x86 does not automatically set rflags.rf during event injection. This patch does partial job, setting rflags.rf upon fault injection. It does not handle the setting of RF upon interrupt injection on rep-string instruction. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08c48a3c7c3a..439f96bf424d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -311,6 +311,31 @@ static int exception_class(int vector) return EXCPT_BENIGN; } +#define EXCPT_FAULT 0 +#define EXCPT_TRAP 1 +#define EXCPT_ABORT 2 +#define EXCPT_INTERRUPT 3 + +static int exception_type(int vector) +{ + unsigned int mask; + + if (WARN_ON(vector > 31 || vector == NMI_VECTOR)) + return EXCPT_INTERRUPT; + + mask = 1 << vector; + + /* #DB is trap, as instruction watchpoints are handled elsewhere */ + if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR))) + return EXCPT_TRAP; + + if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR))) + return EXCPT_ABORT; + + /* Reserved exceptions will result in fault */ + return EXCPT_FAULT; +} + static void kvm_multiple_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error, u32 error_code, bool reinject) @@ -5893,6 +5918,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win) trace_kvm_inj_exception(vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code); + + if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT) + __kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) | + X86_EFLAGS_RF); + kvm_x86_ops->queue_exception(vcpu, vcpu->arch.exception.nr, vcpu->arch.exception.has_error_code, vcpu->arch.exception.error_code, -- cgit v1.2.3 From 0123be429fef40f067e5b1811576c3994229f59e Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Thu, 24 Jul 2014 15:06:56 +0300 Subject: KVM: x86: Assertions to check no overrun in MSR lists Currently there is no check whether shared MSRs list overrun the allocated size which can results in bugs. In addition there is no check that vmx->guest_msrs has sufficient space to accommodate all the VMX msrs. This patch adds the assertions. Signed-off-by: Nadav Amit Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 2 ++ arch/x86/kvm/x86.c | 1 + 2 files changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fd24f68378a7..3397a88b7463 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7608,6 +7608,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + BUILD_BUG_ON(PAGE_SIZE / sizeof(struct shared_msr_entry) < NR_VMX_MSR); + err = -ENOMEM; if (!vmx->guest_msrs) { goto uninit_vcpu; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 439f96bf424d..d38abc81db65 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -212,6 +212,7 @@ static void shared_msr_update(unsigned slot, u32 msr) void kvm_define_shared_msr(unsigned slot, u32 msr) { + BUG_ON(slot >= KVM_NR_SHARED_MSRS); if (slot >= shared_msrs_global.nr) shared_msrs_global.nr = slot + 1; shared_msrs_global.msrs[slot] = msr; -- cgit v1.2.3 From 03916db9348c079d8d214f971cc114bb51c6b869 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 24 Jul 2014 14:21:57 +0200 Subject: Replace NR_VMX_MSR with its definition Using ARRAY_SIZE directly makes it easier to read the code. While touching the code, replace the division by a multiplication in the recently added BUILD_BUG_ON. Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 3397a88b7463..a3845b8e2b80 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -823,7 +823,6 @@ static const u32 vmx_msr_index[] = { #endif MSR_EFER, MSR_TSC_AUX, MSR_STAR, }; -#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) static inline bool is_page_fault(u32 intr_info) { @@ -4441,7 +4440,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmx->vcpu.arch.pat = host_pat; } - for (i = 0; i < NR_VMX_MSR; ++i) { + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) { u32 index = vmx_msr_index[i]; u32 data_low, data_high; int j = vmx->nmsrs; @@ -7608,7 +7607,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); - BUILD_BUG_ON(PAGE_SIZE / sizeof(struct shared_msr_entry) < NR_VMX_MSR); + BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0]) + > PAGE_SIZE); err = -ENOMEM; if (!vmx->guest_msrs) { @@ -8960,7 +8960,7 @@ static int __init vmx_init(void) rdmsrl_safe(MSR_EFER, &host_efer); - for (i = 0; i < NR_VMX_MSR; ++i) + for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); -- cgit v1.2.3 From b55a8144d1807f9e74c51cb584f0dd198483d86c Mon Sep 17 00:00:00 2001 From: Mark Rustad Date: Fri, 25 Jul 2014 06:27:05 -0700 Subject: x86/kvm: Resolve shadow warning from min macro Resolve a shadow warning generated in W=2 builds by the nested use of the min macro by instead using the min3 macro for the minimum of 3 values. Signed-off-by: Mark Rustad Signed-off-by: Jeff Kirsher Signed-off-by: Paolo Bonzini --- arch/x86/kvm/emulate.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 57743edfb4e2..56657b0bb3bb 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -1317,8 +1317,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, in_page = (ctxt->eflags & EFLG_DF) ? offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) : PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)); - n = min(min(in_page, (unsigned int)sizeof(rc->data)) / size, - count); + n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count); if (n == 0) n = 1; rc->pos = rc->end = 0; -- cgit v1.2.3 From 296f047502f1b3ddfd63adbc192624ce80740081 Mon Sep 17 00:00:00 2001 From: Chris J Arges Date: Tue, 29 Jul 2014 16:14:10 -0500 Subject: KVM: vmx: remove duplicate vmx_mpx_supported() prototype Remove a prototype which was added by both 93c4adc7afe and 36be0b9deb2. Signed-off-by: Chris J Arges Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a3845b8e2b80..e618f34bde2d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -743,7 +743,6 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var); static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu); static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx); static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx); -static bool vmx_mpx_supported(void); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); -- cgit v1.2.3 From 42cbc04fd3b5e3f9b011bf9fa3ce0b3d1e10b58b Mon Sep 17 00:00:00 2001 From: Mark D Rustad Date: Wed, 30 Jul 2014 14:19:26 -0700 Subject: x86/kvm: Resolve shadow warnings in macro expansion Resolve shadow warnings that appear in W=2 builds. Instead of using ret to hold the return pointer, save the length in a new variable saved_len and compute the pointer on exit. This also resolves a very technical error, in that ret was declared as a const char *, when it really was a char * const. Signed-off-by: Mark Rustad Signed-off-by: Jeff Kirsher Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmutrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h index 9d2e0ffcb190..5aaf35641768 100644 --- a/arch/x86/kvm/mmutrace.h +++ b/arch/x86/kvm/mmutrace.h @@ -22,7 +22,7 @@ __entry->unsync = sp->unsync; #define KVM_MMU_PAGE_PRINTK() ({ \ - const char *ret = p->buffer + p->len; \ + const u32 saved_len = p->len; \ static const char *access_str[] = { \ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" \ }; \ @@ -41,7 +41,7 @@ role.nxe ? "" : "!", \ __entry->root_count, \ __entry->unsync ? "unsync" : "sync", 0); \ - ret; \ + p->buffer + saved_len; \ }) #define kvm_mmu_trace_pferr_flags \ -- cgit v1.2.3