From a9810327726b01404ecde082c075a7468c433ddf Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 29 Jan 2018 12:22:45 +0100 Subject: KVM: s390: optimize wakeup for exitless interrupts For interrupt injection of floating interrupts we queue the interrupt either in the GISA or in the floating interrupt list. The first CPU that looks at these data structures - either in KVM code or hardware will then deliver that interrupt. To minimize latency we also: -a: choose a VCPU to deliver that interrupt. We prefer idle CPUs -b: we wake up the host thread that runs the VCPU -c: set an I/O intervention bit for that CPU so that it exits guest context as soon as the PSW I/O mask is enabled This will make sure that this CPU will execute the interrupt delivery code of KVM very soon. We can now optimize the injection case if we have exitless interrupts. The wakeup is still necessary in case the target CPU sleeps. We can avoid the I/O intervention request bit though. Whenever this intervention request would be handled, the hardware could also directly inject the interrupt on that CPU, no need to go through the interrupt injection loop of KVM. Cc: Michael Mueller Reviewed-by: Halil Pasic Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index aabf46f5f883..337a69bc04db 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -1701,7 +1701,8 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type) kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_STOP_INT); break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); + if (!(type & KVM_S390_INT_IO_AI_MASK && kvm->arch.gisa)) + kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_IO_INT); break; default: kvm_s390_set_cpuflags(dst_vcpu, CPUSTAT_EXT_INT); -- cgit v1.2.3 From 8846f3175c6bf16382b06a4b9755e5296c0f921c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 12 Feb 2018 12:33:39 +0000 Subject: KVM: s390: do not set intervention requests for GISA interrupts If GISA is available, we do not have to kick CPUs out of SIE to deliver interrupts. The hardware can deliver such interrupts while running. Cc: Michael Mueller Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 337a69bc04db..e399495001ca 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -236,10 +236,15 @@ static inline int kvm_s390_gisa_tac_ipm_gisc(struct kvm_s390_gisa *gisa, u32 gis return test_and_clear_bit_inv(IPM_BIT_OFFSET + gisc, (unsigned long *) gisa); } -static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) +static inline unsigned long pending_irqs_no_gisa(struct kvm_vcpu *vcpu) { return vcpu->kvm->arch.float_int.pending_irqs | - vcpu->arch.local_int.pending_irqs | + vcpu->arch.local_int.pending_irqs; +} + +static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu) +{ + return pending_irqs_no_gisa(vcpu) | kvm_s390_gisa_get_ipm(vcpu->kvm->arch.gisa) << IRQ_PEND_IO_ISC_7; } @@ -337,7 +342,7 @@ static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) static void set_intercept_indicators_io(struct kvm_vcpu *vcpu) { - if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK)) + if (!(pending_irqs_no_gisa(vcpu) & IRQ_PEND_IO_MASK)) return; else if (psw_ioint_disabled(vcpu)) kvm_s390_set_cpuflags(vcpu, CPUSTAT_IO_INT); -- cgit v1.2.3 From f315104ad8b0c32be13eac628569ae707c332cb5 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 13 Feb 2018 13:55:49 +0000 Subject: KVM: s390: force bp isolation for VSIE If the guest runs with bp isolation when doing a SIE instruction, we must also run the nested guest with bp isolation when emulating that SIE instruction. This is done by activating BPBC in the lpar, which acts as an override for lower level guests. Signed-off-by: Christian Borntraeger Reviewed-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/vsie.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index ec772700ff96..8961e3970901 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -821,6 +821,7 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; + int guest_bp_isolation; int rc; handle_last_fault(vcpu, vsie_page); @@ -831,6 +832,20 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) s390_handle_mcck(); srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + + /* save current guest state of bp isolation override */ + guest_bp_isolation = test_thread_flag(TIF_ISOLATE_BP_GUEST); + + /* + * The guest is running with BPBC, so we have to force it on for our + * nested guest. This is done by enabling BPBC globally, so the BPBC + * control in the SCB (which the nested guest can modify) is simply + * ignored. + */ + if (test_kvm_facility(vcpu->kvm, 82) && + vcpu->arch.sie_block->fpf & FPF_BPBC) + set_thread_flag(TIF_ISOLATE_BP_GUEST); + local_irq_disable(); guest_enter_irqoff(); local_irq_enable(); @@ -840,6 +855,11 @@ static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) local_irq_disable(); guest_exit_irqoff(); local_irq_enable(); + + /* restore guest state for bp isolation override */ + if (!guest_bp_isolation) + clear_thread_flag(TIF_ISOLATE_BP_GUEST); + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); if (rc == -EINTR) { -- cgit v1.2.3 From 6db4263fec9e550e0cdaed732f4af77a44c10f5f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 8 Apr 2016 17:52:39 +0200 Subject: KVM: s390: use switch vs jump table in priv.c Instead of having huge jump tables for function selection, let's use normal switch/case statements for the instruction handlers in priv.c This allows the compiler to make the right decision depending on the situation (e.g. avoid jump-tables for thunks). Signed-off-by: Christian Borntraeger Reviewed-by: Cornelia Huck Reviewed-by: Janosch Frank Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger --- arch/s390/kvm/priv.c | 183 +++++++++++++++++++++++++-------------------------- 1 file changed, 91 insertions(+), 92 deletions(-) diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index c4c4e157c036..a74578cdd3f3 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -795,55 +795,60 @@ out: return rc; } -static const intercept_handler_t b2_handlers[256] = { - [0x02] = handle_stidp, - [0x04] = handle_set_clock, - [0x10] = handle_set_prefix, - [0x11] = handle_store_prefix, - [0x12] = handle_store_cpu_address, - [0x14] = kvm_s390_handle_vsie, - [0x21] = handle_ipte_interlock, - [0x29] = handle_iske, - [0x2a] = handle_rrbe, - [0x2b] = handle_sske, - [0x2c] = handle_test_block, - [0x30] = handle_io_inst, - [0x31] = handle_io_inst, - [0x32] = handle_io_inst, - [0x33] = handle_io_inst, - [0x34] = handle_io_inst, - [0x35] = handle_io_inst, - [0x36] = handle_io_inst, - [0x37] = handle_io_inst, - [0x38] = handle_io_inst, - [0x39] = handle_io_inst, - [0x3a] = handle_io_inst, - [0x3b] = handle_io_inst, - [0x3c] = handle_io_inst, - [0x50] = handle_ipte_interlock, - [0x56] = handle_sthyi, - [0x5f] = handle_io_inst, - [0x74] = handle_io_inst, - [0x76] = handle_io_inst, - [0x7d] = handle_stsi, - [0xb1] = handle_stfl, - [0xb2] = handle_lpswe, -}; - int kvm_s390_handle_b2(struct kvm_vcpu *vcpu) { - intercept_handler_t handler; - - /* - * A lot of B2 instructions are priviledged. Here we check for - * the privileged ones, that we can handle in the kernel. - * Anything else goes to userspace. - */ - handler = b2_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; - if (handler) - return handler(vcpu); - - return -EOPNOTSUPP; + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x02: + return handle_stidp(vcpu); + case 0x04: + return handle_set_clock(vcpu); + case 0x10: + return handle_set_prefix(vcpu); + case 0x11: + return handle_store_prefix(vcpu); + case 0x12: + return handle_store_cpu_address(vcpu); + case 0x14: + return kvm_s390_handle_vsie(vcpu); + case 0x21: + case 0x50: + return handle_ipte_interlock(vcpu); + case 0x29: + return handle_iske(vcpu); + case 0x2a: + return handle_rrbe(vcpu); + case 0x2b: + return handle_sske(vcpu); + case 0x2c: + return handle_test_block(vcpu); + case 0x30: + case 0x31: + case 0x32: + case 0x33: + case 0x34: + case 0x35: + case 0x36: + case 0x37: + case 0x38: + case 0x39: + case 0x3a: + case 0x3b: + case 0x3c: + case 0x5f: + case 0x74: + case 0x76: + return handle_io_inst(vcpu); + case 0x56: + return handle_sthyi(vcpu); + case 0x7d: + return handle_stsi(vcpu); + case 0xb1: + return handle_stfl(vcpu); + case 0xb2: + return handle_lpswe(vcpu); + default: + return -EOPNOTSUPP; + } } static int handle_epsw(struct kvm_vcpu *vcpu) @@ -1105,25 +1110,22 @@ static int handle_essa(struct kvm_vcpu *vcpu) return 0; } -static const intercept_handler_t b9_handlers[256] = { - [0x8a] = handle_ipte_interlock, - [0x8d] = handle_epsw, - [0x8e] = handle_ipte_interlock, - [0x8f] = handle_ipte_interlock, - [0xab] = handle_essa, - [0xaf] = handle_pfmf, -}; - int kvm_s390_handle_b9(struct kvm_vcpu *vcpu) { - intercept_handler_t handler; - - /* This is handled just as for the B2 instructions. */ - handler = b9_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; - if (handler) - return handler(vcpu); - - return -EOPNOTSUPP; + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x8a: + case 0x8e: + case 0x8f: + return handle_ipte_interlock(vcpu); + case 0x8d: + return handle_epsw(vcpu); + case 0xab: + return handle_essa(vcpu); + case 0xaf: + return handle_pfmf(vcpu); + default: + return -EOPNOTSUPP; + } } int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) @@ -1271,22 +1273,20 @@ static int handle_stctg(struct kvm_vcpu *vcpu) return rc ? kvm_s390_inject_prog_cond(vcpu, rc) : 0; } -static const intercept_handler_t eb_handlers[256] = { - [0x2f] = handle_lctlg, - [0x25] = handle_stctg, - [0x60] = handle_ri, - [0x61] = handle_ri, - [0x62] = handle_ri, -}; - int kvm_s390_handle_eb(struct kvm_vcpu *vcpu) { - intercept_handler_t handler; - - handler = eb_handlers[vcpu->arch.sie_block->ipb & 0xff]; - if (handler) - return handler(vcpu); - return -EOPNOTSUPP; + switch (vcpu->arch.sie_block->ipb & 0x000000ff) { + case 0x25: + return handle_stctg(vcpu); + case 0x2f: + return handle_lctlg(vcpu); + case 0x60: + case 0x61: + case 0x62: + return handle_ri(vcpu); + default: + return -EOPNOTSUPP; + } } static int handle_tprot(struct kvm_vcpu *vcpu) @@ -1346,10 +1346,12 @@ out_unlock: int kvm_s390_handle_e5(struct kvm_vcpu *vcpu) { - /* For e5xx... instructions we only handle TPROT */ - if ((vcpu->arch.sie_block->ipa & 0x00ff) == 0x01) + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x01: return handle_tprot(vcpu); - return -EOPNOTSUPP; + default: + return -EOPNOTSUPP; + } } static int handle_sckpf(struct kvm_vcpu *vcpu) @@ -1380,17 +1382,14 @@ static int handle_ptff(struct kvm_vcpu *vcpu) return 0; } -static const intercept_handler_t x01_handlers[256] = { - [0x04] = handle_ptff, - [0x07] = handle_sckpf, -}; - int kvm_s390_handle_01(struct kvm_vcpu *vcpu) { - intercept_handler_t handler; - - handler = x01_handlers[vcpu->arch.sie_block->ipa & 0x00ff]; - if (handler) - return handler(vcpu); - return -EOPNOTSUPP; + switch (vcpu->arch.sie_block->ipa & 0x00ff) { + case 0x04: + return handle_ptff(vcpu); + case 0x07: + return handle_sckpf(vcpu); + default: + return -EOPNOTSUPP; + } } -- cgit v1.2.3 From cb7485da3ed1ac4ef6c71d4b2b715f8b87f118c8 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 6 Feb 2018 10:19:28 +0000 Subject: KVM: s390: use switch vs jump table in intercept.c Instead of having huge jump tables for function selection, let's use normal switch/case statements for the instruction handlers in intercept.c We can now also get rid of intercept_handler_t. This allows the compiler to make the right decision depending on the situation (e.g. avoid jump-tables for thunks). Signed-off-by: Christian Borntraeger Reviewed-by: Janosch Frank Reviewed-by: David Hildenbrand Reviewed-by: Cornelia Huck Signed-off-by: Christian Borntraeger --- arch/s390/kvm/intercept.c | 51 +++++++++++++++++++++++++++-------------------- arch/s390/kvm/kvm-s390.h | 2 -- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 9c7d70715862..07c6e81163bf 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -22,22 +22,6 @@ #include "trace.h" #include "trace-s390.h" - -static const intercept_handler_t instruction_handlers[256] = { - [0x01] = kvm_s390_handle_01, - [0x82] = kvm_s390_handle_lpsw, - [0x83] = kvm_s390_handle_diag, - [0xaa] = kvm_s390_handle_aa, - [0xae] = kvm_s390_handle_sigp, - [0xb2] = kvm_s390_handle_b2, - [0xb6] = kvm_s390_handle_stctl, - [0xb7] = kvm_s390_handle_lctl, - [0xb9] = kvm_s390_handle_b9, - [0xe3] = kvm_s390_handle_e3, - [0xe5] = kvm_s390_handle_e5, - [0xeb] = kvm_s390_handle_eb, -}; - u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu) { struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block; @@ -129,16 +113,39 @@ static int handle_validity(struct kvm_vcpu *vcpu) static int handle_instruction(struct kvm_vcpu *vcpu) { - intercept_handler_t handler; - vcpu->stat.exit_instruction++; trace_kvm_s390_intercept_instruction(vcpu, vcpu->arch.sie_block->ipa, vcpu->arch.sie_block->ipb); - handler = instruction_handlers[vcpu->arch.sie_block->ipa >> 8]; - if (handler) - return handler(vcpu); - return -EOPNOTSUPP; + + switch (vcpu->arch.sie_block->ipa >> 8) { + case 0x01: + return kvm_s390_handle_01(vcpu); + case 0x82: + return kvm_s390_handle_lpsw(vcpu); + case 0x83: + return kvm_s390_handle_diag(vcpu); + case 0xaa: + return kvm_s390_handle_aa(vcpu); + case 0xae: + return kvm_s390_handle_sigp(vcpu); + case 0xb2: + return kvm_s390_handle_b2(vcpu); + case 0xb6: + return kvm_s390_handle_stctl(vcpu); + case 0xb7: + return kvm_s390_handle_lctl(vcpu); + case 0xb9: + return kvm_s390_handle_b9(vcpu); + case 0xe3: + return kvm_s390_handle_e3(vcpu); + case 0xe5: + return kvm_s390_handle_e5(vcpu); + case 0xeb: + return kvm_s390_handle_eb(vcpu); + default: + return -EOPNOTSUPP; + } } static int inject_prog_on_prog_intercept(struct kvm_vcpu *vcpu) diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index bd31b37b0e6f..3c0a975c2477 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -19,8 +19,6 @@ #include #include -typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); - /* Transactional Memory Execution related macros */ #define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 -- cgit v1.2.3 From baabee67f4135e3de87bc874929ac50637aacb0d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 6 Feb 2018 15:17:43 +0100 Subject: KVM: s390: use switch vs jump table in interrupt.c Just like for the interception handlers, let's also use a switch-case in our interrupt delivery code. Signed-off-by: David Hildenbrand Message-Id: <20180206141743.24497-1-david@redhat.com> Reviewed-by: Cornelia Huck Reviewed-by: Janosch Frank Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 84 ++++++++++++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 34 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index e399495001ca..3f2c49b1a393 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -187,12 +187,6 @@ static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu) return kvm_s390_get_cpu_timer(vcpu) >> 63; } -static inline int is_ioirq(unsigned long irq_type) -{ - return ((irq_type >= IRQ_PEND_IO_ISC_7) && - (irq_type <= IRQ_PEND_IO_ISC_0)); -} - static uint64_t isc_to_isc_bits(int isc) { return (0x80 >> isc) << 24; @@ -1016,24 +1010,6 @@ out: return rc; } -typedef int (*deliver_irq_t)(struct kvm_vcpu *vcpu); - -static const deliver_irq_t deliver_irq_funcs[] = { - [IRQ_PEND_MCHK_EX] = __deliver_machine_check, - [IRQ_PEND_MCHK_REP] = __deliver_machine_check, - [IRQ_PEND_PROG] = __deliver_prog, - [IRQ_PEND_EXT_EMERGENCY] = __deliver_emergency_signal, - [IRQ_PEND_EXT_EXTERNAL] = __deliver_external_call, - [IRQ_PEND_EXT_CLOCK_COMP] = __deliver_ckc, - [IRQ_PEND_EXT_CPU_TIMER] = __deliver_cpu_timer, - [IRQ_PEND_RESTART] = __deliver_restart, - [IRQ_PEND_SET_PREFIX] = __deliver_set_prefix, - [IRQ_PEND_PFAULT_INIT] = __deliver_pfault_init, - [IRQ_PEND_EXT_SERVICE] = __deliver_service, - [IRQ_PEND_PFAULT_DONE] = __deliver_pfault_done, - [IRQ_PEND_VIRTIO] = __deliver_virtio, -}; - /* Check whether an external call is pending (deliverable or not) */ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu) { @@ -1197,7 +1173,6 @@ void kvm_s390_clear_local_irqs(struct kvm_vcpu *vcpu) int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - deliver_irq_t func; int rc = 0; unsigned long irq_type; unsigned long irqs; @@ -1217,16 +1192,57 @@ int __must_check kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu) while ((irqs = deliverable_irqs(vcpu)) && !rc) { /* bits are in the reverse order of interrupt priority */ irq_type = find_last_bit(&irqs, IRQ_PEND_COUNT); - if (is_ioirq(irq_type)) { + switch (irq_type) { + case IRQ_PEND_IO_ISC_0: + case IRQ_PEND_IO_ISC_1: + case IRQ_PEND_IO_ISC_2: + case IRQ_PEND_IO_ISC_3: + case IRQ_PEND_IO_ISC_4: + case IRQ_PEND_IO_ISC_5: + case IRQ_PEND_IO_ISC_6: + case IRQ_PEND_IO_ISC_7: rc = __deliver_io(vcpu, irq_type); - } else { - func = deliver_irq_funcs[irq_type]; - if (!func) { - WARN_ON_ONCE(func == NULL); - clear_bit(irq_type, &li->pending_irqs); - continue; - } - rc = func(vcpu); + break; + case IRQ_PEND_MCHK_EX: + case IRQ_PEND_MCHK_REP: + rc = __deliver_machine_check(vcpu); + break; + case IRQ_PEND_PROG: + rc = __deliver_prog(vcpu); + break; + case IRQ_PEND_EXT_EMERGENCY: + rc = __deliver_emergency_signal(vcpu); + break; + case IRQ_PEND_EXT_EXTERNAL: + rc = __deliver_external_call(vcpu); + break; + case IRQ_PEND_EXT_CLOCK_COMP: + rc = __deliver_ckc(vcpu); + break; + case IRQ_PEND_EXT_CPU_TIMER: + rc = __deliver_cpu_timer(vcpu); + break; + case IRQ_PEND_RESTART: + rc = __deliver_restart(vcpu); + break; + case IRQ_PEND_SET_PREFIX: + rc = __deliver_set_prefix(vcpu); + break; + case IRQ_PEND_PFAULT_INIT: + rc = __deliver_pfault_init(vcpu); + break; + case IRQ_PEND_EXT_SERVICE: + rc = __deliver_service(vcpu); + break; + case IRQ_PEND_PFAULT_DONE: + rc = __deliver_pfault_done(vcpu); + break; + case IRQ_PEND_VIRTIO: + rc = __deliver_virtio(vcpu); + break; + default: + WARN_ONCE(1, "Unknown pending irq type %ld", irq_type); + clear_bit(irq_type, &li->pending_irqs); } } -- cgit v1.2.3 From d60d8b64280c8b36c085eda7821585c1ce911795 Mon Sep 17 00:00:00 2001 From: Christoffer Dall Date: Fri, 26 Jan 2018 16:06:51 +0100 Subject: KVM: arm/arm64: Fix arch timers with userspace irqchips When introducing support for irqchip in userspace we needed a way to mask the timer signal to prevent the guest continuously exiting due to a screaming timer. We did this by disabling the corresponding percpu interrupt on the host interrupt controller, because we cannot rely on the host system having a GIC, and therefore cannot make any assumptions about having an active state to hide the timer signal. Unfortunately, when introducing this feature, it became entirely possible that a VCPU which belongs to a VM that has a userspace irqchip can disable the vtimer irq on the host on some physical CPU, and then go away without ever enabling the vtimer irq on that physical CPU again. This means that using irqchips in userspace on a system that also supports running VMs with an in-kernel GIC can prevent forward progress from in-kernel GIC VMs. Later on, when we started taking virtual timer interrupts in the arch timer code, we would also leave this timer state active for userspace irqchip VMs, because we leave it up to a VGIC-enabled guest to deactivate the hardware IRQ using the HW bit in the LR. Both issues are solved by only using the enable/disable trick on systems that do not have a host GIC which supports the active state, because all VMs on such systems must use irqchips in userspace. Systems that have a working GIC with support for an active state use the active state to mask the timer signal for both userspace and in-kernel irqchips. Cc: Alexander Graf Reviewed-by: Marc Zyngier Cc: # v4.12+ Fixes: d9e139778376 ("KVM: arm/arm64: Support arch timers with a userspace gic") Signed-off-by: Christoffer Dall --- virt/kvm/arm/arch_timer.c | 116 +++++++++++++++++++++++++--------------------- 1 file changed, 64 insertions(+), 52 deletions(-) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 70268c0bec79..70f4c30918eb 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -36,6 +36,8 @@ static struct timecounter *timecounter; static unsigned int host_vtimer_irq; static u32 host_vtimer_irq_flags; +static DEFINE_STATIC_KEY_FALSE(has_gic_active_state); + static const struct kvm_irq_level default_ptimer_irq = { .irq = 30, .level = 1, @@ -56,6 +58,12 @@ u64 kvm_phys_timer_read(void) return timecounter->cc->read(timecounter->cc); } +static inline bool userspace_irqchip(struct kvm *kvm) +{ + return static_branch_unlikely(&userspace_irqchip_in_use) && + unlikely(!irqchip_in_kernel(kvm)); +} + static void soft_timer_start(struct hrtimer *hrt, u64 ns) { hrtimer_start(hrt, ktime_add_ns(ktime_get(), ns), @@ -69,25 +77,6 @@ static void soft_timer_cancel(struct hrtimer *hrt, struct work_struct *work) cancel_work_sync(work); } -static void kvm_vtimer_update_mask_user(struct kvm_vcpu *vcpu) -{ - struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - - /* - * When using a userspace irqchip with the architected timers, we must - * prevent continuously exiting from the guest, and therefore mask the - * physical interrupt by disabling it on the host interrupt controller - * when the virtual level is high, such that the guest can make - * forward progress. Once we detect the output level being - * de-asserted, we unmask the interrupt again so that we exit from the - * guest when the timer fires. - */ - if (vtimer->irq.level) - disable_percpu_irq(host_vtimer_irq); - else - enable_percpu_irq(host_vtimer_irq, 0); -} - static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) { struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id; @@ -106,9 +95,9 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id) if (kvm_timer_should_fire(vtimer)) kvm_timer_update_irq(vcpu, true, vtimer); - if (static_branch_unlikely(&userspace_irqchip_in_use) && - unlikely(!irqchip_in_kernel(vcpu->kvm))) - kvm_vtimer_update_mask_user(vcpu); + if (userspace_irqchip(vcpu->kvm) && + !static_branch_unlikely(&has_gic_active_state)) + disable_percpu_irq(host_vtimer_irq); return IRQ_HANDLED; } @@ -290,8 +279,7 @@ static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level); - if (!static_branch_unlikely(&userspace_irqchip_in_use) || - likely(irqchip_in_kernel(vcpu->kvm))) { + if (!userspace_irqchip(vcpu->kvm)) { ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level, @@ -350,12 +338,6 @@ static void kvm_timer_update_state(struct kvm_vcpu *vcpu) phys_timer_emulate(vcpu); } -static void __timer_snapshot_state(struct arch_timer_context *timer) -{ - timer->cnt_ctl = read_sysreg_el0(cntv_ctl); - timer->cnt_cval = read_sysreg_el0(cntv_cval); -} - static void vtimer_save_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; @@ -367,8 +349,10 @@ static void vtimer_save_state(struct kvm_vcpu *vcpu) if (!vtimer->loaded) goto out; - if (timer->enabled) - __timer_snapshot_state(vtimer); + if (timer->enabled) { + vtimer->cnt_ctl = read_sysreg_el0(cntv_ctl); + vtimer->cnt_cval = read_sysreg_el0(cntv_cval); + } /* Disable the virtual timer */ write_sysreg_el0(0, cntv_ctl); @@ -460,23 +444,43 @@ static void set_cntvoff(u64 cntvoff) kvm_call_hyp(__kvm_timer_set_cntvoff, low, high); } -static void kvm_timer_vcpu_load_vgic(struct kvm_vcpu *vcpu) +static inline void set_vtimer_irq_phys_active(struct kvm_vcpu *vcpu, bool active) +{ + int r; + r = irq_set_irqchip_state(host_vtimer_irq, IRQCHIP_STATE_ACTIVE, active); + WARN_ON(r); +} + +static void kvm_timer_vcpu_load_gic(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); bool phys_active; - int ret; - phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); - - ret = irq_set_irqchip_state(host_vtimer_irq, - IRQCHIP_STATE_ACTIVE, - phys_active); - WARN_ON(ret); + if (irqchip_in_kernel(vcpu->kvm)) + phys_active = kvm_vgic_map_is_active(vcpu, vtimer->irq.irq); + else + phys_active = vtimer->irq.level; + set_vtimer_irq_phys_active(vcpu, phys_active); } -static void kvm_timer_vcpu_load_user(struct kvm_vcpu *vcpu) +static void kvm_timer_vcpu_load_nogic(struct kvm_vcpu *vcpu) { - kvm_vtimer_update_mask_user(vcpu); + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + /* + * When using a userspace irqchip with the architected timers and a + * host interrupt controller that doesn't support an active state, we + * must still prevent continuously exiting from the guest, and + * therefore mask the physical interrupt by disabling it on the host + * interrupt controller when the virtual level is high, such that the + * guest can make forward progress. Once we detect the output level + * being de-asserted, we unmask the interrupt again so that we exit + * from the guest when the timer fires. + */ + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) @@ -487,10 +491,10 @@ void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu) if (unlikely(!timer->enabled)) return; - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) - kvm_timer_vcpu_load_user(vcpu); + if (static_branch_likely(&has_gic_active_state)) + kvm_timer_vcpu_load_gic(vcpu); else - kvm_timer_vcpu_load_vgic(vcpu); + kvm_timer_vcpu_load_nogic(vcpu); set_cntvoff(vtimer->cntvoff); @@ -555,18 +559,24 @@ static void unmask_vtimer_irq_user(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); - if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { - __timer_snapshot_state(vtimer); - if (!kvm_timer_should_fire(vtimer)) { - kvm_timer_update_irq(vcpu, false, vtimer); - kvm_vtimer_update_mask_user(vcpu); - } + if (!kvm_timer_should_fire(vtimer)) { + kvm_timer_update_irq(vcpu, false, vtimer); + if (static_branch_likely(&has_gic_active_state)) + set_vtimer_irq_phys_active(vcpu, false); + else + enable_percpu_irq(host_vtimer_irq, host_vtimer_irq_flags); } } void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) { - unmask_vtimer_irq_user(vcpu); + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + if (unlikely(!timer->enabled)) + return; + + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + unmask_vtimer_irq_user(vcpu); } int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu) @@ -753,6 +763,8 @@ int kvm_timer_hyp_init(bool has_gic) kvm_err("kvm_arch_timer: error setting vcpu affinity\n"); goto out_free_irq; } + + static_branch_enable(&has_gic_active_state); } kvm_info("virtual timer IRQ%d\n", host_vtimer_irq); -- cgit v1.2.3 From 67870eb1204223598ea6d8a4467b482e9f5875b5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Feb 2018 16:07:34 +0100 Subject: ARM: kvm: fix building with gcc-8 In banked-sr.c, we use a top-level '__asm__(".arch_extension virt")' statement to allow compilation of a multi-CPU kernel for ARMv6 and older ARMv7-A that don't normally support access to the banked registers. This is considered to be a programming error by the gcc developers and will no longer work in gcc-8, where we now get a build error: /tmp/cc4Qy7GR.s:34: Error: Banked registers are not available with this architecture. -- `mrs r3,SP_usr' /tmp/cc4Qy7GR.s:41: Error: Banked registers are not available with this architecture. -- `mrs r3,ELR_hyp' /tmp/cc4Qy7GR.s:55: Error: Banked registers are not available with this architecture. -- `mrs r3,SP_svc' /tmp/cc4Qy7GR.s:62: Error: Banked registers are not available with this architecture. -- `mrs r3,LR_svc' /tmp/cc4Qy7GR.s:69: Error: Banked registers are not available with this architecture. -- `mrs r3,SPSR_svc' /tmp/cc4Qy7GR.s:76: Error: Banked registers are not available with this architecture. -- `mrs r3,SP_abt' Passign the '-march-armv7ve' flag to gcc works, and is ok here, because we know the functions won't ever be called on pre-ARMv7VE machines. Unfortunately, older compiler versions (4.8 and earlier) do not understand that flag, so we still need to keep the asm around. Backporting to stable kernels (4.6+) is needed to allow those to be built with future compilers as well. Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84129 Fixes: 33280b4cd1dc ("ARM: KVM: Add banked registers save/restore") Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Christoffer Dall --- arch/arm/kvm/hyp/Makefile | 5 +++++ arch/arm/kvm/hyp/banked-sr.c | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/arch/arm/kvm/hyp/Makefile b/arch/arm/kvm/hyp/Makefile index 5638ce0c9524..63d6b404d88e 100644 --- a/arch/arm/kvm/hyp/Makefile +++ b/arch/arm/kvm/hyp/Makefile @@ -7,6 +7,8 @@ ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING KVM=../../../../virt/kvm +CFLAGS_ARMV7VE :=$(call cc-option, -march=armv7ve) + obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v3-sr.o obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o @@ -15,7 +17,10 @@ obj-$(CONFIG_KVM_ARM_HOST) += tlb.o obj-$(CONFIG_KVM_ARM_HOST) += cp15-sr.o obj-$(CONFIG_KVM_ARM_HOST) += vfp.o obj-$(CONFIG_KVM_ARM_HOST) += banked-sr.o +CFLAGS_banked-sr.o += $(CFLAGS_ARMV7VE) + obj-$(CONFIG_KVM_ARM_HOST) += entry.o obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o obj-$(CONFIG_KVM_ARM_HOST) += switch.o +CFLAGS_switch.o += $(CFLAGS_ARMV7VE) obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o diff --git a/arch/arm/kvm/hyp/banked-sr.c b/arch/arm/kvm/hyp/banked-sr.c index 111bda8cdebd..be4b8b0a40ad 100644 --- a/arch/arm/kvm/hyp/banked-sr.c +++ b/arch/arm/kvm/hyp/banked-sr.c @@ -20,6 +20,10 @@ #include +/* + * gcc before 4.9 doesn't understand -march=armv7ve, so we have to + * trick the assembler. + */ __asm__(".arch_extension virt"); void __hyp_text __banked_save_state(struct kvm_cpu_context *ctxt) -- cgit v1.2.3 From 5fe01793dd953ab947fababe8abaf5ed5258c8df Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 7 Feb 2018 12:46:42 +0100 Subject: KVM: s390: take care of clock-comparator sign control Missed when enabling the Multiple-epoch facility. If the facility is installed and the control is set, a sign based comaprison has to be performed. Right now we would inject wrong interrupts and ignore interrupt conditions. Also the sleep time is calculated in a wrong way. Signed-off-by: David Hildenbrand Message-Id: <20180207114647.6220-2-david@redhat.com> Fixes: 8fa1696ea781 ("KVM: s390: Multiple Epoch Facility support") Cc: stable@vger.kernel.org Reviewed-by: Christian Borntraeger Signed-off-by: Christian Borntraeger --- arch/s390/kvm/interrupt.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 3f2c49b1a393..b04616b57a94 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -169,8 +169,15 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) static int ckc_irq_pending(struct kvm_vcpu *vcpu) { - if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm)) + const u64 now = kvm_s390_get_tod_clock_fast(vcpu->kvm); + const u64 ckc = vcpu->arch.sie_block->ckc; + + if (vcpu->arch.sie_block->gcr[0] & 0x0020000000000000ul) { + if ((s64)ckc >= (s64)now) + return 0; + } else if (ckc >= now) { return 0; + } return ckc_interrupts_enabled(vcpu); } @@ -1047,13 +1054,19 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) static u64 __calculate_sltime(struct kvm_vcpu *vcpu) { - u64 now, cputm, sltime = 0; + const u64 now = kvm_s390_get_tod_clock_fast(vcpu->kvm); + const u64 ckc = vcpu->arch.sie_block->ckc; + u64 cputm, sltime = 0; if (ckc_interrupts_enabled(vcpu)) { - now = kvm_s390_get_tod_clock_fast(vcpu->kvm); - sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); - /* already expired or overflow? */ - if (!sltime || vcpu->arch.sie_block->ckc <= now) + if (vcpu->arch.sie_block->gcr[0] & 0x0020000000000000ul) { + if ((s64)now < (s64)ckc) + sltime = tod_to_ns((s64)ckc - (s64)now); + } else if (now < ckc) { + sltime = tod_to_ns(ckc - now); + } + /* already expired */ + if (!sltime) return 0; if (cpu_timer_interrupts_enabled(vcpu)) { cputm = kvm_s390_get_cpu_timer(vcpu); -- cgit v1.2.3 From d16b52cb9cdb6f06dea8ab2f0a428e7d7f0b0a81 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 7 Feb 2018 12:46:44 +0100 Subject: KVM: s390: consider epoch index on hotplugged CPUs We must copy both, the epoch and the epoch_idx. Signed-off-by: David Hildenbrand Message-Id: <20180207114647.6220-4-david@redhat.com> Fixes: 8fa1696ea781 ("KVM: s390: Multiple Epoch Facility support") Reviewed-by: Cornelia Huck Reviewed-by: Christian Borntraeger Fixes: 8fa1696ea781 ("KVM: s390: Multiple Epoch Facility support") Cc: stable@vger.kernel.org Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index ba4c7092335a..5b7fe80cda56 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2389,6 +2389,7 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) mutex_lock(&vcpu->kvm->lock); preempt_disable(); vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch; + vcpu->arch.sie_block->epdx = vcpu->kvm->arch.epdx; preempt_enable(); mutex_unlock(&vcpu->kvm->lock); if (!kvm_is_ucontrol(vcpu->kvm)) { -- cgit v1.2.3 From 1575767ef3cf5326701d2ae3075b7732cbc855e4 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 7 Feb 2018 12:46:45 +0100 Subject: KVM: s390: consider epoch index on TOD clock syncs For now, we don't take care of over/underflows. Especially underflows are critical: Assume the epoch is currently 0 and we get a sync request for delta=1, meaning the TOD is moved forward by 1 and we have to fix it up by subtracting 1 from the epoch. Right now, this will leave the epoch index untouched, resulting in epoch=-1, epoch_idx=0, which is wrong. We have to take care of over and underflows, also for the VSIE case. So let's factor out calculation into a separate function. Signed-off-by: David Hildenbrand Message-Id: <20180207114647.6220-5-david@redhat.com> Reviewed-by: Christian Borntraeger Fixes: 8fa1696ea781 ("KVM: s390: Multiple Epoch Facility support") Cc: stable@vger.kernel.org Signed-off-by: Christian Borntraeger [use u8 for idx] --- arch/s390/kvm/kvm-s390.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 5b7fe80cda56..b07aa16dcf06 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -179,6 +179,28 @@ int kvm_arch_hardware_enable(void) static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start, unsigned long end); +static void kvm_clock_sync_scb(struct kvm_s390_sie_block *scb, u64 delta) +{ + u8 delta_idx = 0; + + /* + * The TOD jumps by delta, we have to compensate this by adding + * -delta to the epoch. + */ + delta = -delta; + + /* sign-extension - we're adding to signed values below */ + if ((s64)delta < 0) + delta_idx = -1; + + scb->epoch += delta; + if (scb->ecd & ECD_MEF) { + scb->epdx += delta_idx; + if (scb->epoch < delta) + scb->epdx += 1; + } +} + /* * This callback is executed during stop_machine(). All CPUs are therefore * temporarily stopped. In order not to change guest behavior, we have to @@ -194,13 +216,17 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, unsigned long long *delta = v; list_for_each_entry(kvm, &vm_list, vm_list) { - kvm->arch.epoch -= *delta; kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.sie_block->epoch -= *delta; + kvm_clock_sync_scb(vcpu->arch.sie_block, *delta); + if (i == 0) { + kvm->arch.epoch = vcpu->arch.sie_block->epoch; + kvm->arch.epdx = vcpu->arch.sie_block->epdx; + } if (vcpu->arch.cputm_enabled) vcpu->arch.cputm_start += *delta; if (vcpu->arch.vsie_block) - vcpu->arch.vsie_block->epoch -= *delta; + kvm_clock_sync_scb(vcpu->arch.vsie_block, + *delta); } } return NOTIFY_OK; -- cgit v1.2.3 From 0e7def5fb0dc53ddbb9f62a497d15f1e11ccdc36 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 7 Feb 2018 12:46:43 +0100 Subject: KVM: s390: provide only a single function for setting the tod (fix SCK) Right now, SET CLOCK called in the guest does not properly take care of the epoch index, as the call goes via the old kvm_s390_set_tod_clock() interface. So the epoch index is neither reset to 0, if required, nor properly set to e.g. 0xff on negative values. Fix this by providing a single kvm_s390_set_tod_clock() function. Move Multiple-epoch facility handling into it. Signed-off-by: David Hildenbrand Message-Id: <20180207114647.6220-3-david@redhat.com> Reviewed-by: Christian Borntraeger Fixes: 8fa1696ea781 ("KVM: s390: Multiple Epoch Facility support") Cc: stable@vger.kernel.org Signed-off-by: Christian Borntraeger --- arch/s390/kvm/kvm-s390.c | 46 +++++++++++++++------------------------------- arch/s390/kvm/kvm-s390.h | 5 ++--- arch/s390/kvm/priv.c | 9 +++++---- 3 files changed, 22 insertions(+), 38 deletions(-) diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b07aa16dcf06..77d7818130db 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -928,12 +928,9 @@ static int kvm_s390_set_tod_ext(struct kvm *kvm, struct kvm_device_attr *attr) if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) return -EFAULT; - if (test_kvm_facility(kvm, 139)) - kvm_s390_set_tod_clock_ext(kvm, >od); - else if (gtod.epoch_idx == 0) - kvm_s390_set_tod_clock(kvm, gtod.tod); - else + if (!test_kvm_facility(kvm, 139) && gtod.epoch_idx) return -EINVAL; + kvm_s390_set_tod_clock(kvm, >od); VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x, TOD base: 0x%llx", gtod.epoch_idx, gtod.tod); @@ -958,13 +955,14 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) { - u64 gtod; + struct kvm_s390_vm_tod_clock gtod = { 0 }; - if (copy_from_user(>od, (void __user *)attr->addr, sizeof(gtod))) + if (copy_from_user(>od.tod, (void __user *)attr->addr, + sizeof(gtod.tod))) return -EFAULT; - kvm_s390_set_tod_clock(kvm, gtod); - VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod); + kvm_s390_set_tod_clock(kvm, >od); + VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod.tod); return 0; } @@ -3048,8 +3046,8 @@ retry: return 0; } -void kvm_s390_set_tod_clock_ext(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod) +void kvm_s390_set_tod_clock(struct kvm *kvm, + const struct kvm_s390_vm_tod_clock *gtod) { struct kvm_vcpu *vcpu; struct kvm_s390_tod_clock_ext htod; @@ -3061,10 +3059,12 @@ void kvm_s390_set_tod_clock_ext(struct kvm *kvm, get_tod_clock_ext((char *)&htod); kvm->arch.epoch = gtod->tod - htod.tod; - kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx; - - if (kvm->arch.epoch > gtod->tod) - kvm->arch.epdx -= 1; + kvm->arch.epdx = 0; + if (test_kvm_facility(kvm, 139)) { + kvm->arch.epdx = gtod->epoch_idx - htod.epoch_idx; + if (kvm->arch.epoch > gtod->tod) + kvm->arch.epdx -= 1; + } kvm_s390_vcpu_block_all(kvm); kvm_for_each_vcpu(i, vcpu, kvm) { @@ -3077,22 +3077,6 @@ void kvm_s390_set_tod_clock_ext(struct kvm *kvm, mutex_unlock(&kvm->lock); } -void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod) -{ - struct kvm_vcpu *vcpu; - int i; - - mutex_lock(&kvm->lock); - preempt_disable(); - kvm->arch.epoch = tod - get_tod_clock(); - kvm_s390_vcpu_block_all(kvm); - kvm_for_each_vcpu(i, vcpu, kvm) - vcpu->arch.sie_block->epoch = kvm->arch.epoch; - kvm_s390_vcpu_unblock_all(kvm); - preempt_enable(); - mutex_unlock(&kvm->lock); -} - /** * kvm_arch_fault_in_page - fault-in guest page if necessary * @vcpu: The corresponding virtual cpu diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 3c0a975c2477..f55ac0ef99ea 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -281,9 +281,8 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu); int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu); /* implemented in kvm-s390.c */ -void kvm_s390_set_tod_clock_ext(struct kvm *kvm, - const struct kvm_s390_vm_tod_clock *gtod); -void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod); +void kvm_s390_set_tod_clock(struct kvm *kvm, + const struct kvm_s390_vm_tod_clock *gtod); long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable); int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr); int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index a74578cdd3f3..f0b4185158af 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -85,9 +85,10 @@ int kvm_s390_handle_e3(struct kvm_vcpu *vcpu) /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { + struct kvm_s390_vm_tod_clock gtod = { 0 }; int rc; u8 ar; - u64 op2, val; + u64 op2; vcpu->stat.instruction_sck++; @@ -97,12 +98,12 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) op2 = kvm_s390_get_base_disp_s(vcpu, &ar); if (op2 & 7) /* Operand must be on a doubleword boundary */ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - rc = read_guest(vcpu, op2, ar, &val, sizeof(val)); + rc = read_guest(vcpu, op2, ar, >od.tod, sizeof(gtod.tod)); if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); - kvm_s390_set_tod_clock(vcpu->kvm, val); + VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", gtod.tod); + kvm_s390_set_tod_clock(vcpu->kvm, >od); kvm_s390_set_psw_cc(vcpu, 0); return 0; -- cgit v1.2.3 From 0b2e9904c15963e715d33e5f3f1387f17d19333a Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 23 Feb 2018 23:29:32 +0100 Subject: KVM: x86: move LAPIC initialization after VMCS creation The initial reset of the local APIC is performed before the VMCS has been created, but it tries to do a vmwrite: vmwrite error: reg 810 value 4a00 (err 18944) CPU: 54 PID: 38652 Comm: qemu-kvm Tainted: G W I 4.16.0-0.rc2.git0.1.fc28.x86_64 #1 Hardware name: Intel Corporation S2600CW/S2600CW, BIOS SE5C610.86B.01.01.0003.090520141303 09/05/2014 Call Trace: vmx_set_rvi [kvm_intel] vmx_hwapic_irr_update [kvm_intel] kvm_lapic_reset [kvm] kvm_create_lapic [kvm] kvm_arch_vcpu_init [kvm] kvm_vcpu_init [kvm] vmx_create_vcpu [kvm_intel] kvm_vm_ioctl [kvm] Move it later, after the VMCS has been created. Fixes: 4191db26b714 ("KVM: x86: Update APICv on APIC reset") Cc: stable@vger.kernel.org Cc: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 1 - arch/x86/kvm/x86.c | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 924ac8ce9d50..cc5fe7a50dde 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2165,7 +2165,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) */ vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */ - kvm_lapic_reset(vcpu, false); kvm_iodevice_init(&apic->dev, &apic_mmio_ops); return 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8a0b545ac20..ca90d9515137 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7975,6 +7975,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) kvm_vcpu_mtrr_init(vcpu); vcpu_load(vcpu); kvm_vcpu_reset(vcpu, false); + kvm_lapic_reset(vcpu, false); kvm_mmu_setup(vcpu); vcpu_put(vcpu); return 0; -- cgit v1.2.3 From 99158246208b82c0700d09a40d719bb56b32c607 Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Wed, 31 Jan 2018 18:12:50 +0100 Subject: KVM: nVMX: preserve SECONDARY_EXEC_DESC without UMIP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L1 might want to use SECONDARY_EXEC_DESC, so we must not clear the VMCS bit if UMIP is not being emulated. We must still set the bit when emulating UMIP as the feature can be passed to L2 where L0 will do the emulation and because L2 can change CR4 without a VM exit, we should clear the bit if UMIP is disabled. Fixes: 0367f205a3b7 ("KVM: vmx: add support for emulating UMIP") Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f427723dc7db..2d2cf8c1f0f4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4485,7 +4485,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_DESC); hw_cr4 &= ~X86_CR4_UMIP; - } else + } else if (!is_guest_mode(vcpu) || + !nested_cpu_has2(get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_DESC); -- cgit v1.2.3 From 103c763c72dd2df3e8c91f2d7ec88f98ed391111 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 31 Jan 2018 17:30:21 -0800 Subject: KVM/x86: remove WARN_ON() for when vm_munmap() fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On x86, special KVM memslots such as the TSS region have anonymous memory mappings created on behalf of userspace, and these mappings are removed when the VM is destroyed. It is however possible for removing these mappings via vm_munmap() to fail. This can most easily happen if the thread receives SIGKILL while it's waiting to acquire ->mmap_sem. This triggers the 'WARN_ON(r < 0)' in __x86_set_memory_region(). syzkaller was able to hit this, using 'exit()' to send the SIGKILL. Note that while the vm_munmap() failure results in the mapping not being removed immediately, it is not leaked forever but rather will be freed when the process exits. It's not really possible to handle this failure properly, so almost every other caller of vm_munmap() doesn't check the return value. It's a limitation of having the kernel manage these mappings rather than userspace. So just remove the WARN_ON() so that users can't spam the kernel log with this warning. Fixes: f0d648bdf0a5 ("KVM: x86: map/unmap private slots in __x86_set_memory_region") Reported-by: syzbot Signed-off-by: Eric Biggers Signed-off-by: Radim Krčmář --- arch/x86/kvm/x86.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ca90d9515137..96edda878dbf 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8461,10 +8461,8 @@ int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size) return r; } - if (!size) { - r = vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); - WARN_ON(r < 0); - } + if (!size) + vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE); return 0; } -- cgit v1.2.3 From b28676bb8ae4569cced423dc2a88f7cb319d5379 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 13 Feb 2018 15:36:00 +0100 Subject: KVM: mmu: Fix overlap between public and private memslots MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: pte_list_remove: ffff9714eb1f8078 0->BUG ------------[ cut here ]------------ kernel BUG at arch/x86/kvm/mmu.c:1157! invalid opcode: 0000 [#1] SMP RIP: 0010:pte_list_remove+0x11b/0x120 [kvm] Call Trace: drop_spte+0x83/0xb0 [kvm] mmu_page_zap_pte+0xcc/0xe0 [kvm] kvm_mmu_prepare_zap_page+0x81/0x4a0 [kvm] kvm_mmu_invalidate_zap_all_pages+0x159/0x220 [kvm] kvm_arch_flush_shadow_all+0xe/0x10 [kvm] kvm_mmu_notifier_release+0x6c/0xa0 [kvm] ? kvm_mmu_notifier_release+0x5/0xa0 [kvm] __mmu_notifier_release+0x79/0x110 ? __mmu_notifier_release+0x5/0x110 exit_mmap+0x15a/0x170 ? do_exit+0x281/0xcb0 mmput+0x66/0x160 do_exit+0x2c9/0xcb0 ? __context_tracking_exit.part.5+0x4a/0x150 do_group_exit+0x50/0xd0 SyS_exit_group+0x14/0x20 do_syscall_64+0x73/0x1f0 entry_SYSCALL64_slow_path+0x25/0x25 The reason is that when creates new memslot, there is no guarantee for new memslot not overlap with private memslots. This can be triggered by the following program: #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include long r[16]; int main() { void *p = valloc(0x4000); r[2] = open("/dev/kvm", 0); r[3] = ioctl(r[2], KVM_CREATE_VM, 0x0ul); uint64_t addr = 0xf000; ioctl(r[3], KVM_SET_IDENTITY_MAP_ADDR, &addr); r[6] = ioctl(r[3], KVM_CREATE_VCPU, 0x0ul); ioctl(r[3], KVM_SET_TSS_ADDR, 0x0ul); ioctl(r[6], KVM_RUN, 0); ioctl(r[6], KVM_RUN, 0); struct kvm_userspace_memory_region mr = { .slot = 0, .flags = KVM_MEM_LOG_DIRTY_PAGES, .guest_phys_addr = 0xf000, .memory_size = 0x4000, .userspace_addr = (uintptr_t) p }; ioctl(r[3], KVM_SET_USER_MEMORY_REGION, &mr); return 0; } This patch fixes the bug by not adding a new memslot even if it overlaps with private memslots. Reported-by: Dmitry Vyukov Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Dmitry Vyukov Cc: Eric Biggers Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) --- virt/kvm/kvm_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4501e658e8d6..65dea3ffef68 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -969,8 +969,7 @@ int __kvm_set_memory_region(struct kvm *kvm, /* Check for overlaps */ r = -EEXIST; kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) { - if ((slot->id >= KVM_USER_MEM_SLOTS) || - (slot->id == id)) + if (slot->id == id) continue; if (!((base_gfn + npages <= slot->base_gfn) || (base_gfn >= slot->base_gfn + slot->npages))) -- cgit v1.2.3 From 135a06c3a515bbd17729eb04f4f26316d48363d7 Mon Sep 17 00:00:00 2001 From: Chao Gao Date: Sun, 11 Feb 2018 10:06:30 +0800 Subject: KVM: nVMX: Don't halt vcpu when L1 is injecting events to L2 Although L2 is in halt state, it will be in the active state after VM entry if the VM entry is vectoring according to SDM 26.6.2 Activity State. Halting the vcpu here means the event won't be injected to L2 and this decision isn't reported to L1. Thus L0 drops an event that should be injected to L2. Cc: Liran Alon Reviewed-by: Liran Alon Signed-off-by: Chao Gao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2d2cf8c1f0f4..67b028d8e726 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11197,7 +11197,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (ret) return ret; - if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) + /* + * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken + * by event injection, halt vcpu. + */ + if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) && + !(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK)) return kvm_vcpu_halt(vcpu); vmx->nested.nested_run_pending = 1; -- cgit v1.2.3 From 95e057e25892eaa48cad1e2d637b80d0f1a4fac5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 8 Feb 2018 15:32:45 +0800 Subject: KVM: X86: Fix SMRAM accessing even if VM is shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reported by syzkaller: WARNING: CPU: 6 PID: 2434 at arch/x86/kvm/vmx.c:6660 handle_ept_misconfig+0x54/0x1e0 [kvm_intel] CPU: 6 PID: 2434 Comm: repro_test Not tainted 4.15.0+ #4 RIP: 0010:handle_ept_misconfig+0x54/0x1e0 [kvm_intel] Call Trace: vmx_handle_exit+0xbd/0xe20 [kvm_intel] kvm_arch_vcpu_ioctl_run+0xdaf/0x1d50 [kvm] kvm_vcpu_ioctl+0x3e9/0x720 [kvm] do_vfs_ioctl+0xa4/0x6a0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x25/0x9c The testcase creates a first thread to issue KVM_SMI ioctl, and then creates a second thread to mmap and operate on the same vCPU. This triggers a race condition when running the testcase with multiple threads. Sometimes one thread exits with a triple fault while another thread mmaps and operates on the same vCPU. Because CS=0x3000/IP=0x8000 is not mapped, accessing the SMI handler results in an EPT misconfig. This patch fixes it by returning RET_PF_EMULATE in kvm_handle_bad_page(), which will go on to cause an emulation failure and an exit with KVM_EXIT_INTERNAL_ERROR. Reported-by: syzbot+c1d9517cab094dae65e446c0c5b4de6c40f4dc58@syzkaller.appspotmail.com Cc: Paolo Bonzini Cc: Radim Krčmář Cc: stable@vger.kernel.org Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8eca1d04aeb8..6c5a82c74750 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3029,7 +3029,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn) return RET_PF_RETRY; } - return -EFAULT; + return RET_PF_EMULATE; } static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From faa312a543283c717342cd332b5b9247bd305dce Mon Sep 17 00:00:00 2001 From: Marc Hartmayer Date: Tue, 9 Jan 2018 13:27:01 +0100 Subject: tools/kvm_stat: simplify the sortkey function The 'sortkey' function references a value in its enclosing scope (closure). This is not common practice for a sort key function so let's replace it. Additionally, the function 'sorted' has already a parameter for reversing the result therefore the inversion of the values is unneeded. The check for stats[x][1] is also superfluous as it's ensured that this value is initialized with 0. Signed-off-by: Marc Hartmayer Tested-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index a5684d0968b4..d630f5f3e091 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1080,30 +1080,23 @@ class Tui(object): self.screen.move(row, 0) self.screen.clrtobot() stats = self.stats.get(self._display_guests) - - def sortCurAvg(x): - # sort by current events if available - if stats[x][1]: - return (-stats[x][1], -stats[x][0]) - else: - return (0, -stats[x][0]) - - def sortTotal(x): - # sort by totals - return (0, -stats[x][0]) total = 0. for key in stats.keys(): if key.find('(') is -1: total += stats[key][0] if self._sorting == SORT_DEFAULT: - sortkey = sortCurAvg + def sortkey((_k, v)): + # sort by (delta value, overall value) + return (v[1], v[0]) else: - sortkey = sortTotal + def sortkey((_k, v)): + # sort by overall value + return v[0] + tavg = 0 - for key in sorted(stats.keys(), key=sortkey): + for key, values in sorted(stats.items(), key=sortkey, reverse=True): if row >= self.screen.getmaxyx()[0] - 1: break - values = stats[key] if not values[0] and not values[1]: break if values[0] is not None: -- cgit v1.2.3 From 006f1548ac13d67d21865416a0f4e8062df1a85f Mon Sep 17 00:00:00 2001 From: Marc Hartmayer Date: Tue, 9 Jan 2018 13:27:02 +0100 Subject: tools/kvm_stat: use a namedtuple for storing the values Use a namedtuple for storing the values as it allows to access the fields of a tuple via names. This makes the overall code much easier to read and to understand. Access by index is still possible as before. Signed-off-by: Marc Hartmayer Tested-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index d630f5f3e091..2b7e83a5f7b8 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -33,7 +33,7 @@ import resource import struct import re import subprocess -from collections import defaultdict +from collections import defaultdict, namedtuple VMX_EXIT_REASONS = { 'EXCEPTION_NMI': 0, @@ -800,6 +800,9 @@ class DebugfsProvider(Provider): self.read(2) +EventStat = namedtuple('EventStat', ['value', 'delta']) + + class Stats(object): """Manages the data providers and the data they provide. @@ -867,10 +870,10 @@ class Stats(object): for provider in self.providers: new = provider.read(by_guest=by_guest) for key in new if by_guest else provider.fields: - oldval = self.values.get(key, (0, 0))[0] + oldval = self.values.get(key, EventStat(0, 0)).value newval = new.get(key, 0) newdelta = newval - oldval - self.values[key] = (newval, newdelta) + self.values[key] = EventStat(newval, newdelta) return self.values def toggle_display_guests(self, to_pid): @@ -1083,28 +1086,28 @@ class Tui(object): total = 0. for key in stats.keys(): if key.find('(') is -1: - total += stats[key][0] + total += stats[key].value if self._sorting == SORT_DEFAULT: def sortkey((_k, v)): # sort by (delta value, overall value) - return (v[1], v[0]) + return (v.delta, v.value) else: def sortkey((_k, v)): # sort by overall value - return v[0] + return v.value tavg = 0 for key, values in sorted(stats.items(), key=sortkey, reverse=True): if row >= self.screen.getmaxyx()[0] - 1: break - if not values[0] and not values[1]: + if not values.value and not values.delta: break - if values[0] is not None: - cur = int(round(values[1] / sleeptime)) if values[1] else '' + if values.value is not None: + cur = int(round(values.delta / sleeptime)) if values.delta else '' if self._display_guests: key = self.get_gname_from_pid(key) self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % - (key, values[0], values[0] * 100 / total, + (key, values.value, values.value * 100 / total, cur)) if cur is not '' and key.find('(') is -1: tavg += cur @@ -1375,7 +1378,7 @@ def batch(stats): s = stats.get() for key in sorted(s.keys()): values = s[key] - print('%-42s%10d%10d' % (key, values[0], values[1])) + print('%-42s%10d%10d' % (key, values.value, values.delta)) except KeyboardInterrupt: pass @@ -1392,7 +1395,7 @@ def log(stats): def statline(): s = stats.get() for k in keys: - print(' %9d' % s[k][1], end=' ') + print(' %9d' % s[k].delta, end=' ') print() line = 0 banner_repeat = 20 -- cgit v1.2.3 From 0eb578009a1d530a11846d7c4733a5db04730884 Mon Sep 17 00:00:00 2001 From: Marc Hartmayer Date: Tue, 9 Jan 2018 13:27:03 +0100 Subject: tools/kvm_stat: use a more pythonic way to iterate over dictionaries If it's clear that the values of a dictionary will be used then use the '.items()' method. Signed-off-by: Marc Hartmayer Tested-by: Stefan Raspl [Include fix for logging mode by Stefan Raspl] Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 2b7e83a5f7b8..f0da954a856c 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1084,9 +1084,10 @@ class Tui(object): self.screen.clrtobot() stats = self.stats.get(self._display_guests) total = 0. - for key in stats.keys(): + for key, values in stats.items(): if key.find('(') is -1: - total += stats[key].value + total += values.value + if self._sorting == SORT_DEFAULT: def sortkey((_k, v)): # sort by (delta value, overall value) @@ -1376,8 +1377,7 @@ def batch(stats): s = stats.get() time.sleep(1) s = stats.get() - for key in sorted(s.keys()): - values = s[key] + for key, values in sorted(s.items()): print('%-42s%10d%10d' % (key, values.value, values.delta)) except KeyboardInterrupt: pass @@ -1388,14 +1388,14 @@ def log(stats): keys = sorted(stats.get().keys()) def banner(): - for k in keys: - print(k, end=' ') + for key in keys: + print(key, end=' ') print() def statline(): s = stats.get() - for k in keys: - print(' %9d' % s[k].delta, end=' ') + for key in keys: + print(' %9d' % s[key].delta, end=' ') print() line = 0 banner_repeat = 20 -- cgit v1.2.3 From 369d5a85bb782ecf63c5bae9686c7e6104eea991 Mon Sep 17 00:00:00 2001 From: Marc Hartmayer Date: Tue, 9 Jan 2018 13:27:04 +0100 Subject: tools/kvm_stat: avoid 'is' for equality checks Use '==' for equality checks and 'is' when comparing identities. An example where '==' and 'is' behave differently: >>> a = 4242 >>> a == 4242 True >>> a is 4242 False Signed-off-by: Marc Hartmayer Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index f0da954a856c..e3f0becb6632 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1085,7 +1085,7 @@ class Tui(object): stats = self.stats.get(self._display_guests) total = 0. for key, values in stats.items(): - if key.find('(') is -1: + if key.find('(') == -1: total += values.value if self._sorting == SORT_DEFAULT: @@ -1110,7 +1110,7 @@ class Tui(object): self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % (key, values.value, values.value * 100 / total, cur)) - if cur is not '' and key.find('(') is -1: + if cur != '' and key.find('(') == -1: tavg += cur row += 1 if row == 3: -- cgit v1.2.3 From 3df33a0f34a3883b6696bff8cc8fcda3c7444a62 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 5 Feb 2018 13:59:57 +0100 Subject: tools/kvm_stat: fix crash when filtering out all non-child trace events When we apply a filter that will only leave child trace events, we receive a ZeroDivisionError when calculating the percentages. In that case, provide percentages based on child events only. To reproduce, run 'kvm_stat -f .*[\(].*'. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index e3f0becb6632..4e0f282c5289 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1084,9 +1084,15 @@ class Tui(object): self.screen.clrtobot() stats = self.stats.get(self._display_guests) total = 0. + ctotal = 0. for key, values in stats.items(): if key.find('(') == -1: total += values.value + else: + ctotal += values.value + if total == 0.: + # we don't have any fields, or all non-child events are filtered + total = ctotal if self._sorting == SORT_DEFAULT: def sortkey((_k, v)): -- cgit v1.2.3 From 1cd8bfb1ed9962be6d80d5020508922aa93653ac Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Mon, 5 Feb 2018 13:59:58 +0100 Subject: tools/kvm_stat: print error on invalid regex Entering an invalid regular expression did not produce any indication of an error so far. To reproduce, press 'f' and enter 'foo(' (with an unescaped bracket). Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 4e0f282c5289..08f842238c32 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1176,6 +1176,7 @@ class Tui(object): Asks for a valid regex and sets the fields filter accordingly. """ + msg = '' while True: self.screen.erase() self.screen.addstr(0, 0, @@ -1184,6 +1185,7 @@ class Tui(object): self.screen.addstr(2, 0, "Current regex: {0}" .format(self.stats.fields_filter)) + self.screen.addstr(5, 0, msg) self.screen.addstr(3, 0, "New regex: ") curses.echo() regex = self.screen.getstr().decode(ENCODING) @@ -1198,6 +1200,7 @@ class Tui(object): self.refresh_header() return except re.error: + msg = '"' + regex + '": Not a valid regular expression' continue def show_vm_selection_by_pid(self): -- cgit v1.2.3 From 1fd6a708c8438403dee17eb411cf81ffba13cf43 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 22 Feb 2018 12:16:24 +0100 Subject: tools/kvm_stat: fix debugfs handling Te checks for debugfs assumed that debugfs is always mounted at /sys/kernel/debug - which is likely, but not guaranteed. This is addressed by checking /proc/mounts for the actual location. Furthermore, when debugfs was mounted, but the kvm module not loaded, a misleading error pointing towards debugfs not present was given. To reproduce, (a) run kvm_stat with debugfs mounted at a place different from /sys/kernel/debug (b) run kvm_stat with debugfs mounted but kvm module not loaded Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 08f842238c32..c5c8e9295b91 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -331,9 +331,6 @@ class perf_event_attr(ctypes.Structure): PERF_TYPE_TRACEPOINT = 2 PERF_FORMAT_GROUP = 1 << 3 -PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' -PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' - class Group(object): """Represents a perf event group.""" @@ -1544,17 +1541,6 @@ Press any other key to refresh statistics immediately. def check_access(options): """Exits if the current user can't access all needed directories.""" - if not os.path.exists('/sys/kernel/debug'): - sys.stderr.write('Please enable CONFIG_DEBUG_FS in your kernel.') - sys.exit(1) - - if not os.path.exists(PATH_DEBUGFS_KVM): - sys.stderr.write("Please make sure, that debugfs is mounted and " - "readable by the current user:\n" - "('mount -t debugfs debugfs /sys/kernel/debug')\n" - "Also ensure, that the kvm modules are loaded.\n") - sys.exit(1) - if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints or not options.debugfs): sys.stderr.write("Please enable CONFIG_TRACING in your kernel " @@ -1572,7 +1558,33 @@ def check_access(options): return options +def assign_globals(): + global PATH_DEBUGFS_KVM + global PATH_DEBUGFS_TRACING + + debugfs = '' + for line in file('/proc/mounts'): + if line.split(' ')[0] == 'debugfs': + debugfs = line.split(' ')[1] + break + if debugfs == '': + sys.stderr.write("Please make sure that CONFIG_DEBUG_FS is enabled in " + "your kernel, mounted and\nreadable by the current " + "user:\n" + "('mount -t debugfs debugfs /sys/kernel/debug')\n") + sys.exit(1) + + PATH_DEBUGFS_KVM = os.path.join(debugfs, 'kvm') + PATH_DEBUGFS_TRACING = os.path.join(debugfs, 'tracing') + + if not os.path.exists(PATH_DEBUGFS_KVM): + sys.stderr.write("Please make sure that CONFIG_KVM is enabled in " + "your kernel and that the modules are loaded.\n") + sys.exit(1) + + def main(): + assign_globals() options = get_options() options = check_access(options) -- cgit v1.2.3 From c0e8c21eae616ed8703c1b4b01046a1578ee875c Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 22 Feb 2018 12:16:26 +0100 Subject: tools/kvm_stat: mark private methods as such Helps quite a bit reading the code when it's obvious when a method is intended for internal use only. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 132 ++++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index c5c8e9295b91..c09b7428f563 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -373,8 +373,8 @@ class Event(object): self.syscall = self.libc.syscall self.name = name self.fd = None - self.setup_event(group, trace_cpu, trace_pid, trace_point, - trace_filter, trace_set) + self._setup_event(group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set) def __del__(self): """Closes the event's file descriptor. @@ -387,7 +387,7 @@ class Event(object): if self.fd: os.close(self.fd) - def perf_event_open(self, attr, pid, cpu, group_fd, flags): + def _perf_event_open(self, attr, pid, cpu, group_fd, flags): """Wrapper for the sys_perf_evt_open() syscall. Used to set up performance events, returns a file descriptor or -1 @@ -406,7 +406,7 @@ class Event(object): ctypes.c_int(pid), ctypes.c_int(cpu), ctypes.c_int(group_fd), ctypes.c_long(flags)) - def setup_event_attribute(self, trace_set, trace_point): + def _setup_event_attribute(self, trace_set, trace_point): """Returns an initialized ctype perf_event_attr struct.""" id_path = os.path.join(PATH_DEBUGFS_TRACING, 'events', trace_set, @@ -416,8 +416,8 @@ class Event(object): event_attr.config = int(open(id_path).read()) return event_attr - def setup_event(self, group, trace_cpu, trace_pid, trace_point, - trace_filter, trace_set): + def _setup_event(self, group, trace_cpu, trace_pid, trace_point, + trace_filter, trace_set): """Sets up the perf event in Linux. Issues the syscall to register the event in the kernel and @@ -425,7 +425,7 @@ class Event(object): """ - event_attr = self.setup_event_attribute(trace_set, trace_point) + event_attr = self._setup_event_attribute(trace_set, trace_point) # First event will be group leader. group_leader = -1 @@ -434,8 +434,8 @@ class Event(object): if group.events: group_leader = group.events[0].fd - fd = self.perf_event_open(event_attr, trace_pid, - trace_cpu, group_leader, 0) + fd = self._perf_event_open(event_attr, trace_pid, + trace_cpu, group_leader, 0) if fd == -1: err = ctypes.get_errno() raise OSError(err, os.strerror(err), @@ -497,12 +497,12 @@ class TracepointProvider(Provider): """ def __init__(self, pid, fields_filter): self.group_leaders = [] - self.filters = self.get_filters() + self.filters = self._get_filters() self.update_fields(fields_filter) self.pid = pid @staticmethod - def get_filters(): + def _get_filters(): """Returns a dict of trace events, their filter ids and the values that can be filtered. @@ -518,7 +518,7 @@ class TracepointProvider(Provider): filters['kvm_exit'] = ('exit_reason', ARCH.exit_reasons) return filters - def get_available_fields(self): + def _get_available_fields(self): """Returns a list of available event's of format 'event name(filter name)'. @@ -546,11 +546,11 @@ class TracepointProvider(Provider): def update_fields(self, fields_filter): """Refresh fields, applying fields_filter""" - self.fields = [field for field in self.get_available_fields() + self.fields = [field for field in self._get_available_fields() if self.is_field_wanted(fields_filter, field)] @staticmethod - def get_online_cpus(): + def _get_online_cpus(): """Returns a list of cpu id integers.""" def parse_int_list(list_string): """Returns an int list from a string of comma separated integers and @@ -572,17 +572,17 @@ class TracepointProvider(Provider): cpu_string = cpu_list.readline() return parse_int_list(cpu_string) - def setup_traces(self): + def _setup_traces(self): """Creates all event and group objects needed to be able to retrieve data.""" - fields = self.get_available_fields() + fields = self._get_available_fields() if self._pid > 0: # Fetch list of all threads of the monitored pid, as qemu # starts a thread for each vcpu. path = os.path.join('/proc', str(self._pid), 'task') groupids = self.walkdir(path)[1] else: - groupids = self.get_online_cpus() + groupids = self._get_online_cpus() # The constant is needed as a buffer for python libs, std # streams and other files that the script opens. @@ -660,7 +660,7 @@ class TracepointProvider(Provider): # The garbage collector will get rid of all Event/Group # objects and open files after removing the references. self.group_leaders = [] - self.setup_traces() + self._setup_traces() self.fields = self._fields def read(self, by_guest=0): @@ -689,9 +689,9 @@ class DebugfsProvider(Provider): self.paths = [] self.pid = pid if include_past: - self.restore() + self._restore() - def get_available_fields(self): + def _get_available_fields(self): """"Returns a list of available fields. The fields are all available KVM debugfs files @@ -701,7 +701,7 @@ class DebugfsProvider(Provider): def update_fields(self, fields_filter): """Refresh fields, applying fields_filter""" - self._fields = [field for field in self.get_available_fields() + self._fields = [field for field in self._get_available_fields() if self.is_field_wanted(fields_filter, field)] @property @@ -755,7 +755,7 @@ class DebugfsProvider(Provider): paths.append(dir) for path in paths: for field in self._fields: - value = self.read_field(field, path) + value = self._read_field(field, path) key = path + field if reset == 1: self._baseline[key] = value @@ -776,7 +776,7 @@ class DebugfsProvider(Provider): return results - def read_field(self, field, path): + def _read_field(self, field, path): """Returns the value of a single field from a specific VM.""" try: return int(open(os.path.join(PATH_DEBUGFS_KVM, @@ -791,7 +791,7 @@ class DebugfsProvider(Provider): self._baseline = {} self.read(1) - def restore(self): + def _restore(self): """Reset field counters""" self._baseline = {} self.read(2) @@ -808,13 +808,12 @@ class Stats(object): """ def __init__(self, options): - self.providers = self.get_providers(options) + self.providers = self._get_providers(options) self._pid_filter = options.pid self._fields_filter = options.fields self.values = {} - @staticmethod - def get_providers(options): + def _get_providers(self, options): """Returns a list of data providers depending on the passed options.""" providers = [] @@ -826,7 +825,7 @@ class Stats(object): return providers - def update_provider_filters(self): + def _update_provider_filters(self): """Propagates fields filters to providers.""" # As we reset the counters when updating the fields we can # also clear the cache of old values. @@ -847,7 +846,7 @@ class Stats(object): def fields_filter(self, fields_filter): if fields_filter != self._fields_filter: self._fields_filter = fields_filter - self.update_provider_filters() + self._update_provider_filters() @property def pid_filter(self): @@ -969,7 +968,7 @@ class Tui(object): return res - def print_all_gnames(self, row): + def _print_all_gnames(self, row): """Print a list of all running guests along with their pids.""" self.screen.addstr(row, 2, '%8s %-60s' % ('Pid', 'Guest Name (fuzzy list, might be ' @@ -1032,7 +1031,7 @@ class Tui(object): return name - def update_drilldown(self): + def _update_drilldown(self): """Sets or removes a filter that only allows fields without braces.""" if not self.stats.fields_filter: self.stats.fields_filter = DEFAULT_REGEX @@ -1040,11 +1039,11 @@ class Tui(object): elif self.stats.fields_filter == DEFAULT_REGEX: self.stats.fields_filter = None - def update_pid(self, pid): + def _update_pid(self, pid): """Propagates pid selection to stats object.""" self.stats.pid_filter = pid - def refresh_header(self, pid=None): + def _refresh_header(self, pid=None): """Refreshes the header.""" if pid is None: pid = self.stats.pid_filter @@ -1075,7 +1074,7 @@ class Tui(object): self.screen.addstr(4, 1, 'Collecting data...') self.screen.refresh() - def refresh_body(self, sleeptime): + def _refresh_body(self, sleeptime): row = 3 self.screen.move(row, 0) self.screen.clrtobot() @@ -1124,7 +1123,7 @@ class Tui(object): curses.A_BOLD) self.screen.refresh() - def show_msg(self, text): + def _show_msg(self, text): """Display message centered text and exit on key press""" hint = 'Press any key to continue' curses.cbreak() @@ -1139,7 +1138,7 @@ class Tui(object): curses.A_STANDOUT) self.screen.getkey() - def show_help_interactive(self): + def _show_help_interactive(self): """Display help with list of interactive commands""" msg = (' b toggle events by guests (debugfs only, honors' ' filters)', @@ -1165,9 +1164,9 @@ class Tui(object): self.screen.addstr(row, 0, line) row += 1 self.screen.getkey() - self.refresh_header() + self._refresh_header() - def show_filter_selection(self): + def _show_filter_selection(self): """Draws filter selection mask. Asks for a valid regex and sets the fields filter accordingly. @@ -1189,18 +1188,18 @@ class Tui(object): curses.noecho() if len(regex) == 0: self.stats.fields_filter = DEFAULT_REGEX - self.refresh_header() + self._refresh_header() return try: re.compile(regex) self.stats.fields_filter = regex - self.refresh_header() + self._refresh_header() return except re.error: msg = '"' + regex + '": Not a valid regular expression' continue - def show_vm_selection_by_pid(self): + def _show_vm_selection_by_pid(self): """Draws PID selection mask. Asks for a pid until a valid pid or 0 has been entered. @@ -1216,7 +1215,7 @@ class Tui(object): 'This might limit the shown data to the trace ' 'statistics.') self.screen.addstr(5, 0, msg) - self.print_all_gnames(7) + self._print_all_gnames(7) curses.echo() self.screen.addstr(3, 0, "Pid [0 or pid]: ") @@ -1232,13 +1231,13 @@ class Tui(object): continue else: pid = 0 - self.refresh_header(pid) - self.update_pid(pid) + self._refresh_header(pid) + self._update_pid(pid) break except ValueError: msg = '"' + str(pid) + '": Not a valid pid' - def show_set_update_interval(self): + def _show_set_update_interval(self): """Draws update interval selection mask.""" msg = '' while True: @@ -1268,9 +1267,9 @@ class Tui(object): except ValueError: msg = '"' + str(val) + '": Invalid value' - self.refresh_header() + self._refresh_header() - def show_vm_selection_by_guest_name(self): + def _show_vm_selection_by_guest_name(self): """Draws guest selection mask. Asks for a guest name until a valid guest name or '' is entered. @@ -1286,15 +1285,15 @@ class Tui(object): 'This might limit the shown data to the trace ' 'statistics.') self.screen.addstr(5, 0, msg) - self.print_all_gnames(7) + self._print_all_gnames(7) curses.echo() self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") gname = self.screen.getstr().decode(ENCODING) curses.noecho() if not gname: - self.refresh_header(0) - self.update_pid(0) + self._refresh_header(0) + self._update_pid(0) break else: pids = [] @@ -1311,17 +1310,17 @@ class Tui(object): msg = '"' + gname + '": Multiple matches found, use pid ' \ 'filter instead' continue - self.refresh_header(pids[0]) - self.update_pid(pids[0]) + self._refresh_header(pids[0]) + self._update_pid(pids[0]) break def show_stats(self): """Refreshes the screen and processes user input.""" sleeptime = self._delay_initial - self.refresh_header() + self._refresh_header() start = 0.0 # result based on init value never appears on screen while True: - self.refresh_body(time.time() - start) + self._refresh_body(time.time() - start) curses.halfdelay(int(sleeptime * 10)) start = time.time() sleeptime = self._delay_regular @@ -1330,32 +1329,33 @@ class Tui(object): if char == 'b': self._display_guests = not self._display_guests if self.stats.toggle_display_guests(self._display_guests): - self.show_msg(['Command not available with tracepoints' - ' enabled', 'Restart with debugfs only ' - '(see option \'-d\') and try again!']) + self._show_msg(['Command not available with ' + 'tracepoints enabled', 'Restart with ' + 'debugfs only (see option \'-d\') and ' + 'try again!']) self._display_guests = not self._display_guests - self.refresh_header() + self._refresh_header() if char == 'c': self.stats.fields_filter = DEFAULT_REGEX - self.refresh_header(0) - self.update_pid(0) + self._refresh_header(0) + self._update_pid(0) if char == 'f': curses.curs_set(1) - self.show_filter_selection() + self._show_filter_selection() curses.curs_set(0) sleeptime = self._delay_initial if char == 'g': curses.curs_set(1) - self.show_vm_selection_by_guest_name() + self._show_vm_selection_by_guest_name() curses.curs_set(0) sleeptime = self._delay_initial if char == 'h': - self.show_help_interactive() + self._show_help_interactive() if char == 'o': self._sorting = not self._sorting if char == 'p': curses.curs_set(1) - self.show_vm_selection_by_pid() + self._show_vm_selection_by_pid() curses.curs_set(0) sleeptime = self._delay_initial if char == 'q': @@ -1364,11 +1364,11 @@ class Tui(object): self.stats.reset() if char == 's': curses.curs_set(1) - self.show_set_update_interval() + self._show_set_update_interval() curses.curs_set(0) sleeptime = self._delay_initial if char == 'x': - self.update_drilldown() + self._update_drilldown() # prevents display of current values on next refresh self.stats.get(self._display_guests) except KeyboardInterrupt: -- cgit v1.2.3 From 516f1190a1e0cce12128a6446e6438745c8de62a Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 22 Feb 2018 12:16:27 +0100 Subject: tools/kvm_stat: eliminate extra guest/pid selection dialog We can do with a single dialog that takes both, pids and guest names. Note that we keep both interactive commands, 'p' and 'g' for now, to avoid confusion among users used to a specific key. While at it, we improve on some minor glitches regarding curses usage, e.g. cursor still visible when not supposed to be. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 110 ++++++++++++++-------------------------- tools/kvm/kvm_stat/kvm_stat.txt | 4 +- 2 files changed, 39 insertions(+), 75 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index c09b7428f563..0d5776785d27 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1041,6 +1041,8 @@ class Tui(object): def _update_pid(self, pid): """Propagates pid selection to stats object.""" + self.screen.addstr(4, 1, 'Updating pid filter...') + self.screen.refresh() self.stats.pid_filter = pid def _refresh_header(self, pid=None): @@ -1144,10 +1146,10 @@ class Tui(object): ' filters)', ' c clear filter', ' f filter by regular expression', - ' g filter by guest name', + ' g filter by guest name/PID', ' h display interactive commands reference', ' o toggle sorting order (Total vs CurAvg/s)', - ' p filter by PID', + ' p filter by guest name/PID', ' q quit', ' r reset stats', ' s set update interval', @@ -1199,44 +1201,6 @@ class Tui(object): msg = '"' + regex + '": Not a valid regular expression' continue - def _show_vm_selection_by_pid(self): - """Draws PID selection mask. - - Asks for a pid until a valid pid or 0 has been entered. - - """ - msg = '' - while True: - self.screen.erase() - self.screen.addstr(0, 0, - 'Show statistics for specific pid.', - curses.A_BOLD) - self.screen.addstr(1, 0, - 'This might limit the shown data to the trace ' - 'statistics.') - self.screen.addstr(5, 0, msg) - self._print_all_gnames(7) - - curses.echo() - self.screen.addstr(3, 0, "Pid [0 or pid]: ") - pid = self.screen.getstr().decode(ENCODING) - curses.noecho() - - try: - if len(pid) > 0: - pid = int(pid) - if pid != 0 and not os.path.isdir(os.path.join('/proc/', - str(pid))): - msg = '"' + str(pid) + '": Not a running process' - continue - else: - pid = 0 - self._refresh_header(pid) - self._update_pid(pid) - break - except ValueError: - msg = '"' + str(pid) + '": Not a valid pid' - def _show_set_update_interval(self): """Draws update interval selection mask.""" msg = '' @@ -1269,17 +1233,17 @@ class Tui(object): msg = '"' + str(val) + '": Invalid value' self._refresh_header() - def _show_vm_selection_by_guest_name(self): + def _show_vm_selection_by_guest(self): """Draws guest selection mask. - Asks for a guest name until a valid guest name or '' is entered. + Asks for a guest name or pid until a valid guest name or '' is entered. """ msg = '' while True: self.screen.erase() self.screen.addstr(0, 0, - 'Show statistics for specific guest.', + 'Show statistics for specific guest or pid.', curses.A_BOLD) self.screen.addstr(1, 0, 'This might limit the shown data to the trace ' @@ -1287,32 +1251,39 @@ class Tui(object): self.screen.addstr(5, 0, msg) self._print_all_gnames(7) curses.echo() - self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") - gname = self.screen.getstr().decode(ENCODING) + curses.curs_set(1) + self.screen.addstr(3, 0, "Guest or pid [ENTER exits]: ") + guest = self.screen.getstr().decode(ENCODING) curses.noecho() - if not gname: - self._refresh_header(0) - self._update_pid(0) + pid = 0 + if not guest or guest == '0': break - else: - pids = [] - try: - pids = self.get_pid_from_gname(gname) - except: - msg = '"' + gname + '": Internal error while searching, ' \ - 'use pid filter instead' - continue - if len(pids) == 0: - msg = '"' + gname + '": Not an active guest' + if guest.isdigit(): + if not os.path.isdir(os.path.join('/proc/', guest)): + msg = '"' + guest + '": Not a running process' continue - if len(pids) > 1: - msg = '"' + gname + '": Multiple matches found, use pid ' \ - 'filter instead' - continue - self._refresh_header(pids[0]) - self._update_pid(pids[0]) + pid = int(guest) break + pids = [] + try: + pids = self.get_pid_from_gname(guest) + except: + msg = '"' + guest + '": Internal error while searching, ' \ + 'use pid filter instead' + continue + if len(pids) == 0: + msg = '"' + guest + '": Not an active guest' + continue + if len(pids) > 1: + msg = '"' + guest + '": Multiple matches found, use pid ' \ + 'filter instead' + continue + pid = pids[0] + break + curses.curs_set(0) + self._refresh_header(pid) + self._update_pid(pid) def show_stats(self): """Refreshes the screen and processes user input.""" @@ -1344,20 +1315,13 @@ class Tui(object): self._show_filter_selection() curses.curs_set(0) sleeptime = self._delay_initial - if char == 'g': - curses.curs_set(1) - self._show_vm_selection_by_guest_name() - curses.curs_set(0) + if char == 'g' or char == 'p': + self._show_vm_selection_by_guest() sleeptime = self._delay_initial if char == 'h': self._show_help_interactive() if char == 'o': self._sorting = not self._sorting - if char == 'p': - curses.curs_set(1) - self._show_vm_selection_by_pid() - curses.curs_set(0) - sleeptime = self._delay_initial if char == 'q': break if char == 'r': diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index b5b3810c9e94..0811d860fe75 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -35,13 +35,13 @@ INTERACTIVE COMMANDS *f*:: filter by regular expression -*g*:: filter by guest name +*g*:: filter by guest name/PID *h*:: display interactive commands reference *o*:: toggle sorting order (Total vs CurAvg/s) -*p*:: filter by PID +*p*:: filter by guest name/PID *q*:: quit -- cgit v1.2.3 From 18e8f4100ef14f924514fbd91eb67bd5fa5396b7 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 22 Feb 2018 12:16:28 +0100 Subject: tools/kvm_stat: separate drilldown and fields filtering Drilldown (i.e. toggle display of child trace events) was implemented by overriding the fields filter. This resulted in inconsistencies: E.g. when drilldown was not active, adding a filter that also matches child trace events would not only filter fields according to the filter, but also add in the child trace events matching the filter. E.g. on x86, setting 'kvm_userspace_exit' as the fields filter after startup would result in display of kvm_userspace_exit(DCR), although that wasn't previously present - not exactly what one would expect from a filter. This patch addresses the issue by keeping drilldown and fields filter separate. While at it, we also fix a PEP8 issue by adding a blank line at one place (since we're in the area...). We implement this by adding a framework that also allows to define a taxonomy among the debugfs events to identify child trace events. I.e. drilldown using 'x' can now also work with debugfs. A respective parent- child relationship is only known for S390 at the moment, but could be added adjusting other platforms' ARCH.dbg_is_child() methods accordingly. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 143 +++++++++++++++++++++++++++++++------------- 1 file changed, 100 insertions(+), 43 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 0d5776785d27..6b6630ee6daf 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -228,6 +228,7 @@ IOCTL_NUMBERS = { } ENCODING = locale.getpreferredencoding(False) +TRACE_FILTER = re.compile(r'^[^\(]*$') class Arch(object): @@ -260,6 +261,11 @@ class Arch(object): return ArchX86(SVM_EXIT_REASONS) return + def tracepoint_is_child(self, field): + if (TRACE_FILTER.match(field)): + return None + return field.split('(', 1)[0] + class ArchX86(Arch): def __init__(self, exit_reasons): @@ -267,6 +273,10 @@ class ArchX86(Arch): self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = exit_reasons + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + return None + class ArchPPC(Arch): def __init__(self): @@ -282,6 +292,10 @@ class ArchPPC(Arch): self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 self.exit_reasons = {} + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + return None + class ArchA64(Arch): def __init__(self): @@ -289,6 +303,10 @@ class ArchA64(Arch): self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = AARCH64_EXIT_REASONS + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + return None + class ArchS390(Arch): def __init__(self): @@ -296,6 +314,12 @@ class ArchS390(Arch): self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = None + def debugfs_is_child(self, field): + """ Returns name of parent if 'field' is a child, None otherwise """ + if field.startswith('instruction_'): + return 'exit_instruction' + + ARCH = Arch.get_arch() @@ -472,6 +496,10 @@ class Event(object): class Provider(object): """Encapsulates functionalities used by all providers.""" + def __init__(self, pid): + self.child_events = False + self.pid = pid + @staticmethod def is_field_wanted(fields_filter, field): """Indicate whether field is valid according to fields_filter.""" @@ -499,7 +527,7 @@ class TracepointProvider(Provider): self.group_leaders = [] self.filters = self._get_filters() self.update_fields(fields_filter) - self.pid = pid + super(TracepointProvider, self).__init__(pid) @staticmethod def _get_filters(): @@ -519,7 +547,7 @@ class TracepointProvider(Provider): return filters def _get_available_fields(self): - """Returns a list of available event's of format 'event name(filter + """Returns a list of available events of format 'event name(filter name)'. All available events have directories under @@ -547,7 +575,8 @@ class TracepointProvider(Provider): def update_fields(self, fields_filter): """Refresh fields, applying fields_filter""" self.fields = [field for field in self._get_available_fields() - if self.is_field_wanted(fields_filter, field)] + if self.is_field_wanted(fields_filter, field) or + ARCH.tracepoint_is_child(field)] @staticmethod def _get_online_cpus(): @@ -668,8 +697,12 @@ class TracepointProvider(Provider): ret = defaultdict(int) for group in self.group_leaders: for name, val in group.read().items(): - if name in self._fields: - ret[name] += val + if name not in self._fields: + continue + parent = ARCH.tracepoint_is_child(name) + if parent: + name += ' ' + parent + ret[name] += val return ret def reset(self): @@ -687,7 +720,7 @@ class DebugfsProvider(Provider): self._baseline = {} self.do_read = True self.paths = [] - self.pid = pid + super(DebugfsProvider, self).__init__(pid) if include_past: self._restore() @@ -702,7 +735,8 @@ class DebugfsProvider(Provider): def update_fields(self, fields_filter): """Refresh fields, applying fields_filter""" self._fields = [field for field in self._get_available_fields() - if self.is_field_wanted(fields_filter, field)] + if self.is_field_wanted(fields_filter, field) or + ARCH.debugfs_is_child(field)] @property def fields(self): @@ -763,14 +797,15 @@ class DebugfsProvider(Provider): self._baseline[key] = 0 if self._baseline.get(key, -1) == -1: self._baseline[key] = value - increment = (results.get(field, 0) + value - - self._baseline.get(key, 0)) - if by_guest: - pid = key.split('-')[0] - if pid in results: - results[pid] += increment - else: - results[pid] = increment + parent = ARCH.debugfs_is_child(field) + if parent: + field = field + ' ' + parent + else: + if by_guest: + field = key.split('-')[0] # set 'field' to 'pid' + increment = value - self._baseline.get(key, 0) + if field in results: + results[field] += increment else: results[field] = increment @@ -812,6 +847,7 @@ class Stats(object): self._pid_filter = options.pid self._fields_filter = options.fields self.values = {} + self._child_events = False def _get_providers(self, options): """Returns a list of data providers depending on the passed options.""" @@ -860,12 +896,29 @@ class Stats(object): for provider in self.providers: provider.pid = self._pid_filter + @property + def child_events(self): + return self._child_events + + @child_events.setter + def child_events(self, val): + self._child_events = val + for provider in self.providers: + provider.child_events = val + def get(self, by_guest=0): """Returns a dict with field -> (value, delta to last value) of all - provider data.""" + provider data. + Key formats: + * plain: 'key' is event name + * child-parent: 'key' is in format ' ' + * pid: 'key' is the pid of the guest, and the record contains the + aggregated event data + These formats are generated by the providers, and handled in class TUI. + """ for provider in self.providers: new = provider.read(by_guest=by_guest) - for key in new if by_guest else provider.fields: + for key in new: oldval = self.values.get(key, EventStat(0, 0)).value newval = new.get(key, 0) newdelta = newval - oldval @@ -898,10 +951,10 @@ class Stats(object): self.get(to_pid) return 0 + DELAY_DEFAULT = 3.0 MAX_GUEST_NAME_LEN = 48 MAX_REGEX_LEN = 44 -DEFAULT_REGEX = r'^[^\(]*$' SORT_DEFAULT = 0 @@ -1031,14 +1084,6 @@ class Tui(object): return name - def _update_drilldown(self): - """Sets or removes a filter that only allows fields without braces.""" - if not self.stats.fields_filter: - self.stats.fields_filter = DEFAULT_REGEX - - elif self.stats.fields_filter == DEFAULT_REGEX: - self.stats.fields_filter = None - def _update_pid(self, pid): """Propagates pid selection to stats object.""" self.screen.addstr(4, 1, 'Updating pid filter...') @@ -1060,8 +1105,7 @@ class Tui(object): .format(pid, gname), curses.A_BOLD) else: self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) - if self.stats.fields_filter and self.stats.fields_filter \ - != DEFAULT_REGEX: + if self.stats.fields_filter: regex = self.stats.fields_filter if len(regex) > MAX_REGEX_LEN: regex = regex[:MAX_REGEX_LEN] + '...' @@ -1077,6 +1121,9 @@ class Tui(object): self.screen.refresh() def _refresh_body(self, sleeptime): + def is_child_field(field): + return field.find('(') != -1 + row = 3 self.screen.move(row, 0) self.screen.clrtobot() @@ -1084,7 +1131,11 @@ class Tui(object): total = 0. ctotal = 0. for key, values in stats.items(): - if key.find('(') == -1: + if self._display_guests: + if self.get_gname_from_pid(key): + total += values.value + continue + if not key.find(' ') != -1: total += values.value else: ctotal += values.value @@ -1101,19 +1152,26 @@ class Tui(object): # sort by overall value return v.value + sorted_items = sorted(stats.items(), key=sortkey, reverse=True) + + # print events tavg = 0 - for key, values in sorted(stats.items(), key=sortkey, reverse=True): + for key, values in sorted_items: if row >= self.screen.getmaxyx()[0] - 1: break - if not values.value and not values.delta: - break + if values == (0, 0): + continue + if not self.stats.child_events and key.find(' ') != -1: + continue if values.value is not None: cur = int(round(values.delta / sleeptime)) if values.delta else '' if self._display_guests: key = self.get_gname_from_pid(key) - self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % - (key, values.value, values.value * 100 / total, - cur)) + if not key: + continue + self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % (key + .split(' ')[0], values.value, + values.value * 100 / total, cur)) if cur != '' and key.find('(') == -1: tavg += cur row += 1 @@ -1189,7 +1247,7 @@ class Tui(object): regex = self.screen.getstr().decode(ENCODING) curses.noecho() if len(regex) == 0: - self.stats.fields_filter = DEFAULT_REGEX + self.stats.fields_filter = '' self._refresh_header() return try: @@ -1307,7 +1365,7 @@ class Tui(object): self._display_guests = not self._display_guests self._refresh_header() if char == 'c': - self.stats.fields_filter = DEFAULT_REGEX + self.stats.fields_filter = '' self._refresh_header(0) self._update_pid(0) if char == 'f': @@ -1332,9 +1390,7 @@ class Tui(object): curses.curs_set(0) sleeptime = self._delay_initial if char == 'x': - self._update_drilldown() - # prevents display of current values on next refresh - self.stats.get(self._display_guests) + self.stats.child_events = not self.stats.child_events except KeyboardInterrupt: break except curses.error: @@ -1348,7 +1404,8 @@ def batch(stats): time.sleep(1) s = stats.get() for key, values in sorted(s.items()): - print('%-42s%10d%10d' % (key, values.value, values.delta)) + print('%-42s%10d%10d' % (key.split(' ')[0], values.value, + values.delta)) except KeyboardInterrupt: pass @@ -1359,7 +1416,7 @@ def log(stats): def banner(): for key in keys: - print(key, end=' ') + print(key.split(' ')[0], end=' ') print() def statline(): @@ -1470,7 +1527,7 @@ Press any other key to refresh statistics immediately. ) optparser.add_option('-f', '--fields', action='store', - default=DEFAULT_REGEX, + default='', dest='fields', help='''fields to display (regex) "-f help" for a list of available events''', -- cgit v1.2.3 From df72ecfc790fa01de1c41f836ff51d12f9c40318 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 22 Feb 2018 12:16:29 +0100 Subject: tools/kvm_stat: group child events indented after parent We keep the current logic that sorts all events (parent and child), but re-shuffle the events afterwards, grouping the children after the respective parent. Note that the percentage column for child events gives the percentage of the parent's total. Since we rework the logic anyway, we modify the total average calculation to use the raw numbers instead of the (rounded) averages. Note that this can result in differing numbers (between total average and the sum of the individual averages) due to rounding errors. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 89 ++++++++++++++++++++++++++++++--------------- 1 file changed, 59 insertions(+), 30 deletions(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 6b6630ee6daf..862c997932e2 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1124,6 +1124,45 @@ class Tui(object): def is_child_field(field): return field.find('(') != -1 + def insert_child(sorted_items, child, values, parent): + num = len(sorted_items) + for i in range(0, num): + # only add child if parent is present + if parent.startswith(sorted_items[i][0]): + sorted_items.insert(i + 1, (' ' + child, values)) + + def get_sorted_events(self, stats): + """ separate parent and child events """ + if self._sorting == SORT_DEFAULT: + def sortkey((_k, v)): + # sort by (delta value, overall value) + return (v.delta, v.value) + else: + def sortkey((_k, v)): + # sort by overall value + return v.value + + childs = [] + sorted_items = [] + # we can't rule out child events to appear prior to parents even + # when sorted - separate out all children first, and add in later + for key, values in sorted(stats.items(), key=sortkey, + reverse=True): + if values == (0, 0): + continue + if key.find(' ') != -1: + if not self.stats.child_events: + continue + childs.insert(0, (key, values)) + else: + sorted_items.append((key, values)) + if self.stats.child_events: + for key, values in childs: + (child, parent) = key.split(' ') + insert_child(sorted_items, child, values, parent) + + return sorted_items + row = 3 self.screen.move(row, 0) self.screen.clrtobot() @@ -1143,44 +1182,34 @@ class Tui(object): # we don't have any fields, or all non-child events are filtered total = ctotal - if self._sorting == SORT_DEFAULT: - def sortkey((_k, v)): - # sort by (delta value, overall value) - return (v.delta, v.value) - else: - def sortkey((_k, v)): - # sort by overall value - return v.value - - sorted_items = sorted(stats.items(), key=sortkey, reverse=True) - # print events tavg = 0 - for key, values in sorted_items: - if row >= self.screen.getmaxyx()[0] - 1: + tcur = 0 + for key, values in get_sorted_events(self, stats): + if row >= self.screen.getmaxyx()[0] - 1 or values == (0, 0): break - if values == (0, 0): - continue - if not self.stats.child_events and key.find(' ') != -1: - continue - if values.value is not None: - cur = int(round(values.delta / sleeptime)) if values.delta else '' - if self._display_guests: - key = self.get_gname_from_pid(key) - if not key: - continue - self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % (key - .split(' ')[0], values.value, - values.value * 100 / total, cur)) - if cur != '' and key.find('(') == -1: - tavg += cur + if self._display_guests: + key = self.get_gname_from_pid(key) + if not key: + continue + cur = int(round(values.delta / sleeptime)) if values.delta else '' + if key[0] != ' ': + if values.delta: + tcur += values.delta + ptotal = values.value + ltotal = total + else: + ltotal = ptotal + self.screen.addstr(row, 1, '%-40s %10d%7.1f %8s' % (key, + values.value, + values.value * 100 / float(ltotal), cur)) row += 1 if row == 3: self.screen.addstr(4, 1, 'No matching events reported yet') else: + tavg = int(round(tcur / sleeptime)) if tcur > 0 else '' self.screen.addstr(row, 1, '%-40s %10d %8s' % - ('Total', total, tavg if tavg else ''), - curses.A_BOLD) + ('Total', total, tavg), curses.A_BOLD) self.screen.refresh() def _show_msg(self, text): -- cgit v1.2.3 From 6789af030a462708f937137e05eb12ea009fb348 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Thu, 22 Feb 2018 12:16:30 +0100 Subject: tools/kvm_stat: print 'Total' line for multiple events only The 'Total' line looks a bit weird when we have a single event only. This can happen e.g. due to filters. Therefore suppress when there's only a single event in the output. Signed-off-by: Stefan Raspl Signed-off-by: Paolo Bonzini --- tools/kvm/kvm_stat/kvm_stat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 862c997932e2..5898c22ba310 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1206,7 +1206,7 @@ class Tui(object): row += 1 if row == 3: self.screen.addstr(4, 1, 'No matching events reported yet') - else: + if row > 4: tavg = int(round(tcur / sleeptime)) if tcur > 0 else '' self.screen.addstr(row, 1, '%-40s %10d %8s' % ('Total', total, tavg), curses.A_BOLD) -- cgit v1.2.3 From 076467490b8176eb96eddc548a14d4135c7b5852 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 22 Feb 2018 13:05:41 +0100 Subject: kvm: fix warning for CONFIG_HAVE_KVM_EVENTFD builds Move the kvm_arch_irq_routing_update() prototype outside of ifdef CONFIG_HAVE_KVM_EVENTFD guards to fix the following sparse warning: arch/s390/kvm/../../../virt/kvm/irqchip.c:171:28: warning: symbol 'kvm_arch_irq_routing_update' was not declared. Should it be static? Signed-off-by: Sebastian Ott Acked-by: Christian Borntraeger Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac0062b74aed..84b9c50693f2 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1105,7 +1105,6 @@ static inline void kvm_irq_routing_update(struct kvm *kvm) { } #endif -void kvm_arch_irq_routing_update(struct kvm *kvm); static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) { @@ -1114,6 +1113,8 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args) #endif /* CONFIG_HAVE_KVM_EVENTFD */ +void kvm_arch_irq_routing_update(struct kvm *kvm); + static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) { /* -- cgit v1.2.3 From f75e4924f0152be747bf04c9d16bb23fd8baf5f9 Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Thu, 22 Feb 2018 13:04:39 +0100 Subject: kvm: fix warning for non-x86 builds Fix the following sparse warning by moving the prototype of kvm_arch_mmu_notifier_invalidate_range() to linux/kvm_host.h . CHECK arch/s390/kvm/../../../virt/kvm/kvm_main.c arch/s390/kvm/../../../virt/kvm/kvm_main.c:138:13: warning: symbol 'kvm_arch_mmu_notifier_invalidate_range' was not declared. Should it be static? Signed-off-by: Sebastian Ott Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 --- include/linux/kvm_host.h | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd6f57a54a26..0a9e330b34f0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1464,7 +1464,4 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); - #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84b9c50693f2..6930c63126c7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1273,4 +1273,7 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end); + #endif -- cgit v1.2.3 From fe2a3027e74e40a3ece3a4c1e4e51403090a907a Mon Sep 17 00:00:00 2001 From: Radim Krčmář Date: Thu, 1 Feb 2018 22:16:21 +0100 Subject: KVM: x86: fix backward migration with async_PF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Guests on new hypersiors might set KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT bit when enabling async_PF, but this bit is reserved on old hypervisors, which results in a failure upon migration. To avoid breaking different cases, we are checking for CPUID feature bit before enabling the feature and nothing else. Fixes: 52a5c155cf79 ("KVM: async_pf: Let guest support delivery of async_pf from guest mode") Cc: Reviewed-by: Wanpeng Li Reviewed-by: David Hildenbrand Signed-off-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- Documentation/virtual/kvm/cpuid.txt | 4 ++++ Documentation/virtual/kvm/msr.txt | 3 ++- arch/x86/include/uapi/asm/kvm_para.h | 1 + arch/x86/kernel/kvm.c | 8 ++++---- arch/x86/kvm/cpuid.c | 3 ++- 5 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Documentation/virtual/kvm/cpuid.txt b/Documentation/virtual/kvm/cpuid.txt index dcab6dc11e3b..87a7506f31c2 100644 --- a/Documentation/virtual/kvm/cpuid.txt +++ b/Documentation/virtual/kvm/cpuid.txt @@ -58,6 +58,10 @@ KVM_FEATURE_PV_TLB_FLUSH || 9 || guest checks this feature bit || || before enabling paravirtualized || || tlb flush. ------------------------------------------------------------------------------ +KVM_FEATURE_ASYNC_PF_VMEXIT || 10 || paravirtualized async PF VM exit + || || can be enabled by setting bit 2 + || || when writing to msr 0x4b564d02 +------------------------------------------------------------------------------ KVM_FEATURE_CLOCKSOURCE_STABLE_BIT || 24 || host will warn if no guest-side || || per-cpu warps are expected in || || kvmclock. diff --git a/Documentation/virtual/kvm/msr.txt b/Documentation/virtual/kvm/msr.txt index 1ebecc115dc6..f3f0d57ced8e 100644 --- a/Documentation/virtual/kvm/msr.txt +++ b/Documentation/virtual/kvm/msr.txt @@ -170,7 +170,8 @@ MSR_KVM_ASYNC_PF_EN: 0x4b564d02 when asynchronous page faults are enabled on the vcpu 0 when disabled. Bit 1 is 1 if asynchronous page faults can be injected when vcpu is in cpl == 0. Bit 2 is 1 if asynchronous page faults - are delivered to L1 as #PF vmexits. + are delivered to L1 as #PF vmexits. Bit 2 can be set only if + KVM_FEATURE_ASYNC_PF_VMEXIT is present in CPUID. First 4 byte of 64 byte memory location will be written to by the hypervisor at the time of asynchronous page fault (APF) diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 7a2ade4aa235..6cfa9c8cb7d6 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -26,6 +26,7 @@ #define KVM_FEATURE_PV_EOI 6 #define KVM_FEATURE_PV_UNHALT 7 #define KVM_FEATURE_PV_TLB_FLUSH 9 +#define KVM_FEATURE_ASYNC_PF_VMEXIT 10 /* The last 8 bits are used to indicate how to interpret the flags field * in pvclock structure. If no bits are set, all flags are ignored. diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 4e37d1a851a6..971babe964d2 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -341,10 +341,10 @@ static void kvm_guest_cpu_init(void) #endif pa |= KVM_ASYNC_PF_ENABLED; - /* Async page fault support for L1 hypervisor is optional */ - if (wrmsr_safe(MSR_KVM_ASYNC_PF_EN, - (pa | KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT) & 0xffffffff, pa >> 32) < 0) - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); + if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) + pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; + + wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(apf_reason.enabled, 1); printk(KERN_INFO"KVM setup async PF for cpu %d\n", smp_processor_id()); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index a0c5a69bc7c4..b671fc2d0422 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -607,7 +607,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, (1 << KVM_FEATURE_PV_EOI) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) | (1 << KVM_FEATURE_PV_UNHALT) | - (1 << KVM_FEATURE_PV_TLB_FLUSH); + (1 << KVM_FEATURE_PV_TLB_FLUSH) | + (1 << KVM_FEATURE_ASYNC_PF_VMEXIT); if (sched_info_on()) entry->eax |= (1 << KVM_FEATURE_STEAL_TIME); -- cgit v1.2.3 From afdc3f588850a6fbc996205ee2d472eb4426afb3 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Wed, 17 Jan 2018 11:46:54 +0800 Subject: x86/kvm: Make parse_no_xxx __init for kvm The early_param() is only called during kernel initialization, So Linux marks the functions of it with __init macro to save memory. But it forgot to mark the parse_no_kvmapf/stealacc/kvmclock_vsyscall, So, Make them __init as well. Cc: Paolo Bonzini Cc: rkrcmar@redhat.com Cc: kvm@vger.kernel.org Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Juergen Gross Cc: x86@kernel.org Signed-off-by: Dou Liyang Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 971babe964d2..ee7d5c951864 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -49,7 +49,7 @@ static int kvmapf = 1; -static int parse_no_kvmapf(char *arg) +static int __init parse_no_kvmapf(char *arg) { kvmapf = 0; return 0; @@ -58,7 +58,7 @@ static int parse_no_kvmapf(char *arg) early_param("no-kvmapf", parse_no_kvmapf); static int steal_acc = 1; -static int parse_no_stealacc(char *arg) +static int __init parse_no_stealacc(char *arg) { steal_acc = 0; return 0; @@ -67,7 +67,7 @@ static int parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); static int kvmclock_vsyscall = 1; -static int parse_no_kvmclock_vsyscall(char *arg) +static int __init parse_no_kvmclock_vsyscall(char *arg) { kvmclock_vsyscall = 0; return 0; -- cgit v1.2.3 From 4f2f61fc507176edd65826fbedc8987dea29b9d5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 4 Feb 2018 22:57:58 -0800 Subject: KVM: X86: Avoid traversing all the cpus for pv tlb flush when steal time is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Avoid traversing all the cpus for pv tlb flush when steal time is disabled since pv tlb flush depends on the field in steal time for shared data. Cc: Paolo Bonzini Cc: Radim KrÄmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kernel/kvm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index ee7d5c951864..bc1a27280c4b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -545,7 +545,8 @@ static void __init kvm_guest_init(void) pv_time_ops.steal_clock = kvm_steal_clock; } - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others; if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) @@ -633,7 +634,8 @@ static __init int kvm_setup_pv_tlb_flush(void) { int cpu; - if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH)) { + if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) && + !kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { for_each_possible_cpu(cpu) { zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu), GFP_KERNEL, cpu_to_node(cpu)); -- cgit v1.2.3 From e5699f56bc91a286f006b0728085e0b4e8f5749b Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 15 Jan 2018 07:32:02 -0600 Subject: crypto: ccp: Fix sparse, use plain integer as NULL pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix sparse warning: Using plain integer as NULL pointer. Replaces assignment of 0 to pointer with NULL assignment. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Borislav Petkov Cc: Herbert Xu Cc: Gary Hook Cc: Tom Lendacky Cc: linux-crypto@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- drivers/crypto/ccp/psp-dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/crypto/ccp/psp-dev.c b/drivers/crypto/ccp/psp-dev.c index fcfa5b1eae61..b3afb6cc9d72 100644 --- a/drivers/crypto/ccp/psp-dev.c +++ b/drivers/crypto/ccp/psp-dev.c @@ -211,7 +211,7 @@ static int __sev_platform_shutdown_locked(int *error) { int ret; - ret = __sev_do_cmd_locked(SEV_CMD_SHUTDOWN, 0, error); + ret = __sev_do_cmd_locked(SEV_CMD_SHUTDOWN, NULL, error); if (ret) return ret; @@ -271,7 +271,7 @@ static int sev_ioctl_do_reset(struct sev_issue_cmd *argp) return rc; } - return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, 0, &argp->error); + return __sev_do_cmd_locked(SEV_CMD_FACTORY_RESET, NULL, &argp->error); } static int sev_ioctl_do_platform_status(struct sev_issue_cmd *argp) @@ -299,7 +299,7 @@ static int sev_ioctl_do_pek_pdh_gen(int cmd, struct sev_issue_cmd *argp) return rc; } - return __sev_do_cmd_locked(cmd, 0, &argp->error); + return __sev_do_cmd_locked(cmd, NULL, &argp->error); } static int sev_ioctl_do_pek_csr(struct sev_issue_cmd *argp) @@ -624,7 +624,7 @@ EXPORT_SYMBOL_GPL(sev_guest_decommission); int sev_guest_df_flush(int *error) { - return sev_do_cmd(SEV_CMD_DF_FLUSH, 0, error); + return sev_do_cmd(SEV_CMD_DF_FLUSH, NULL, error); } EXPORT_SYMBOL_GPL(sev_guest_df_flush); -- cgit v1.2.3 From 45d0be876308bf2f858559e84455219eadd9ddc7 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 15 Jan 2018 07:32:04 -0600 Subject: include: psp-sev: Capitalize invalid length enum MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 1d57b17c60ff ("crypto: ccp: Define SEV userspace ioctl and command id") added the invalid length enum but we missed capitalizing it. Fixes: 1d57b17c60ff (crypto: ccp: Define SEV userspace ioctl ...) Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Borislav Petkov Cc: Tom Lendacky CC: Gary R Hook Cc: linux-kernel@vger.kernel.org Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- include/uapi/linux/psp-sev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/psp-sev.h b/include/uapi/linux/psp-sev.h index 3d77fe91239a..9008f31c7eb6 100644 --- a/include/uapi/linux/psp-sev.h +++ b/include/uapi/linux/psp-sev.h @@ -42,7 +42,7 @@ typedef enum { SEV_RET_INVALID_PLATFORM_STATE, SEV_RET_INVALID_GUEST_STATE, SEV_RET_INAVLID_CONFIG, - SEV_RET_INVALID_len, + SEV_RET_INVALID_LEN, SEV_RET_ALREADY_OWNED, SEV_RET_INVALID_CERTIFICATE, SEV_RET_POLICY_FAILURE, -- cgit v1.2.3 From 3e233385ef4a217a2812115ed84d4be36eb16817 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Fri, 23 Feb 2018 12:36:50 -0600 Subject: KVM: SVM: no need to call access_ok() in LAUNCH_MEASURE command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the access_ok() to validate the input before issuing the SEV command does not buy us anything in this case. If userland is giving us a garbage pointer then copy_to_user() will catch it when we try to return the measurement. Suggested-by: Al Viro Fixes: 0d0736f76347 (KVM: SVM: Add support for KVM_SEV_LAUNCH_MEASURE ...) Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Borislav Petkov Cc: Tom Lendacky Cc: linux-kernel@vger.kernel.org Cc: Joerg Roedel Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b3e488a74828..ca69d53d7e6d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6236,16 +6236,18 @@ e_free: static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) { + void __user *measure = (void __user *)(uintptr_t)argp->data; struct kvm_sev_info *sev = &kvm->arch.sev_info; struct sev_data_launch_measure *data; struct kvm_sev_launch_measure params; + void __user *p = NULL; void *blob = NULL; int ret; if (!sev_guest(kvm)) return -ENOTTY; - if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params))) + if (copy_from_user(¶ms, measure, sizeof(params))) return -EFAULT; data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -6256,17 +6258,13 @@ static int sev_launch_measure(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!params.len) goto cmd; - if (params.uaddr) { + p = (void __user *)(uintptr_t)params.uaddr; + if (p) { if (params.len > SEV_FW_BLOB_MAX_SIZE) { ret = -EINVAL; goto e_free; } - if (!access_ok(VERIFY_WRITE, params.uaddr, params.len)) { - ret = -EFAULT; - goto e_free; - } - ret = -ENOMEM; blob = kmalloc(params.len, GFP_KERNEL); if (!blob) @@ -6290,13 +6288,13 @@ cmd: goto e_free_blob; if (blob) { - if (copy_to_user((void __user *)(uintptr_t)params.uaddr, blob, params.len)) + if (copy_to_user(p, blob, params.len)) ret = -EFAULT; } done: params.len = data->len; - if (copy_to_user((void __user *)(uintptr_t)argp->data, ¶ms, sizeof(params))) + if (copy_to_user(measure, ¶ms, sizeof(params))) ret = -EFAULT; e_free_blob: kfree(blob); -- cgit v1.2.3 From 7607b7174405aec7441ff6c970833c463114040a Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 19 Feb 2018 10:14:44 -0600 Subject: KVM: SVM: install RSM intercept MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RSM instruction is used by the SMM handler to return from SMM mode. Currently, rsm causes a #UD - which results in instruction fetch, decode, and emulate. By installing the RSM intercept we can avoid the instruction fetch since we know that #VMEXIT was due to rsm. The patch is required for the SEV guest, because in case of SEV guest memory is encrypted with guest-specific key and hypervisor will not able to fetch the instruction bytes from the guest memory. Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Joerg Roedel Cc: Borislav Petkov Cc: Tom Lendacky Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ca69d53d7e6d..4aeb665ffbb0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -300,6 +300,8 @@ module_param(vgif, int, 0444); static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT); module_param(sev, int, 0444); +static u8 rsm_ins_bytes[] = "\x0f\xaa"; + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -1383,6 +1385,7 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); set_intercept(svm, INTERCEPT_XSETBV); + set_intercept(svm, INTERCEPT_RSM); if (!kvm_mwait_in_guest()) { set_intercept(svm, INTERCEPT_MONITOR); @@ -3699,6 +3702,12 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +static int rsm_interception(struct vcpu_svm *svm) +{ + return x86_emulate_instruction(&svm->vcpu, 0, 0, + rsm_ins_bytes, 2) == EMULATE_DONE; +} + static int rdpmc_interception(struct vcpu_svm *svm) { int err; @@ -4541,7 +4550,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_MWAIT] = mwait_interception, [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = npf_interception, - [SVM_EXIT_RSM] = emulate_on_interception, + [SVM_EXIT_RSM] = rsm_interception, [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, }; -- cgit v1.2.3 From 9c5e0afaf15788bcbd1c3469da701ac3da826886 Mon Sep 17 00:00:00 2001 From: Brijesh Singh Date: Mon, 19 Feb 2018 10:13:25 -0600 Subject: KVM: SVM: Fix SEV LAUNCH_SECRET command MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SEV LAUNCH_SECRET command fails with error code 'invalid param' because we missed filling the guest and header system physical address while issuing the command. Fixes: 9f5b5b950aa9 (KVM: SVM: Add support for SEV LAUNCH_SECRET command) Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Borislav Petkov Cc: Tom Lendacky Cc: linux-kernel@vger.kernel.org Cc: Joerg Roedel Signed-off-by: Brijesh Singh Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4aeb665ffbb0..3d8377f75eda 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -6604,7 +6604,7 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) struct page **pages; void *blob, *hdr; unsigned long n; - int ret; + int ret, offset; if (!sev_guest(kvm)) return -ENOTTY; @@ -6630,6 +6630,10 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) if (!data) goto e_unpin_memory; + offset = params.guest_uaddr & (PAGE_SIZE - 1); + data->guest_address = __sme_page_pa(pages[0]) + offset; + data->guest_len = params.guest_len; + blob = psp_copy_user_blob(params.trans_uaddr, params.trans_len); if (IS_ERR(blob)) { ret = PTR_ERR(blob); @@ -6644,8 +6648,8 @@ static int sev_launch_secret(struct kvm *kvm, struct kvm_sev_cmd *argp) ret = PTR_ERR(hdr); goto e_free_blob; } - data->trans_address = __psp_pa(blob); - data->trans_len = params.trans_len; + data->hdr_address = __psp_pa(hdr); + data->hdr_len = params.hdr_len; data->handle = sev->handle; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_SECRET, data, &argp->error); -- cgit v1.2.3