From a9c788f07347cf5dfcbe4bf77bf0f3a2a04c89f1 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 23 May 2019 18:43:02 +0200 Subject: KVM: selftests: Guard struct kvm_vcpu_events with __KVM_HAVE_VCPU_EVENTS The struct kvm_vcpu_events code is only available on certain architectures (arm, arm64 and x86). To be able to compile kvm_util.c also for other architectures, we have to fence the code with __KVM_HAVE_VCPU_EVENTS. Reviewed-by: David Hildenbrand Reviewed-by: Andrew Jones Signed-off-by: Thomas Huth Message-Id: <20190523164309.13345-3-thuth@redhat.com> Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/include/kvm_util.h | 2 ++ tools/testing/selftests/kvm/lib/kvm_util.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index a5a4b28f14d8..b8bf961074fe 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -114,10 +114,12 @@ void vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs); int _vcpu_sregs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_sregs *sregs); +#ifdef __KVM_HAVE_VCPU_EVENTS void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events); void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events); +#endif #ifdef __x86_64__ void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_nested_state *state); diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 633b22df46a4..9ea9ba28af1f 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -1222,6 +1222,7 @@ void vcpu_regs_set(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_regs *regs) ret, errno); } +#ifdef __KVM_HAVE_VCPU_EVENTS void vcpu_events_get(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_vcpu_events *events) { @@ -1247,6 +1248,7 @@ void vcpu_events_set(struct kvm_vm *vm, uint32_t vcpuid, TEST_ASSERT(ret == 0, "KVM_SET_VCPU_EVENTS, failed, rc: %i errno: %i", ret, errno); } +#endif #ifdef __x86_64__ void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid, -- cgit v1.2.3 From fe85ec86fcd5998389274b6e12977bcf8509d3e0 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 23 May 2019 18:43:04 +0200 Subject: KVM: selftests: Introduce a VM_MODE_DEFAULT macro for the default bits This will be required later for tests like the kvm_create_max_vcpus test that do not use the vm_create_default() function. 
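For illustration, a test that creates its VM directly (rather than through vm_create_default()) can then stay architecture-neutral. A minimal sketch using the selftest API from this series:

	struct kvm_vm *vm;

	/* resolves to VM_MODE_P40V48_4K on aarch64, VM_MODE_P52V48_4K elsewhere */
	vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR);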
Reviewed-by: Andrew Jones Signed-off-by: Thomas Huth Message-Id: <20190523164309.13345-5-thuth@redhat.com> Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/include/kvm_util.h | 6 ++++++ tools/testing/selftests/kvm/lib/aarch64/processor.c | 2 +- tools/testing/selftests/kvm/lib/x86_64/processor.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index b8bf961074fe..b6eb6471e6b2 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -43,6 +43,12 @@ enum vm_guest_mode { NUM_VM_MODES, }; +#ifdef __aarch64__ +#define VM_MODE_DEFAULT VM_MODE_P40V48_4K +#else +#define VM_MODE_DEFAULT VM_MODE_P52V48_4K +#endif + #define vm_guest_mode_string(m) vm_guest_mode_string[m] extern const char * const vm_guest_mode_string[]; diff --git a/tools/testing/selftests/kvm/lib/aarch64/processor.c b/tools/testing/selftests/kvm/lib/aarch64/processor.c index 19e667911496..c278e2f68b10 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/processor.c +++ b/tools/testing/selftests/kvm/lib/aarch64/processor.c @@ -227,7 +227,7 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, uint64_t extra_pg_pages = (extra_mem_pages / ptrs_per_4k_pte) * 2; struct kvm_vm *vm; - vm = vm_create(VM_MODE_P40V48_4K, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); kvm_vm_elf_load(vm, program_invocation_name, 0, 0); vm_vcpu_add_default(vm, vcpuid, guest_code); diff --git a/tools/testing/selftests/kvm/lib/x86_64/processor.c b/tools/testing/selftests/kvm/lib/x86_64/processor.c index 21f3040d90cb..90ed1d1fe963 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/processor.c +++ b/tools/testing/selftests/kvm/lib/x86_64/processor.c @@ -821,7 +821,7 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, uint64_t extra_pg_pages = extra_mem_pages / 512 * 2; /* Create VM */ - vm = vm_create(VM_MODE_P52V48_4K, + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); -- cgit v1.2.3 From da2a2d601c2d95bedf23b896bd409d53c92c04ad Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 23 May 2019 18:43:05 +0200 Subject: KVM: selftests: Align memory region addresses to 1M on s390x On s390x, there is a constraint that memory regions have to be aligned to 1M (otherwise running the VM will fail). Introduce a new "alignment" variable in the vm_userspace_mem_region_add() function, which can now be used for both the huge-page and the s390x alignment requirements. 
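As a sketch of the resulting logic (align() is the existing helper in this file that rounds an address up to a power-of-two boundary):

	alignment = 0x100000;                     /* 1M on s390x, 1 elsewhere */
	if (src_type == VM_MEM_SRC_ANONYMOUS_THP)
		alignment = max(huge_page_size, alignment);
	region->mmap_size += alignment;           /* slack so we can round up */
	region->host_mem = align(region->mmap_start, alignment);
	/* e.g. align(0x7ffff1234000, 0x100000) yields 0x7ffff1300000 */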
Signed-off-by: Thomas Huth Message-Id: <20190523164309.13345-6-thuth@redhat.com> Signed-off-by: Christian Borntraeger [prepare for THP as outlined by Andrew Jones] --- tools/testing/selftests/kvm/lib/kvm_util.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 9ea9ba28af1f..ab7fcb9acf3d 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -557,6 +557,7 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, int ret; struct userspace_mem_region *region; size_t huge_page_size = KVM_UTIL_PGS_PER_HUGEPG * vm->page_size; + size_t alignment; TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " "address not on a page boundary.\n" @@ -606,9 +607,20 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, TEST_ASSERT(region != NULL, "Insufficient Memory"); region->mmap_size = npages * vm->page_size; - /* Enough memory to align up to a huge page. */ +#ifdef __s390x__ + /* On s390x, the host address must be aligned to 1M (due to PGSTEs) */ + alignment = 0x100000; +#else + alignment = 1; +#endif + if (src_type == VM_MEM_SRC_ANONYMOUS_THP) - region->mmap_size += huge_page_size; + alignment = max(huge_page_size, alignment); + + /* Add enough memory to align up if necessary */ + if (alignment > 1) + region->mmap_size += alignment; + region->mmap_start = mmap(NULL, region->mmap_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS @@ -618,9 +630,8 @@ void vm_userspace_mem_region_add(struct kvm_vm *vm, "test_malloc failed, mmap_start: %p errno: %i", region->mmap_start, errno); - /* Align THP allocation up to start of a huge page. */ - region->host_mem = align(region->mmap_start, - src_type == VM_MEM_SRC_ANONYMOUS_THP ? huge_page_size : 1); + /* Align host address */ + region->host_mem = align(region->mmap_start, alignment); /* As needed perform madvise */ if (src_type == VM_MEM_SRC_ANONYMOUS || src_type == VM_MEM_SRC_ANONYMOUS_THP) { -- cgit v1.2.3 From edf54478d83eb0ca4174a2c9c01682c1d01cbe7d Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 23 May 2019 18:43:06 +0200 Subject: KVM: selftests: Add processor code for s390x Code that takes care of basic CPU setup, page table walking, etc. 
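For reference, the page table walker added below resolves a 64-bit guest virtual address through four table levels (region-first, region-second, region-third, segment), each indexed by 11 bits, followed by an 8-bit page index. The index math it uses:

	/* table index for level ri = 1..4 (shift amounts 53, 42, 31, 20) */
	idx = (gva >> (64 - 11 * ri)) & 0x7ffu;
	/* page index within the final 256-entry page table */
	idx = (gva >> 12) & 0x0ffu;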
Signed-off-by: Thomas Huth Message-Id: <20190523164309.13345-7-thuth@redhat.com> Signed-off-by: Christian Borntraeger --- MAINTAINERS | 1 + tools/testing/selftests/kvm/Makefile | 1 + .../selftests/kvm/include/s390x/processor.h | 22 ++ tools/testing/selftests/kvm/lib/s390x/processor.c | 286 +++++++++++++++++++++ 4 files changed, 310 insertions(+) create mode 100644 tools/testing/selftests/kvm/include/s390x/processor.h create mode 100644 tools/testing/selftests/kvm/lib/s390x/processor.c diff --git a/MAINTAINERS b/MAINTAINERS index a6954776a37e..68ce91fffeaf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8674,6 +8674,7 @@ F: arch/s390/include/asm/gmap.h F: arch/s390/include/asm/kvm* F: arch/s390/kvm/ F: arch/s390/mm/gmap.c +F: tools/testing/selftests/kvm/*/s390x/ KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86) M: Paolo Bonzini diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 41280dc06297..b4a07cd0b48e 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -10,6 +10,7 @@ UNAME_M := $(shell uname -m) LIBKVM = lib/assert.c lib/elf.c lib/io.c lib/kvm_util.c lib/ucall.c lib/sparsebit.c LIBKVM_x86_64 = lib/x86_64/processor.c lib/x86_64/vmx.c LIBKVM_aarch64 = lib/aarch64/processor.c +LIBKVM_s390x = lib/s390x/processor.c TEST_GEN_PROGS_x86_64 = x86_64/platform_info_test TEST_GEN_PROGS_x86_64 += x86_64/set_sregs_test diff --git a/tools/testing/selftests/kvm/include/s390x/processor.h b/tools/testing/selftests/kvm/include/s390x/processor.h new file mode 100644 index 000000000000..e0e96a5f608c --- /dev/null +++ b/tools/testing/selftests/kvm/include/s390x/processor.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * s390x processor specific defines + */ +#ifndef SELFTEST_KVM_PROCESSOR_H +#define SELFTEST_KVM_PROCESSOR_H + +/* Bits in the region/segment table entry */ +#define REGION_ENTRY_ORIGIN ~0xfffUL /* region/segment table origin */ +#define REGION_ENTRY_PROTECT 0x200 /* region protection bit */ +#define REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ +#define REGION_ENTRY_OFFSET 0xc0 /* region table offset */ +#define REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ +#define REGION_ENTRY_TYPE 0x0c /* region/segment table type mask */ +#define REGION_ENTRY_LENGTH 0x03 /* region third length */ + +/* Bits in the page table entry */ +#define PAGE_INVALID 0x400 /* HW invalid bit */ +#define PAGE_PROTECT 0x200 /* HW read-only bit */ +#define PAGE_NOEXEC 0x100 /* HW no-execute bit */ + +#endif diff --git a/tools/testing/selftests/kvm/lib/s390x/processor.c b/tools/testing/selftests/kvm/lib/s390x/processor.c new file mode 100644 index 000000000000..c8759445e7d3 --- /dev/null +++ b/tools/testing/selftests/kvm/lib/s390x/processor.c @@ -0,0 +1,286 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KVM selftest s390x library code - CPU-related functions (page tables...) + * + * Copyright (C) 2019, Red Hat, Inc. 
+ */ + +#define _GNU_SOURCE /* for program_invocation_name */ + +#include "processor.h" +#include "kvm_util.h" +#include "../kvm_util_internal.h" + +#define KVM_GUEST_PAGE_TABLE_MIN_PADDR 0x180000 + +#define PAGES_PER_REGION 4 + +void virt_pgd_alloc(struct kvm_vm *vm, uint32_t memslot) +{ + vm_paddr_t paddr; + + TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + vm->page_size); + + if (vm->pgd_created) + return; + + paddr = vm_phy_pages_alloc(vm, PAGES_PER_REGION, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + memset(addr_gpa2hva(vm, paddr), 0xff, PAGES_PER_REGION * vm->page_size); + + vm->pgd = paddr; + vm->pgd_created = true; +} + +/* + * Allocate 4 pages for a region/segment table (ri < 4), or one page for + * a page table (ri == 4). Returns a suitable region/segment table entry + * which points to the freshly allocated pages. + */ +static uint64_t virt_alloc_region(struct kvm_vm *vm, int ri, uint32_t memslot) +{ + uint64_t taddr; + + taddr = vm_phy_pages_alloc(vm, ri < 4 ? PAGES_PER_REGION : 1, + KVM_GUEST_PAGE_TABLE_MIN_PADDR, memslot); + /* Invalidate only the pages that were actually allocated */ + memset(addr_gpa2hva(vm, taddr), 0xff, + (ri < 4 ? PAGES_PER_REGION : 1) * vm->page_size); + + return (taddr & REGION_ENTRY_ORIGIN) + | (((4 - ri) << 2) & REGION_ENTRY_TYPE) + | ((ri < 4 ? (PAGES_PER_REGION - 1) : 0) & REGION_ENTRY_LENGTH); +} + +/* + * VM Virtual Page Map + * + * Input Args: + * vm - Virtual Machine + * gva - VM Virtual Address + * gpa - VM Physical Address + * memslot - Memory region slot for new virtual translation tables + * + * Output Args: None + * + * Return: None + * + * Within the VM given by vm, creates a virtual translation for the page + * starting at gva to the page starting at gpa. + */ +void virt_pg_map(struct kvm_vm *vm, uint64_t gva, uint64_t gpa, + uint32_t memslot) +{ + int ri, idx; + uint64_t *entry; + + TEST_ASSERT((gva % vm->page_size) == 0, + "Virtual address not on page boundary,\n" + " vaddr: 0x%lx vm->page_size: 0x%x", + gva, vm->page_size); + TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, + (gva >> vm->page_shift)), + "Invalid virtual address, vaddr: 0x%lx", + gva); + TEST_ASSERT((gpa % vm->page_size) == 0, + "Physical address not on page boundary,\n" + " paddr: 0x%lx vm->page_size: 0x%x", + gpa, vm->page_size); + TEST_ASSERT((gpa >> vm->page_shift) <= vm->max_gfn, + "Physical address beyond maximum supported,\n" + " paddr: 0x%lx vm->max_gfn: 0x%lx vm->page_size: 0x%x", + gpa, vm->max_gfn, vm->page_size); + + /* Walk through region and segment tables */ + entry = addr_gpa2hva(vm, vm->pgd); + for (ri = 1; ri <= 4; ri++) { + idx = (gva >> (64 - 11 * ri)) & 0x7ffu; + if (entry[idx] & REGION_ENTRY_INVALID) + entry[idx] = virt_alloc_region(vm, ri, memslot); + entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); + } + + /* Fill in page table entry */ + idx = (gva >> 12) & 0x0ffu; /* page index */ + if (!(entry[idx] & PAGE_INVALID)) + fprintf(stderr, + "WARNING: PTE for gpa=0x%"PRIx64" already set!\n", gpa); + entry[idx] = gpa; +} + +/* + * Address Guest Virtual to Guest Physical + * + * Input Args: + * vm - Virtual Machine + * gva - VM virtual address + * + * Output Args: None + * + * Return: + * Equivalent VM physical address + * + * Translates the VM virtual address given by gva to a VM physical + * address by walking the translation tables of the VM given by vm, + * and returns the resulting VM physical address. 
+ * A TEST_ASSERT failure occurs if no region containing translated + * VM virtual address exists. + */ +vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva) +{ + int ri, idx; + uint64_t *entry; + + TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + vm->page_size); + + entry = addr_gpa2hva(vm, vm->pgd); + for (ri = 1; ri <= 4; ri++) { + idx = (gva >> (64 - 11 * ri)) & 0x7ffu; + TEST_ASSERT(!(entry[idx] & REGION_ENTRY_INVALID), + "No region mapping for vm virtual address 0x%lx", + gva); + entry = addr_gpa2hva(vm, entry[idx] & REGION_ENTRY_ORIGIN); + } + + idx = (gva >> 12) & 0x0ffu; /* page index */ + + TEST_ASSERT(!(entry[idx] & PAGE_INVALID), + "No page mapping for vm virtual address 0x%lx", gva); + + return (entry[idx] & ~0xffful) + (gva & 0xffful); +} + +static void virt_dump_ptes(FILE *stream, struct kvm_vm *vm, uint8_t indent, + uint64_t ptea_start) +{ + uint64_t *pte, ptea; + + for (ptea = ptea_start; ptea < ptea_start + 0x100 * 8; ptea += 8) { + pte = addr_gpa2hva(vm, ptea); + if (*pte & PAGE_INVALID) + continue; + fprintf(stream, "%*spte @ 0x%lx: 0x%016lx\n", + indent, "", ptea, *pte); + } +} + +static void virt_dump_region(FILE *stream, struct kvm_vm *vm, uint8_t indent, + uint64_t reg_tab_addr) +{ + uint64_t addr, *entry; + + for (addr = reg_tab_addr; addr < reg_tab_addr + 0x400 * 8; addr += 8) { + entry = addr_gpa2hva(vm, addr); + if (*entry & REGION_ENTRY_INVALID) + continue; + fprintf(stream, "%*srt%lde @ 0x%lx: 0x%016lx\n", + indent, "", 4 - ((*entry & REGION_ENTRY_TYPE) >> 2), + addr, *entry); + if (*entry & REGION_ENTRY_TYPE) { + virt_dump_region(stream, vm, indent + 2, + *entry & REGION_ENTRY_ORIGIN); + } else { + virt_dump_ptes(stream, vm, indent + 2, + *entry & REGION_ENTRY_ORIGIN); + } + } +} + +void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent) +{ + if (!vm->pgd_created) + return; + + virt_dump_region(stream, vm, indent, vm->pgd); +} + +/* + * Create a VM with reasonable defaults + * + * Input Args: + * vcpuid - The id of the single VCPU to add to the VM. + * extra_mem_pages - The size of extra memories to add (this will + * decide how much extra space we will need to + * setup the page tables using mem slot 0) + * guest_code - The vCPU's entry point + * + * Output Args: None + * + * Return: + * Pointer to opaque structure that describes the created VM. + */ +struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages, + void *guest_code) +{ + /* + * The additional amount of pages required for the page tables is: + * 1 * n / 256 + 4 * (n / 256) / 2048 + 4 * (n / 256) / 2048^2 + ... + * which is definitely smaller than (n / 256) * 2. + */ + uint64_t extra_pg_pages = extra_mem_pages / 256 * 2; + struct kvm_vm *vm; + + vm = vm_create(VM_MODE_DEFAULT, + DEFAULT_GUEST_PHY_PAGES + extra_pg_pages, O_RDWR); + + kvm_vm_elf_load(vm, program_invocation_name, 0, 0); + vm_vcpu_add_default(vm, vcpuid, guest_code); + + return vm; +} + +/* + * Adds a vCPU with reasonable defaults (i.e. a stack and initial PSW) + * + * Input Args: + * vcpuid - The id of the VCPU to add to the VM. 
+ * guest_code - The vCPU's entry point + */ +void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code) +{ + size_t stack_size = DEFAULT_STACK_PGS * getpagesize(); + uint64_t stack_vaddr; + struct kvm_regs regs; + struct kvm_sregs sregs; + struct kvm_run *run; + + TEST_ASSERT(vm->page_size == 4096, "Unsupported page size: 0x%x", + vm->page_size); + + stack_vaddr = vm_vaddr_alloc(vm, stack_size, + DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0); + + vm_vcpu_add(vm, vcpuid, 0, 0); + + /* Setup guest registers */ + vcpu_regs_get(vm, vcpuid, &regs); + /* r15 = stack pointer: top of stack minus the 160-byte register save area */ + regs.gprs[15] = stack_vaddr + stack_size - 160; + vcpu_regs_set(vm, vcpuid, &regs); + + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs.crs[1] = vm->pgd | 0xf; /* Primary region table */ + vcpu_sregs_set(vm, vcpuid, &sregs); + + run = vcpu_state(vm, vcpuid); + run->psw_mask = 0x0400000180000000ULL; /* DAT enabled + 64 bit mode */ + run->psw_addr = (uintptr_t)guest_code; +} + +void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot) +{ + struct kvm_sregs sregs; + + vcpu_sregs_get(vm, vcpuid, &sregs); + sregs.crs[0] |= 0x00040000; /* Enable floating point regs */ + vcpu_sregs_set(vm, vcpuid, &sregs); +} + +void vcpu_dump(FILE *stream, struct kvm_vm *vm, uint32_t vcpuid, uint8_t indent) +{ + struct vcpu *vcpu = vm->vcpu_head; + + fprintf(stream, "%*spstate: psw: 0x%.16llx:0x%.16llx\n", + indent, "", vcpu->state->psw_mask, vcpu->state->psw_addr); +} -- cgit v1.2.3 From ee1563f42856f756c72bb0924109432d566c9efa Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 23 May 2019 18:43:07 +0200 Subject: KVM: selftests: Add the sync_regs test for s390x The test is an adaptation of the same test for x86. Note that there are some differences in how s390x deals with the kvm_valid_regs in struct kvm_run, so some of the tests had to be removed. Also, this test does not use the ucall() interface on s390x yet (which would need some work to be usable on s390x), so it simply drops out of the VM with a diag 0x501 breakpoint instead. 
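In short, the kvm_run synced-register protocol that the test exercises looks like this (a sketch; field names as in struct kvm_run):

	run->kvm_valid_regs = TEST_SYNC_FIELDS;  /* KVM exports state into run->s.regs on exit */
	rv = _vcpu_run(vm, VCPU_ID);
	/* inspect run->s.regs.gprs[] without a KVM_GET_REGS ioctl */

	run->s.regs.gprs[11] = 0xBAD1DEA;        /* modify the copy in place */
	run->kvm_dirty_regs = KVM_SYNC_GPRS;     /* KVM loads it on the next entry */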
Signed-off-by: Thomas Huth Message-Id: <20190523164309.13345-8-thuth@redhat.com> Signed-off-by: Christian Borntraeger --- MAINTAINERS | 1 + tools/testing/selftests/kvm/Makefile | 2 + tools/testing/selftests/kvm/s390x/sync_regs_test.c | 151 +++++++++++++++++++++ 3 files changed, 154 insertions(+) create mode 100644 tools/testing/selftests/kvm/s390x/sync_regs_test.c diff --git a/MAINTAINERS b/MAINTAINERS index 68ce91fffeaf..c8264e972364 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8674,6 +8674,7 @@ F: arch/s390/include/asm/gmap.h F: arch/s390/include/asm/kvm* F: arch/s390/kvm/ F: arch/s390/mm/gmap.c +F: tools/testing/selftests/kvm/s390x/ F: tools/testing/selftests/kvm/*/s390x/ KERNEL VIRTUAL MACHINE FOR X86 (KVM/x86) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index b4a07cd0b48e..ed8eb6007566 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -30,6 +30,8 @@ TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += clear_dirty_log_test +TEST_GEN_PROGS_s390x += s390x/sync_regs_test + TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/s390x/sync_regs_test.c b/tools/testing/selftests/kvm/s390x/sync_regs_test.c new file mode 100644 index 000000000000..e85ff0d69548 --- /dev/null +++ b/tools/testing/selftests/kvm/s390x/sync_regs_test.c @@ -0,0 +1,151 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test for s390x KVM_CAP_SYNC_REGS + * + * Based on the same test for x86: + * Copyright (C) 2018, Google LLC. + * + * Adaptations for s390x: + * Copyright (C) 2019, Red Hat, Inc. + * + * Test expected behavior of the KVM_CAP_SYNC_REGS functionality. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "test_util.h" +#include "kvm_util.h" + +#define VCPU_ID 5 + +static void guest_code(void) +{ + for (;;) { + asm volatile ("diag 0,0,0x501"); + asm volatile ("ahi 11,1"); + } +} + +#define REG_COMPARE(reg) \ + TEST_ASSERT(left->reg == right->reg, \ + "Register " #reg \ + " values did not match: 0x%llx, 0x%llx\n", \ + left->reg, right->reg) + +static void compare_regs(struct kvm_regs *left, struct kvm_sync_regs *right) +{ + int i; + + for (i = 0; i < 16; i++) + REG_COMPARE(gprs[i]); } + +static void compare_sregs(struct kvm_sregs *left, struct kvm_sync_regs *right) +{ + int i; + + for (i = 0; i < 16; i++) + REG_COMPARE(acrs[i]); + + for (i = 0; i < 16; i++) + REG_COMPARE(crs[i]); } + +#undef REG_COMPARE + +#define TEST_SYNC_FIELDS (KVM_SYNC_GPRS|KVM_SYNC_ACRS|KVM_SYNC_CRS) +#define INVALID_SYNC_FIELD 0x80000000 + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_run *run; + struct kvm_regs regs; + struct kvm_sregs sregs; + int rv, cap; + + /* Tell stdout not to buffer its content */ + setbuf(stdout, NULL); + + cap = kvm_check_cap(KVM_CAP_SYNC_REGS); + if (!cap) { + fprintf(stderr, "CAP_SYNC_REGS not supported, skipping test\n"); + exit(KSFT_SKIP); + } + + /* Create VM */ + vm = vm_create_default(VCPU_ID, 0, guest_code); + + run = vcpu_state(vm, VCPU_ID); + + /* Request and verify all valid register sets. 
*/ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "Unexpected exit reason: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s390_sieic.icptcode == 4 && + (run->s390_sieic.ipa >> 8) == 0x83 && + (run->s390_sieic.ipb >> 16) == 0x501, + "Unexpected interception code: ic=%u, ipa=0x%x, ipb=0x%x\n", + run->s390_sieic.icptcode, run->s390_sieic.ipa, + run->s390_sieic.ipb); + + vcpu_regs_get(vm, VCPU_ID, &regs); + compare_regs(&regs, &run->s.regs); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + compare_sregs(&sregs, &run->s.regs); + + /* Set and verify various register values */ + run->s.regs.gprs[11] = 0xBAD1DEA; + run->s.regs.acrs[0] = 1 << 11; + + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = KVM_SYNC_GPRS | KVM_SYNC_ACRS; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "Unexpected exit reason: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.gprs[11] == 0xBAD1DEA + 1, + "r11 sync regs value incorrect 0x%llx.", + run->s.regs.gprs[11]); + TEST_ASSERT(run->s.regs.acrs[0] == 1 << 11, + "acr0 sync regs value incorrect 0x%llx.", + run->s.regs.acrs[0]); + + vcpu_regs_get(vm, VCPU_ID, &regs); + compare_regs(&regs, &run->s.regs); + + vcpu_sregs_get(vm, VCPU_ID, &sregs); + compare_sregs(&sregs, &run->s.regs); + + /* Clear kvm_dirty_regs bits, verify new s.regs values are + * overwritten with existing guest values. + */ + run->kvm_valid_regs = TEST_SYNC_FIELDS; + run->kvm_dirty_regs = 0; + run->s.regs.gprs[11] = 0xDEADBEEF; + rv = _vcpu_run(vm, VCPU_ID); + TEST_ASSERT(rv == 0, "vcpu_run failed: %d\n", rv); + TEST_ASSERT(run->exit_reason == KVM_EXIT_S390_SIEIC, + "Unexpected exit reason: %u (%s)\n", + run->exit_reason, + exit_reason_str(run->exit_reason)); + TEST_ASSERT(run->s.regs.gprs[11] != 0xDEADBEEF, + "r11 sync regs value incorrect 0x%llx.", + run->s.regs.gprs[11]); + + kvm_vm_free(vm); + + return 0; +} -- cgit v1.2.3 From 49fe9a5d16386310b15abcfe47343c26c3cc05f2 Mon Sep 17 00:00:00 2001 From: Thomas Huth Date: Thu, 23 May 2019 18:43:09 +0200 Subject: KVM: selftests: Move kvm_create_max_vcpus test to generic code There is nothing x86-specific in the test apart from the VM_MODE_P52V48_4K which we can now replace with VM_MODE_DEFAULT. Thus let's move the file to the main folder and enable it for aarch64 and s390x, too. 
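No extra per-architecture wiring is needed beyond the Makefile lists, since the harness keys the build off the host architecture:

	TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
	# e.g. UNAME_M=s390x picks up s390x/sync_regs_test and kvm_create_max_vcpus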
Reviewed-by: Andrew Jones Reviewed-by: David Hildenbrand Signed-off-by: Thomas Huth Message-Id: <20190523164309.13345-10-thuth@redhat.com> Signed-off-by: Christian Borntraeger --- tools/testing/selftests/kvm/Makefile | 4 +- tools/testing/selftests/kvm/kvm_create_max_vcpus.c | 71 ++++++++++++++++++++++ .../selftests/kvm/x86_64/kvm_create_max_vcpus.c | 70 --------------------- 3 files changed, 74 insertions(+), 71 deletions(-) create mode 100644 tools/testing/selftests/kvm/kvm_create_max_vcpus.c delete mode 100644 tools/testing/selftests/kvm/x86_64/kvm_create_max_vcpus.c diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index ed8eb6007566..05a94a13a0a3 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -22,15 +22,17 @@ TEST_GEN_PROGS_x86_64 += x86_64/evmcs_test TEST_GEN_PROGS_x86_64 += x86_64/hyperv_cpuid TEST_GEN_PROGS_x86_64 += x86_64/vmx_close_while_nested_test TEST_GEN_PROGS_x86_64 += x86_64/smm_test -TEST_GEN_PROGS_x86_64 += x86_64/kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test +TEST_GEN_PROGS_x86_64 += kvm_create_max_vcpus TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += clear_dirty_log_test TEST_GEN_PROGS_aarch64 += dirty_log_test TEST_GEN_PROGS_aarch64 += clear_dirty_log_test +TEST_GEN_PROGS_aarch64 += kvm_create_max_vcpus TEST_GEN_PROGS_s390x += s390x/sync_regs_test +TEST_GEN_PROGS_s390x += kvm_create_max_vcpus TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M)) LIBKVM += $(LIBKVM_$(UNAME_M)) diff --git a/tools/testing/selftests/kvm/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c new file mode 100644 index 000000000000..db78ce07c416 --- /dev/null +++ b/tools/testing/selftests/kvm/kvm_create_max_vcpus.c @@ -0,0 +1,71 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * kvm_create_max_vcpus + * + * Copyright (C) 2019, Google LLC. + * + * This work is licensed under the terms of the GNU GPL, version 2. + * + * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID. + */ + +#define _GNU_SOURCE /* for program_invocation_short_name */ +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "test_util.h" + +#include "kvm_util.h" +#include "asm/kvm.h" +#include "linux/kvm.h" + +void test_vcpu_creation(int first_vcpu_id, int num_vcpus) +{ + struct kvm_vm *vm; + int i; + + printf("Testing creating %d vCPUs, with IDs %d...%d.\n", + num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + + for (i = 0; i < num_vcpus; i++) { + int vcpu_id = first_vcpu_id + i; + + /* This asserts that the vCPU was created. */ + vm_vcpu_add(vm, vcpu_id, 0, 0); + } + + kvm_vm_free(vm); +} + +int main(int argc, char *argv[]) +{ + int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); + int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); + + printf("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); + printf("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); + + /* + * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. + * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID + * in this case. 
+ */ + if (!kvm_max_vcpu_id) + kvm_max_vcpu_id = kvm_max_vcpus; + + TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus, + "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).", + kvm_max_vcpu_id, kvm_max_vcpus); + + test_vcpu_creation(0, kvm_max_vcpus); + + if (kvm_max_vcpu_id > kvm_max_vcpus) + test_vcpu_creation( + kvm_max_vcpu_id - kvm_max_vcpus, kvm_max_vcpus); + + return 0; +} diff --git a/tools/testing/selftests/kvm/x86_64/kvm_create_max_vcpus.c b/tools/testing/selftests/kvm/x86_64/kvm_create_max_vcpus.c deleted file mode 100644 index 50e92996f918..000000000000 --- a/tools/testing/selftests/kvm/x86_64/kvm_create_max_vcpus.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * kvm_create_max_vcpus - * - * Copyright (C) 2019, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. - * - * Test for KVM_CAP_MAX_VCPUS and KVM_CAP_MAX_VCPU_ID. - */ - -#define _GNU_SOURCE /* for program_invocation_short_name */ -#include -#include -#include -#include - -#include "test_util.h" - -#include "kvm_util.h" -#include "asm/kvm.h" -#include "linux/kvm.h" - -void test_vcpu_creation(int first_vcpu_id, int num_vcpus) -{ - struct kvm_vm *vm; - int i; - - printf("Testing creating %d vCPUs, with IDs %d...%d.\n", - num_vcpus, first_vcpu_id, first_vcpu_id + num_vcpus - 1); - - vm = vm_create(VM_MODE_P52V48_4K, DEFAULT_GUEST_PHY_PAGES, O_RDWR); - - for (i = 0; i < num_vcpus; i++) { - int vcpu_id = first_vcpu_id + i; - - /* This asserts that the vCPU was created. */ - vm_vcpu_add(vm, vcpu_id, 0, 0); - } - - kvm_vm_free(vm); -} - -int main(int argc, char *argv[]) -{ - int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID); - int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS); - - printf("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id); - printf("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus); - - /* - * Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID. - * Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID - * in this case. - */ - if (!kvm_max_vcpu_id) - kvm_max_vcpu_id = kvm_max_vcpus; - - TEST_ASSERT(kvm_max_vcpu_id >= kvm_max_vcpus, - "KVM_MAX_VCPU_ID (%d) must be at least as large as KVM_MAX_VCPUS (%d).", - kvm_max_vcpu_id, kvm_max_vcpus); - - test_vcpu_creation(0, kvm_max_vcpus); - - if (kvm_max_vcpu_id > kvm_max_vcpus) - test_vcpu_creation( - kvm_max_vcpu_id - kvm_max_vcpus, kvm_max_vcpus); - - return 0; -} -- cgit v1.2.3 From 8343ba2d4820b1738bbb7cb40ec18ea0a3b0b331 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 24 May 2019 12:27:01 +0200 Subject: KVM: selftests: enable pgste option for the linker on s390 To avoid testcase failures we need to enable the pgstes. This can be done with /proc/sys/vm/allocate_pgste or with a linker option that creates an S390_PGSTE program header. 
Reviewed-by: Thomas Huth Reviewed-by: David Hildenbrand Signed-off-by: Christian Borntraeger [Fixed as outlined by kernel test robot] --- tools/testing/selftests/kvm/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 05a94a13a0a3..0d7265da1583 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -47,7 +47,12 @@ CFLAGS += -Wall -Wstrict-prototypes -Wuninitialized -O2 -g -std=gnu99 \ no-pie-option := $(call try-run, echo 'int main() { return 0; }' | \ $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -no-pie -x c - -o "$$TMP", -no-pie) -LDFLAGS += -pthread $(no-pie-option) +# On s390, build the testcases KVM-enabled +pgste-option = $(call try-run, echo 'int main() { return 0; }' | \ + $(CC) -Werror -Wl$(comma)--s390-pgste -x c - -o "$$TMP",-Wl$(comma)--s390-pgste) + + +LDFLAGS += -pthread $(no-pie-option) $(pgste-option) # After inclusion, $(OUTPUT) is defined and # $(TEST_GEN_PROGS) starts with $(OUTPUT)/ -- cgit v1.2.3 From 0b774629512057b4becc705e2495220844e6e795 Mon Sep 17 00:00:00 2001 From: Jing Liu Date: Thu, 11 Jul 2019 13:49:57 +0800 Subject: KVM: x86: expose AVX512_BF16 feature to guest AVX512 BFLOAT16 instructions support the 16-bit BFLOAT16 floating-point format (BF16) for deep learning optimization. Intel adds the AVX512 BFLOAT16 feature in Cooper Lake, enumerated as CPUID.7.1.EAX[5]. Detailed information on the CPUID bit can be found here: https://software.intel.com/sites/default/files/managed/c5/15/\ architecture-instruction-set-extensions-programming-reference.pdf. Signed-off-by: Jing Liu [Fix type mismatch in min, changing constant "1" to "1u". - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/cpuid.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index ead681210306..22c2720cd948 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -368,9 +368,13 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR); + /* cpuid 7.1.eax */ + const u32 kvm_cpuid_7_1_eax_x86_features = + F(AVX512_BF16); + switch (index) { case 0: - entry->eax = 0; + entry->eax = min(entry->eax, 1u); entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; cpuid_mask(&entry->ebx, CPUID_7_0_EBX); /* TSC_ADJUST is emulated */ @@ -394,6 +398,12 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index) */ entry->edx |= F(ARCH_CAPABILITIES); break; + case 1: + entry->eax &= kvm_cpuid_7_1_eax_x86_features; + entry->ebx = 0; + entry->ecx = 0; + entry->edx = 0; + break; default: WARN_ON_ONCE(1); entry->eax = 0; -- cgit v1.2.3 From f4e4805e4bf7a06235d2aa216e1d00cb1f3bd0c1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 12 Jul 2019 16:13:09 +0200 Subject: x86: kvm: avoid -Wsometimes-uninitialized warning Clang notices a code path in which some variables are never initialized, but fails to figure out that this can never happen on i386 because is_64_bit_mode() always returns false. 
arch/x86/kvm/hyperv.c:1610:6: error: variable 'ingpa' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] if (!longmode) { ^~~~~~~~~ arch/x86/kvm/hyperv.c:1632:55: note: uninitialized use occurs here trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa); ^~~~~ arch/x86/kvm/hyperv.c:1610:2: note: remove the 'if' if its condition is always true if (!longmode) { ^~~~~~~~~~~~~~~ arch/x86/kvm/hyperv.c:1595:18: note: initialize the variable 'ingpa' to silence this warning u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; ^ = 0 arch/x86/kvm/hyperv.c:1610:6: error: variable 'outgpa' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] arch/x86/kvm/hyperv.c:1610:6: error: variable 'param' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] Flip the condition around to avoid the conditional execution on i386. Signed-off-by: Arnd Bergmann Signed-off-by: Paolo Bonzini --- arch/x86/kvm/hyperv.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index a39e38f13029..c10a8b10b203 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1594,7 +1594,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) { u64 param, ingpa, outgpa, ret = HV_STATUS_SUCCESS; uint16_t code, rep_idx, rep_cnt; - bool fast, longmode, rep; + bool fast, rep; /* * hypercall generates UD from non zero cpl and real mode * @@ -1605,9 +1605,14 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) return 1; } - longmode = is_64_bit_mode(vcpu); - - if (!longmode) { +#ifdef CONFIG_X86_64 + if (is_64_bit_mode(vcpu)) { + param = kvm_rcx_read(vcpu); + ingpa = kvm_rdx_read(vcpu); + outgpa = kvm_r8_read(vcpu); + } else +#endif + { param = ((u64)kvm_rdx_read(vcpu) << 32) | (kvm_rax_read(vcpu) & 0xffffffff); ingpa = ((u64)kvm_rbx_read(vcpu) << 32) | @@ -1615,13 +1620,6 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu) outgpa = ((u64)kvm_rdi_read(vcpu) << 32) | (kvm_rsi_read(vcpu) & 0xffffffff); } -#ifdef CONFIG_X86_64 - else { - param = kvm_rcx_read(vcpu); - ingpa = kvm_rdx_read(vcpu); - outgpa = kvm_r8_read(vcpu); - } -#endif code = param & 0xffff; fast = !!(param & HV_HYPERCALL_FAST_BIT); -- cgit v1.2.3 From a6a6d3b1f867d34ba5bd61aa7bb056b48ca67cff Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 12 Jul 2019 11:12:30 +0200 Subject: x86: kvm: avoid constant-conversion warning clang flags as suspicious a construct that converts an unsigned character to a signed integer and back, causing an overflow: arch/x86/kvm/mmu.c:4605:39: error: implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from -205 to 51 [-Werror,-Wconstant-conversion] u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0; ~~ ^~ arch/x86/kvm/mmu.c:4607:38: error: implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from -241 to 15 [-Werror,-Wconstant-conversion] u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0; ~~ ^~ arch/x86/kvm/mmu.c:4609:39: error: implicit conversion from 'int' to 'u8' (aka 'unsigned char') changes value from -171 to 85 [-Werror,-Wconstant-conversion] u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0; ~~ ^~ Add an explicit cast to tell clang that everything works as intended here. 
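A standalone illustration of the promotion behind the first warning (cond is a hypothetical stand-in; ~ promotes the u8 to int, so ~w is a negative int that is then truncated back to u8):

	u8 w = 0xcc;                  /* 204 */
	u8 wf = cond ? ~w : 0;        /* ~w is the int -205, truncated to 51; clang warns */
	u8 ok = cond ? (u8)~w : 0;    /* the explicit cast documents the intended truncation */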
Signed-off-by: Arnd Bergmann Link: https://github.com/ClangBuiltLinux/linux/issues/95 Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9a5814d8d194..8f72526e2f68 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4597,11 +4597,11 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu, */ /* Faults from writes to non-writable pages */ - u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0; + u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : 0; /* Faults from user mode accesses to supervisor pages */ - u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0; + u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : 0; /* Faults from fetches of non-executable pages */ - u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0; + u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : 0; /* Faults from kernel mode fetches of user pages */ u8 smepf = 0; /* Faults from kernel mode accesses of user pages */ -- cgit v1.2.3 From 9481b7f10c5a7f149048310c25510f0386eb6631 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 15 Jul 2019 12:35:17 +0800 Subject: kvm: vmx: fix coccinelle warnings This fixes the following coccinelle warning: WARNING: return of 0/1 in function 'vmx_need_emulation_on_page_fault' with return type bool Return false instead of 0. Signed-off-by: Yi Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 69536553446d..84f8d49a2fd2 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7453,7 +7453,7 @@ static int enable_smi_window(struct kvm_vcpu *vcpu) static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) { - return 0; + return false; } static __init int hardware_setup(void) -- cgit v1.2.3 From 9a5611af5edb5fa5fed11b4c5e96906524f8c323 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 6 Jul 2019 01:10:22 +0800 Subject: kvm: x86: some tsc debug cleanup There are some pr_debug calls in the TSC code which are of no real use, so remove them, as Paolo suggested. 
Signed-off-by: Yi Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a0b74ecd1de..9d7b9e6a0939 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1540,9 +1540,6 @@ static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, *pshift = shift; *pmultiplier = div_frac(scaled64, tps32); - - pr_debug("%s: base_hz %llu => %llu, shift %d, mul %u\n", - __func__, base_hz, scaled_hz, shift, *pmultiplier); } #ifdef CONFIG_X86_64 @@ -1785,12 +1782,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!kvm_check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; - pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; offset = kvm_compute_tsc_offset(vcpu, data); - pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation); @@ -1809,8 +1804,6 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) kvm->arch.cur_tsc_write = data; kvm->arch.cur_tsc_offset = offset; matched = false; - pr_debug("kvm: new tsc generation %llu, clock %llu\n", - kvm->arch.cur_tsc_generation, data); } /* @@ -6911,7 +6904,6 @@ static void kvm_timer_init(void) cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); } - pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz); cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online", kvmclock_cpu_online, kvmclock_cpu_down_prep); -- cgit v1.2.3 From 0d88800d547211ce07be3551c812d404cf2be3a8 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 6 Jul 2019 01:08:48 +0800 Subject: kvm: x86: ioapic and apic debug macros cleanup The ioapic_debug and apic_debug macros have not been used for years, and kvm tracepoints are enough for debugging, so remove them, as Paolo suggested. However, something may go wrong when the pv eoi get/put user accessors fail, so it's better to retain some logging there. Signed-off-by: Yi Wang Signed-off-by: Paolo Bonzini --- arch/x86/kvm/ioapic.c | 15 -------- arch/x86/kvm/lapic.c | 95 +++++---------------------------------------------- 2 files changed, 9 insertions(+), 101 deletions(-) diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 1add1bc881e2..d859ae8890d0 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -45,11 +45,6 @@ #include "lapic.h" #include "irq.h" -#if 0 -#define ioapic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) -#else -#define ioapic_debug(fmt, arg...) 
-#endif static int ioapic_service(struct kvm_ioapic *vioapic, int irq, bool line_status); @@ -294,7 +289,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) default: index = (ioapic->ioregsel - 0x10) >> 1; - ioapic_debug("change redir index %x val %x\n", index, val); if (index >= IOAPIC_NUM_PINS) return; e = &ioapic->redirtbl[index]; @@ -343,12 +337,6 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status) entry->fields.remote_irr)) return -1; - ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " - "vector=%x trig_mode=%x\n", - entry->fields.dest_id, entry->fields.dest_mode, - entry->fields.delivery_mode, entry->fields.vector, - entry->fields.trig_mode); - irqe.dest_id = entry->fields.dest_id; irqe.vector = entry->fields.vector; irqe.dest_mode = entry->fields.dest_mode; @@ -515,7 +503,6 @@ static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ioapic_debug("addr %lx\n", (unsigned long)addr); ASSERT(!(addr & 0xf)); /* check alignment */ addr &= 0xff; @@ -558,8 +545,6 @@ static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, if (!ioapic_in_range(ioapic, addr)) return -EOPNOTSUPP; - ioapic_debug("ioapic_mmio_write addr=%p len=%d val=%p\n", - (void*)addr, len, val); ASSERT(!(addr & 0xf)); /* check alignment */ switch (len) { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index a232e76d8f23..b7fda1dd3cc8 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -52,9 +52,6 @@ #define PRIu64 "u" #define PRIo64 "o" -/* #define apic_debug(fmt,arg...) printk(KERN_WARNING fmt,##arg) */ -#define apic_debug(fmt, arg...) do {} while (0) - /* 14 is the version for Xeon and Pentium 8.4.8*/ #define APIC_VERSION (0x14UL | ((KVM_APIC_LVT_NUM - 1) << 16)) #define LAPIC_MMIO_LENGTH (1 << 12) @@ -627,7 +624,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) { u8 val; if (pv_eoi_get_user(vcpu, &val) < 0) - apic_debug("Can't read EOI MSR value: 0x%llx\n", + printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); return val & 0x1; } @@ -635,7 +632,7 @@ static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu) static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_ENABLED) < 0) { - apic_debug("Can't set EOI MSR value: 0x%llx\n", + printk(KERN_WARNING "Can't set EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } @@ -645,7 +642,7 @@ static void pv_eoi_set_pending(struct kvm_vcpu *vcpu) static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu) { if (pv_eoi_put_user(vcpu, KVM_PV_EOI_DISABLED) < 0) { - apic_debug("Can't clear EOI MSR value: 0x%llx\n", + printk(KERN_WARNING "Can't clear EOI MSR value: 0x%llx\n", (unsigned long long)vcpu->arch.pv_eoi.msr_val); return; } @@ -679,9 +676,6 @@ static bool __apic_update_ppr(struct kvm_lapic *apic, u32 *new_ppr) else ppr = isrv & 0xf0; - apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", - apic, ppr, isr, isrv); - *new_ppr = ppr; if (old_ppr != ppr) kvm_lapic_set_reg(apic, APIC_PROCPRI, ppr); @@ -758,8 +752,6 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda) return ((logical_id >> 4) == (mda >> 4)) && (logical_id & mda & 0xf) != 0; default: - apic_debug("Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, kvm_lapic_get_reg(apic, APIC_DFR)); return false; } } @@ -798,10 +790,6 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, struct 
kvm_lapic *target = vcpu->arch.apic; u32 mda = kvm_apic_mda(vcpu, dest, source, target); - apic_debug("target %p, source %p, dest 0x%x, " - "dest_mode 0x%x, short_hand 0x%x\n", - target, source, dest, dest_mode, short_hand); - ASSERT(target); switch (short_hand) { case APIC_DEST_NOSHORT: @@ -816,8 +804,6 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, case APIC_DEST_ALLBUT: return target != source; default: - apic_debug("kvm: apic: Bad dest shorthand value %x\n", - short_hand); return false; } } @@ -1095,15 +1081,10 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, smp_wmb(); kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_vcpu_kick(vcpu); - } else { - apic_debug("Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); } break; case APIC_DM_STARTUP: - apic_debug("SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); result = 1; apic->sipi_vector = vector; /* make sure sipi_vector is visible for the receiver */ @@ -1221,14 +1202,6 @@ static void apic_send_ipi(struct kvm_lapic *apic) trace_kvm_apic_ipi(icr_low, irq.dest_id); - apic_debug("icr_high 0x%x, icr_low 0x%x, " - "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, " - "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x, " - "msi_redir_hint 0x%x\n", - icr_high, icr_low, irq.shorthand, irq.dest_id, - irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, - irq.vector, irq.msi_redir_hint); - kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq, NULL); } @@ -1282,7 +1255,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) switch (offset) { case APIC_ARBPRI: - apic_debug("Access APIC ARBPRI register which is for P6\n"); break; case APIC_TMCCT: /* Timer CCR */ @@ -1349,11 +1321,8 @@ int kvm_lapic_reg_read(struct kvm_lapic *apic, u32 offset, int len, if (!apic_x2apic_mode(apic)) valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI); - if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) { - apic_debug("KVM_APIC_READ: read reserved register %x\n", - offset); + if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset))) return 1; - } result = __apic_read(apic, offset & ~0xf); @@ -1411,9 +1380,6 @@ static void update_divide_count(struct kvm_lapic *apic) tmp1 = tdcr & 0xf; tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1; apic->divide_count = 0x1 << (tmp2 & 0x7); - - apic_debug("timer divide count is 0x%x\n", - apic->divide_count); } static void limit_periodic_timer_frequency(struct kvm_lapic *apic) @@ -1648,16 +1614,6 @@ static bool set_target_expiration(struct kvm_lapic *apic) limit_periodic_timer_frequency(apic); - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_lapic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); - apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); @@ -1860,8 +1816,6 @@ static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) if (apic->lvt0_in_nmi_mode != lvt0_in_nmi_mode) { apic->lvt0_in_nmi_mode = lvt0_in_nmi_mode; if (lvt0_in_nmi_mode) { - apic_debug("Receive NMI setting on APIC_LVT0 " - "for cpu %d\n", apic->vcpu->vcpu_id); atomic_inc(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); } else atomic_dec(&apic->vcpu->kvm->arch.vapics_in_nmi_mode); 
@@ -1975,8 +1929,6 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: { uint32_t old_divisor = apic->divide_count; - if (val & 4) - apic_debug("KVM_WRITE:TDCR %x\n", val); kvm_lapic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); if (apic->divide_count != old_divisor && @@ -1988,10 +1940,8 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; } case APIC_ESR: - if (apic_x2apic_mode(apic) && val != 0) { - apic_debug("KVM_WRITE:ESR not zero %x\n", val); + if (apic_x2apic_mode(apic) && val != 0) ret = 1; - } break; case APIC_SELF_IPI: @@ -2004,8 +1954,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) ret = 1; break; } - if (ret) - apic_debug("Local APIC Write to read-only register %x\n", reg); + return ret; } EXPORT_SYMBOL_GPL(kvm_lapic_reg_write); @@ -2033,20 +1982,12 @@ static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, * 32/64/128 bits registers must be accessed thru 32 bits. * Refer SDM 8.4.1 */ - if (len != 4 || (offset & 0xf)) { - /* Don't shout loud, $infamous_os would cause only noise. */ - apic_debug("apic write: bad size=%d %lx\n", len, (long)address); + if (len != 4 || (offset & 0xf)) return 0; - } val = *(u32*)data; - /* too common printing */ - if (offset != APIC_EOI) - apic_debug("%s: offset 0x%x with length 0x%x, and value is " - "0x%x\n", __func__, offset, len, val); - - kvm_lapic_reg_write(apic, offset, val); + kvm_lapic_reg_write(apic, offset & 0xff0, val); return 0; } @@ -2178,11 +2119,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) if ((value & MSR_IA32_APICBASE_ENABLE) && apic->base_address != APIC_DEFAULT_PHYS_BASE) pr_warn_once("APIC base relocation is unsupported by KVM"); - - /* with FSB delivery interrupt, we can restart APIC functionality */ - apic_debug("apic base msr is 0x%016" PRIx64 ", and base address is " - "0x%lx.\n", apic->vcpu->arch.apic_base, apic->base_address); - } void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -2193,8 +2129,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) if (!apic) return; - apic_debug("%s\n", __func__); - /* Stop the timer in case it's a reset to an active apic */ hrtimer_cancel(&apic->lapic_timer.timer); @@ -2247,11 +2181,6 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.apic_arb_prio = 0; vcpu->arch.apic_attention = 0; - - apic_debug("%s: vcpu=%p, id=0x%x, base_msr=" - "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__, - vcpu, kvm_lapic_get_reg(apic, APIC_ID), - vcpu->arch.apic_base, apic->base_address); } /* @@ -2323,7 +2252,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) struct kvm_lapic *apic; ASSERT(vcpu != NULL); - apic_debug("apic_init %d\n", vcpu->vcpu_id); apic = kzalloc(sizeof(*apic), GFP_KERNEL_ACCOUNT); if (!apic) @@ -2678,11 +2606,8 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data) if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic)) return 1; - if (reg == APIC_DFR || reg == APIC_ICR2) { - apic_debug("KVM_APIC_READ: read x2apic reserved register %x\n", - reg); + if (reg == APIC_DFR || reg == APIC_ICR2) return 1; - } if (kvm_lapic_reg_read(apic, reg, 4, &low)) return 1; @@ -2780,8 +2705,6 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu) /* evaluate pending_events before reading the vector */ smp_rmb(); sipi_vector = apic->sipi_vector; - apic_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, sipi_vector); kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector); 
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -- cgit v1.2.3 From 6694e48012826351036fd10fc506ca880023e25f Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Mon, 15 Jul 2019 18:47:44 +0300 Subject: KVM: nVMX: Ignore segment base for VMX memory operand when segment not FS or GS As reported by Maxime at https://bugzilla.kernel.org/show_bug.cgi?id=204175: In vmx/nested.c::get_vmx_mem_address(), when the guest runs in long mode, the base address of the memory operand is computed with a simple: *ret = s.base + off; This is incorrect: the base applies only to FS and GS, not to the others. Because of that, if the guest uses a VMX instruction based on DS and has a DS.base that is non-zero, KVM wrongly adds the base to the resulting address. Reported-by: Maxime Villard Reviewed-by: Joao Martins Signed-off-by: Liran Alon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/nested.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index bb509c254939..4f23e34f628b 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -4194,7 +4194,10 @@ int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification, * mode, e.g. a 32-bit address size can yield a 64-bit virtual * address when using FS/GS with a non-zero base. */ - *ret = s.base + off; + if (seg_reg == VCPU_SREG_FS || seg_reg == VCPU_SREG_GS) + *ret = s.base + off; + else + *ret = off; /* Long mode: #GP(0)/#SS(0) if the memory address is in a * non-canonical form. This is the only check on the memory -- cgit v1.2.3 From 4d1a082da968ff0c9d7b69a7ec44e6be6fc6e213 Mon Sep 17 00:00:00 2001 From: Like Xu Date: Wed, 17 Jul 2019 10:51:18 +0800 Subject: KVM: x86/vPMU: reset pmc->counter to 0 for pmu fixed_counters To avoid semantic inconsistency, the fixed_counters in the Intel vPMU need to be reset to 0 in intel_pmu_reset(), as is already done for the gp_counters. Signed-off-by: Like Xu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/vmx/pmu_intel.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 68d231d49c7a..4dea0e0e7e39 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -337,17 +337,22 @@ static void intel_pmu_init(struct kvm_vcpu *vcpu) static void intel_pmu_reset(struct kvm_vcpu *vcpu) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); + struct kvm_pmc *pmc = NULL; int i; for (i = 0; i < INTEL_PMC_MAX_GENERIC; i++) { - struct kvm_pmc *pmc = &pmu->gp_counters[i]; + pmc = &pmu->gp_counters[i]; pmc_stop_counter(pmc); pmc->counter = pmc->eventsel = 0; } - for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) - pmc_stop_counter(&pmu->fixed_counters[i]); + for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) { + pmc = &pmu->fixed_counters[i]; + + pmc_stop_counter(pmc); + pmc->counter = 0; + } pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = pmu->global_ovf_ctrl = 0; -- cgit v1.2.3 From 4d151bf3b89e71490e69defc811579b2bde617e2 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sat, 6 Jul 2019 09:26:50 +0800 Subject: KVM: LAPIC: Make lapic timer unpinned MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 61abdbe0bcc2 ("kvm: x86: make lapic hrtimer pinned") pinned the lapic timer to avoid waiting until the next kvm exit for the guest to see KVM_REQ_PENDING_TIMER set. Another solution is to give a kick after setting the KVM_REQ_PENDING_TIMER bit; making the lapic timer unpinned in this way will be used in follow-up patches. 
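In other words: with HRTIMER_MODE_ABS_PINNED the expiry handler was guaranteed to run on the vCPU's own pCPU, so setting the request bit was sufficient. Once the timer may fire on another CPU, the setter has to kick the vCPU explicitly, which is what the change below does:

	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
	kvm_vcpu_kick(vcpu);	/* the handler may now run on a different pCPU */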
Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Marcelo Tosatti Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini --- arch/x86/kvm/lapic.c | 8 ++++---- arch/x86/kvm/x86.c | 6 +----- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index b7fda1dd3cc8..32b80ecc0ac5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1567,7 +1567,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); - hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS); } else apic_timer_expired(apic); @@ -1659,7 +1659,7 @@ static void start_sw_period(struct kvm_lapic *apic) hrtimer_start(&apic->lapic_timer.timer, apic->lapic_timer.target_expiration, - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS); } bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) @@ -2268,7 +2268,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) apic->vcpu = vcpu; hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS); apic->lapic_timer.timer.function = apic_timer_fn; if (timer_advance_ns == -1) { apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT; @@ -2458,7 +2458,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &vcpu->arch.apic->lapic_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9d7b9e6a0939..6ab30c5e1ae0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1456,12 +1456,8 @@ static void update_pvclock_gtod(struct timekeeper *tk) void kvm_set_pending_timer(struct kvm_vcpu *vcpu) { - /* - * Note: KVM_REQ_PENDING_TIMER is implicitly checked in - * vcpu_enter_guest. This function is only called from - * the physical CPU that is running vcpu. - */ kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); } static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock) -- cgit v1.2.3 From 0c5f81dad46c90792e6c3c4797131323c9e96dcd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sat, 6 Jul 2019 09:26:51 +0800 Subject: KVM: LAPIC: Inject timer interrupt via posted interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Dedicated instances are currently disturbed by unnecessary jitter due to the emulated lapic timers firing on the same pCPUs where the vCPUs reside. Unlike ARM, Intel has no hardware virtual timer for the guest, so both programming the timer in the guest and the firing of the emulated timer incur vmexits. This patch tries to avoid a vmexit when the emulated timer fires, at least in the dedicated-instance scenario when nohz_full is enabled. In that case, the emulated timers can be offloaded to the nearest busy housekeeping cpus, since APICv has been available in server processors for several years. The guest timer interrupt can then be injected via posted interrupts, which are delivered by the housekeeping cpu once the emulated timer fires. The host should be tuned so that vCPUs are placed on isolated physical processors, with several surplus pCPUs for busy housekeeping. 
If mwait/hlt/pause vmexits are disabled so that the vCPUs are kept in
non-root mode, a ~3% redis performance benefit can be observed on a
Skylake server, and the number of external interrupt vmexits drops
substantially.

Without patch:

            VM-EXIT  Samples  Samples%   Time%  Min Time  Max Time  Avg time
 EXTERNAL_INTERRUPT    42916    49.43%  39.30%    0.47us  106.09us  0.71us ( +-  1.09% )

While with patch:

            VM-EXIT  Samples  Samples%   Time%  Min Time  Max Time  Avg time
 EXTERNAL_INTERRUPT     6871     9.29%   2.96%    0.44us   57.88us  0.72us ( +-  4.02% )

Cc: Paolo Bonzini
Cc: Radim Krčmář
Cc: Marcelo Tosatti
Signed-off-by: Wanpeng Li
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/lapic.c            | 99 ++++++++++++++++++++++++++---------------
 arch/x86/kvm/lapic.h            |  1 +
 arch/x86/kvm/vmx/vmx.c          |  3 +-
 arch/x86/kvm/x86.c              |  6 +++
 arch/x86/kvm/x86.h              |  2 +
 include/linux/sched/isolation.h |  6 +++
 kernel/sched/isolation.c        |  6 +++
 7 files changed, 87 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 32b80ecc0ac5..0aa158657f20 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -118,6 +118,17 @@ static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
 	return apic->vcpu->vcpu_id;
 }
 
+bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
+{
+	return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_can_post_timer_interrupt);
+
+static bool kvm_use_posted_timer_interrupt(struct kvm_vcpu *vcpu)
+{
+	return kvm_can_post_timer_interrupt(vcpu) &&
+		vcpu->mode == IN_GUEST_MODE;
+}
+
 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 		u32 dest_id, struct kvm_lapic ***cluster, u16 *mask)
 {
 	switch (map->mode) {
@@ -1421,29 +1432,6 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
 	}
 }
 
-static void apic_timer_expired(struct kvm_lapic *apic)
-{
-	struct kvm_vcpu *vcpu = apic->vcpu;
-	struct swait_queue_head *q = &vcpu->wq;
-	struct kvm_timer *ktimer = &apic->lapic_timer;
-
-	if (atomic_read(&apic->lapic_timer.pending))
-		return;
-
-	atomic_inc(&apic->lapic_timer.pending);
-	kvm_set_pending_timer(vcpu);
-
-	/*
-	 * For x86, the atomic_inc() is serialized, thus
-	 * using swait_active() is safe.
-	 */
-	if (swait_active(q))
-		swake_up_one(q);
-
-	if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
-		ktimer->expired_tscdeadline = ktimer->tscdeadline;
-}
-
 /*
  * On APICv, this test will cause a busy wait
  * during a higher-priority task.
@@ -1517,7 +1505,7 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
 	apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 }
 
-void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u64 guest_tsc, tsc_deadline;
@@ -1525,9 +1513,6 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 	if (apic->lapic_timer.expired_tscdeadline == 0)
 		return;
 
-	if (!lapic_timer_int_injected(vcpu))
-		return;
-
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
 	guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
@@ -1539,8 +1524,57 @@ void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 	if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
 		adjust_lapic_timer_advance(vcpu, apic->lapic_timer.advance_expire_delta);
 }
+
+void kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+	if (lapic_timer_int_injected(vcpu))
+		__kvm_wait_lapic_expire(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_wait_lapic_expire);
 
+static void kvm_apic_inject_pending_timer_irqs(struct kvm_lapic *apic)
+{
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+
+	kvm_apic_local_deliver(apic, APIC_LVTT);
+	if (apic_lvtt_tscdeadline(apic))
+		ktimer->tscdeadline = 0;
+	if (apic_lvtt_oneshot(apic)) {
+		ktimer->tscdeadline = 0;
+		ktimer->target_expiration = 0;
+	}
+}
+
+static void apic_timer_expired(struct kvm_lapic *apic)
+{
+	struct kvm_vcpu *vcpu = apic->vcpu;
+	struct swait_queue_head *q = &vcpu->wq;
+	struct kvm_timer *ktimer = &apic->lapic_timer;
+
+	if (atomic_read(&apic->lapic_timer.pending))
+		return;
+
+	if (apic_lvtt_tscdeadline(apic) || ktimer->hv_timer_in_use)
+		ktimer->expired_tscdeadline = ktimer->tscdeadline;
+
+	if (kvm_use_posted_timer_interrupt(apic->vcpu)) {
+		if (apic->lapic_timer.timer_advance_ns)
+			__kvm_wait_lapic_expire(vcpu);
+		kvm_apic_inject_pending_timer_irqs(apic);
+		return;
+	}
+
+	atomic_inc(&apic->lapic_timer.pending);
+	kvm_set_pending_timer(vcpu);
+
+	/*
+	 * For x86, the atomic_inc() is serialized, thus
+	 * using swait_active() is safe.
+	 */
+	if (swait_active(q))
+		swake_up_one(q);
+}
+
 static void start_sw_tscdeadline(struct kvm_lapic *apic)
 {
 	struct kvm_timer *ktimer = &apic->lapic_timer;
@@ -2325,13 +2359,7 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (atomic_read(&apic->lapic_timer.pending) > 0) {
-		kvm_apic_local_deliver(apic, APIC_LVTT);
-		if (apic_lvtt_tscdeadline(apic))
-			apic->lapic_timer.tscdeadline = 0;
-		if (apic_lvtt_oneshot(apic)) {
-			apic->lapic_timer.tscdeadline = 0;
-			apic->lapic_timer.target_expiration = 0;
-		}
+		kvm_apic_inject_pending_timer_irqs(apic);
 		atomic_set(&apic->lapic_timer.pending, 0);
 	}
 }
@@ -2453,7 +2481,8 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 {
 	struct hrtimer *timer;
 
-	if (!lapic_in_kernel(vcpu))
+	if (!lapic_in_kernel(vcpu) ||
+		kvm_can_post_timer_interrupt(vcpu))
 		return;
 
 	timer = &vcpu->arch.apic->lapic_timer.timer;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 36747174e4a8..50053d2b8b7b 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -236,6 +236,7 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
 void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
 bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
 void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu);
 
 static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
 {
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 84f8d49a2fd2..280320f74db7 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -7064,7 +7064,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
 	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
 	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
 
-	if (kvm_mwait_in_guest(vcpu->kvm))
+	if (kvm_mwait_in_guest(vcpu->kvm) ||
+		kvm_can_post_timer_interrupt(vcpu))
 		return -EOPNOTSUPP;
 
 	vmx = to_vmx(vcpu);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6ab30c5e1ae0..58305cf81182 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -51,6 +51,7 @@
 #include
 #include
 #include
+#include <linux/sched/isolation.h>
 #include
 #include
@@ -153,6 +154,9 @@ EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
 static bool __read_mostly force_emulation_prefix = false;
 module_param(force_emulation_prefix, bool, S_IRUGO);
 
+int __read_mostly pi_inject_timer = -1;
+module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+
 #define KVM_NR_SHARED_MSRS 16
 
 struct kvm_shared_msrs_global {
@@ -7058,6 +7062,8 @@ int kvm_arch_init(void *opaque)
 		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
 
 	kvm_lapic_init();
+	if (pi_inject_timer == -1)
+		pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
 #ifdef CONFIG_X86_64
 	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index e08a12892e8b..6594020c0691 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -301,6 +301,8 @@ extern unsigned int min_timer_period_us;
 
 extern bool enable_vmware_backdoor;
 
+extern int pi_inject_timer;
+
 extern struct static_key kvm_no_apic_vcpu;
 
 static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index b0fb1446fe04..6c8512d3be88 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -19,6 +19,7 @@ enum hk_flags {
 DECLARE_STATIC_KEY_FALSE(housekeeping_overridden);
 extern int housekeeping_any_cpu(enum hk_flags flags);
 extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
+extern bool housekeeping_enabled(enum hk_flags flags);
 extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
 extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
 extern void __init housekeeping_init(void);
@@ -35,6 +36,11 @@ static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
 	return cpu_possible_mask;
 }
 
+static inline bool housekeeping_enabled(enum hk_flags flags)
+{
+	return false;
+}
+
 static inline void housekeeping_affine(struct task_struct *t,
 				       enum hk_flags flags) { }
 static inline void housekeeping_init(void) { }
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 123ea07a3f3b..ccb28085b114 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -14,6 +14,12 @@ EXPORT_SYMBOL_GPL(housekeeping_overridden);
 static cpumask_var_t housekeeping_mask;
 static unsigned int housekeeping_flags;
 
+bool housekeeping_enabled(enum hk_flags flags)
+{
+	return !!(housekeeping_flags & flags);
+}
+EXPORT_SYMBOL_GPL(housekeeping_enabled);
+
 int housekeeping_any_cpu(enum hk_flags flags)
 {
 	if (static_branch_unlikely(&housekeeping_overridden))
-- cgit v1.2.3
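The decision the patch above makes at timer expiry is worth condensing:
when the vCPU is in guest mode with APICv active and pi_inject_timer is
set, the LVTT interrupt is posted directly from the housekeeping CPU;
otherwise KVM falls back to the pending-timer flag plus a kick. A toy
standalone sketch of that branch (stub types and a hypothetical
timer_expired(); not the kernel's apic_timer_expired() itself):

#include <stdbool.h>
#include <stdio.h>

/* Stubbed vCPU state; the kernel tracks this in struct kvm_vcpu. */
struct vcpu {
	bool apicv_active;
	bool in_guest_mode;
};

static bool pi_inject_timer = true;	/* mirrors the new module parameter */

/* Hypothetical condensation of the new expiry fast path. */
static void timer_expired(struct vcpu *v)
{
	if (pi_inject_timer && v->apicv_active && v->in_guest_mode) {
		/* Post the interrupt directly; no vmexit on the vCPU's pCPU. */
		printf("deliver LVTT via posted interrupt\n");
		return;
	}
	/* Slow path: mark the timer pending and kick the vCPU. */
	printf("set KVM_REQ_PENDING_TIMER and kick\n");
}

int main(void)
{
	struct vcpu v = { .apicv_active = true, .in_guest_mode = true };

	timer_expired(&v);
	v.in_guest_mode = false;
	timer_expired(&v);
	return 0;
}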
From 118154bdf54ca79e4b5f3ce6d4a8a7c6b7c2c76f Mon Sep 17 00:00:00 2001
From: Liran Alon
Date: Wed, 17 Jul 2019 02:56:58 +0300
Subject: KVM: SVM: Fix detection of AMD Errata 1096

When the CPU raises #NPF on a guest data access and guest CR4.SMAP=1,
it is possible that the CPU microcode implementing DecodeAssist will
fail to read the bytes of the instruction which caused the #NPF. This
is AMD Errata 1096, and it happens because the CPU microcode reading
the instruction bytes incorrectly attempts to read the code as implicit
supervisor-mode data accesses (that is, just like it would read e.g. a
TSS), which are susceptible to SMAP faults. The microcode reads CS:RIP
and if it is a user-mode address according to the page tables, the
processor gives up and returns no instruction bytes. In this case, the
GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly return 0
instead of the correct guest instruction bytes.

Current KVM code attempts to detect and work around this erratum, but
it has multiple issues:

1) It mistakenly checks if guest CR4.SMAP=0 instead of guest
CR4.SMAP=1, which is required for encountering a SMAP fault.

2) It assumes SMAP faults can only occur when guest CPL==3. However, in
case guest CR4.SMEP=0, the guest can execute an instruction which
resides in a user-accessible page with CPL<3 privilege. If this
instruction raises a #NPF on its data access, the CPU DecodeAssist
microcode will still encounter a SMAP violation. Even though no sane OS
will do so (as it is an obvious privilege escalation vulnerability), we
still need to handle this semantically correctly on the KVM side.

Note that (2) *is* a useful optimization, because CR4.SMAP=1 is an
easily triggerable condition and guests usually enable SMAP together
with SMEP. If the vCPU has CR4.SMEP=1, the erratum can indeed be
encountered only at guest CPL==3; otherwise, the CPU would raise a SMEP
fault to the guest instead of #NPF. We keep this condition to avoid
false positives in the detection of the erratum.

In addition, to avoid future confusion and improve code readability,
include details of the erratum in the code and not just in the commit
message.

Fixes: 05d5a4863525 ("KVM: SVM: Workaround errata#1096 (insn_len maybe zero on SMAP violation)")
Cc: Brijesh Singh
Cc: Sean Christopherson
Cc: Paolo Bonzini
Reviewed-by: Boris Ostrovsky
Signed-off-by: Liran Alon
Reviewed-by: Brijesh Singh
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/svm.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 583b9fa656f3..19f69df96758 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -7128,13 +7128,41 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
 
 static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
 {
-	bool is_user, smap;
-
-	is_user = svm_get_cpl(vcpu) == 3;
-	smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
+	unsigned long cr4 = kvm_read_cr4(vcpu);
+	bool smep = cr4 & X86_CR4_SMEP;
+	bool smap = cr4 & X86_CR4_SMAP;
+	bool is_user = svm_get_cpl(vcpu) == 3;
 
 	/*
-	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh
+	 * Detect and workaround Errata 1096 Fam_17h_00_0Fh.
+	 *
+	 * Errata:
+	 * When the CPU raises #NPF on a guest data access and vCPU CR4.SMAP=1,
+	 * it is possible that the CPU microcode implementing DecodeAssist
+	 * will fail to read the bytes of the instruction which caused the
+	 * #NPF. In this case, the GuestIntrBytes field of the VMCB on a
+	 * VMEXIT will incorrectly return 0 instead of the correct guest
+	 * instruction bytes.
+	 *
+	 * This happens because the CPU microcode reading the instruction
+	 * bytes uses a special opcode which attempts to read data using
+	 * CPL=0 privileges. The microcode reads CS:RIP and if it hits a
+	 * SMAP fault, it gives up and returns no instruction bytes.
+	 *
+	 * Detection:
+	 * We reach here in case the CPU supports DecodeAssist, raised #NPF
+	 * and returned 0 in the GuestIntrBytes field of the VMCB.
+	 * First, the erratum can only be triggered in case vCPU CR4.SMAP=1.
+	 * Second, if vCPU CR4.SMEP=1, the erratum can only be triggered
+	 * in case vCPU CPL==3 (because otherwise the guest would have
+	 * triggered a SMEP fault instead of #NPF). Otherwise, with vCPU
+	 * CR4.SMEP=0, the erratum can be triggered at any vCPU CPL. As most
+	 * guests enable SMAP if they have also enabled SMEP, use the above
+	 * logic in order to minimize false positives in detecting the
+	 * erratum while still preserving semantic correctness in all cases.
+	 *
+	 * Workaround:
+	 * To determine what instruction the guest was executing, the
+	 * hypervisor will have to decode the instruction at the
+	 * instruction pointer.
 	 *
 	 * In non SEV guest, hypervisor will be able to read the guest
 	 * memory to decode the instruction pointer when insn_len is zero
@@ -7145,11 +7173,11 @@ static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
 	 * instruction pointer so we will not able to workaround it. Lets
 	 * print the error and request to kill the guest.
 	 */
-	if (is_user && smap) {
+	if (smap && (!smep || is_user)) {
 		if (!sev_guest(vcpu->kvm))
 			return true;
 
-		pr_err_ratelimited("KVM: Guest triggered AMD Erratum 1096\n");
+		pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
 		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 	}
 
-- cgit v1.2.3
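The corrected detection condition reduces to a two-variable predicate.
A standalone sketch that tabulates it (illustrative only; the kernel
reads these bits out of CR4 and the CPL out of the VMCB):

#include <stdbool.h>
#include <stdio.h>

/* Erratum 1096 can be hit iff SMAP=1 and (SMEP=0 or CPL==3). */
static bool may_hit_erratum_1096(bool smap, bool smep, int cpl)
{
	return smap && (!smep || cpl == 3);
}

int main(void)
{
	for (int smap = 0; smap <= 1; smap++)
		for (int smep = 0; smep <= 1; smep++)
			for (int cpl = 0; cpl <= 3; cpl += 3)
				printf("SMAP=%d SMEP=%d CPL=%d -> %s\n",
				       smap, smep, cpl,
				       may_hit_erratum_1096(smap, smep, cpl) ?
				       "possible" : "not possible");
	return 0;
}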
From 2417c87059b0f29535bcedf9360b94a0110bc3da Mon Sep 17 00:00:00 2001
From: Thomas Huth
Date: Thu, 18 Jul 2019 13:55:27 +0200
Subject: KVM: selftests: Remove superfluous define from vmx.c

The code in vmx.c does not use "program_invocation_name", so there
is no need to "#define _GNU_SOURCE" here.

Signed-off-by: Thomas Huth
Signed-off-by: Paolo Bonzini
---
 tools/testing/selftests/kvm/lib/x86_64/vmx.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tools/testing/selftests/kvm/lib/x86_64/vmx.c b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
index fe56d159d65f..204f847bd065 100644
--- a/tools/testing/selftests/kvm/lib/x86_64/vmx.c
+++ b/tools/testing/selftests/kvm/lib/x86_64/vmx.c
@@ -5,8 +5,6 @@
  * Copyright (C) 2018, Google LLC.
  */
 
-#define _GNU_SOURCE /* for program_invocation_name */
-
 #include "test_util.h"
 #include "kvm_util.h"
 #include "processor.h"
-- cgit v1.2.3

From d73eb57b80b98ae147e4e6a7d9877c2ba175f972 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Thu, 18 Jul 2019 19:39:06 +0800
Subject: KVM: Boost vCPUs that are delivering interrupts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inspired by commit 9cac38dd5d (KVM/s390: Set preempted flag during
vcpu wakeup and interrupt delivery), we want to boost not just lock
holders but also vCPUs that are delivering interrupts. Most
smp_call_function_many calls are synchronous, so the IPI target vCPUs
are also good yield candidates.

This patch introduces vcpu->ready to boost vCPUs during wakeup and
interrupt delivery time; unlike s390 we do not reuse vcpu->preempted,
so that voluntarily preempted vCPUs are taken into account by
kvm_vcpu_on_spin, but vmx_vcpu_pi_put is not affected (VT-d PI handles
voluntary preemption separately, in pi_pre_block).

Testing on an 80 HT, 2-socket Xeon Skylake server, with an 80 vCPU VM
and 80GB RAM:

 ebizzy -M
         vanilla    boosting   improved
 1VM       21443       23520         9%
 2VM        2800        8000       180%
 3VM        1800        3100        72%

Testing on my Haswell desktop (8 HT), with an 8 vCPU VM and 8GB RAM,
two VMs, one running ebizzy -M, the other running 'stress --cpu 2':

 w/ boosting + w/o pv sched yield (vanilla)
         vanilla    boosting   improved
            1570        4000       155%

 w/ boosting + w/ pv sched yield (vanilla)
         vanilla    boosting   improved
            1844        5157       179%

 w/o boosting, perf top in VM:
  72.33%  [kernel]       [k] smp_call_function_many
   4.22%  [kernel]       [k] call_function_i
   3.71%  [kernel]       [k] async_page_fault

 w/ boosting, perf top in VM:
  38.43%  [kernel]       [k] smp_call_function_many
   6.31%  [kernel]       [k] async_page_fault
   6.13%  libc-2.23.so   [.] __memcpy_avx_unaligned
   4.88%  [kernel]       [k] call_function_interrupt

Cc: Paolo Bonzini
Cc: Radim Krčmář
Cc: Christian Borntraeger
Cc: Paul Mackerras
Cc: Marc Zyngier
Signed-off-by: Wanpeng Li
Signed-off-by: Paolo Bonzini
---
 arch/s390/kvm/interrupt.c |  2 +-
 include/linux/kvm_host.h  |  1 +
 virt/kvm/kvm_main.c       | 12 ++++++++----
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 9dde4d7d8704..26f8bf4a22a7 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1240,7 +1240,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 		 * The vcpu gave up the cpu voluntarily, mark it as a good
 		 * yield-candidate.
 		 */
-		vcpu->preempted = true;
+		vcpu->ready = true;
 		swake_up_one(&vcpu->wq);
 		vcpu->stat.halt_wakeup++;
 	}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index c5da875f19e3..5c5b5867024c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -318,6 +318,7 @@ struct kvm_vcpu {
 	} spin_loop;
 #endif
 	bool preempted;
+	bool ready;
 	struct kvm_vcpu_arch arch;
 	struct dentry *debugfs_dentry;
 };
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b4ab59dd6846..887f3b0c2b60 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -314,6 +314,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	kvm_vcpu_set_in_spin_loop(vcpu, false);
 	kvm_vcpu_set_dy_eligible(vcpu, false);
 	vcpu->preempted = false;
+	vcpu->ready = false;
 
 	r = kvm_arch_vcpu_init(vcpu);
 	if (r < 0)
@@ -2387,6 +2388,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 	wqp = kvm_arch_vcpu_wq(vcpu);
 	if (swq_has_sleeper(wqp)) {
 		swake_up_one(wqp);
+		WRITE_ONCE(vcpu->ready, true);
 		++vcpu->stat.halt_wakeup;
 		return true;
 	}
@@ -2500,7 +2502,7 @@ void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
 				continue;
 			} else if (pass && i > last_boosted_vcpu)
 				break;
-			if (!READ_ONCE(vcpu->preempted))
+			if (!READ_ONCE(vcpu->ready))
 				continue;
 			if (vcpu == me)
 				continue;
@@ -4203,8 +4205,8 @@ static void kvm_sched_in(struct preempt_notifier *pn, int cpu)
 {
 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
-	if (vcpu->preempted)
-		vcpu->preempted = false;
+	vcpu->preempted = false;
+	WRITE_ONCE(vcpu->ready, false);
 
 	kvm_arch_sched_in(vcpu, cpu);
 
@@ -4216,8 +4218,10 @@ static void kvm_sched_out(struct preempt_notifier *pn,
 {
 	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
 
-	if (current->state == TASK_RUNNING)
+	if (current->state == TASK_RUNNING) {
 		vcpu->preempted = true;
+		WRITE_ONCE(vcpu->ready, true);
+	}
 	kvm_arch_vcpu_put(vcpu);
 }
 
-- cgit v1.2.3
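The net effect of the new flag: a vCPU becomes a directed-yield
candidate as soon as it is woken or involuntarily preempted, and stops
being one once it is scheduled back in. A toy model of the candidate
scan (stand-in types; the kernel's kvm_vcpu_on_spin() additionally
handles eligibility and pass ordering):

#include <stdbool.h>
#include <stdio.h>

struct vcpu {
	int id;
	bool ready;	/* woken or involuntarily preempted */
};

/* Pick the first ready vCPU other than the spinner itself. */
static struct vcpu *pick_yield_target(struct vcpu *vcpus, int n,
				      struct vcpu *me)
{
	for (int i = 0; i < n; i++) {
		struct vcpu *v = &vcpus[i];

		if (v == me || !v->ready)
			continue;
		return v;
	}
	return NULL;
}

int main(void)
{
	struct vcpu vcpus[3] = { {0, false}, {1, true}, {2, false} };
	struct vcpu *target = pick_yield_target(vcpus, 3, &vcpus[0]);

	printf("yield to vCPU %d\n", target ? target->id : -1);
	return 0;
}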
From d984740944308a310f9d33df774e2304fc1e6959 Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Thu, 18 Jul 2019 19:39:07 +0800
Subject: KVM: s390: Use kvm_vcpu_wake_up in kvm_s390_vcpu_wakeup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use kvm_vcpu_wake_up() in kvm_s390_vcpu_wakeup().

Suggested-by: Paolo Bonzini
Cc: Paolo Bonzini
Cc: Radim Krčmář
Cc: Christian Borntraeger
Signed-off-by: Wanpeng Li
Signed-off-by: Paolo Bonzini
---
 arch/s390/kvm/interrupt.c | 23 +++--------------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 26f8bf4a22a7..b5fd6e85657c 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1224,28 +1224,11 @@ no_timer:
 
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 {
-	/*
-	 * We cannot move this into the if, as the CPU might be already
-	 * in kvm_vcpu_block without having the waitqueue set (polling)
-	 */
 	vcpu->valid_wakeup = true;
+	kvm_vcpu_wake_up(vcpu);
+
 	/*
-	 * This is mostly to document, that the read in swait_active could
-	 * be moved before other stores, leading to subtle races.
-	 * All current users do not store or use an atomic like update
-	 */
-	smp_mb__after_atomic();
-	if (swait_active(&vcpu->wq)) {
-		/*
-		 * The vcpu gave up the cpu voluntarily, mark it as a good
-		 * yield-candidate.
-		 */
-		vcpu->ready = true;
-		swake_up_one(&vcpu->wq);
-		vcpu->stat.halt_wakeup++;
-	}
-	/*
-	 * The VCPU might not be sleeping but is executing the VSIE. Let's
+	 * The VCPU might not be sleeping but rather executing VSIE. Let's
 	 * kick it, so it leaves the SIE to process the request.
 	 */
 	kvm_s390_vsie_kick(vcpu);
-- cgit v1.2.3

From 6fc3977ccc5d3c22e851f2dce2d3ce2a0a843842 Mon Sep 17 00:00:00 2001
From: Like Xu
Date: Thu, 18 Jul 2019 13:35:14 +0800
Subject: KVM: x86/vPMU: refine kvm_pmu err msg when event creation failed

If a perf_event creation fails for any reason in the host perf
subsystem, there is currently no chance to log the corresponding event
for the guest, which may cause abnormal sampling data in the guest's
results. In debug mode, this message helps in understanding the state
of the vPMC; it is no longer limited to a single occurrence, but it is
rate-limited so that it does not spam the log.

Suggested-by: Joe Perches
Signed-off-by: Like Xu
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/pmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index aa5a2597305a..cedaa01ceb6f 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -131,8 +131,8 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
 						 intr ? kvm_perf_overflow_intr :
 						 kvm_perf_overflow, pmc);
 	if (IS_ERR(event)) {
-		printk_once("kvm_pmu: event creation failed %ld\n",
-			    PTR_ERR(event));
+		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
+			    PTR_ERR(event), pmc->idx);
 		return;
 	}
-- cgit v1.2.3

From 3b20e03a1066ab2056711166c1b41d421d1ff7b7 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 19 Jul 2019 18:15:08 +0200
Subject: KVM: VMX: dump VMCS on failed entry

This is useful for debugging, and is ratelimited nowadays.

Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/vmx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 280320f74db7..a279447eb75b 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -5829,6 +5829,7 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	}
 
 	if (unlikely(vmx->fail)) {
+		dump_vmcs();
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 			= vmcs_read32(VM_INSTRUCTION_ERROR);
-- cgit v1.2.3

From 88dddc11a8d6b09201b4db9d255b3394d9bc9e57 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 19 Jul 2019 18:41:10 +0200
Subject: KVM: nVMX: do not use dangling shadow VMCS after guest reset

If a KVM guest is reset while running a nested guest, free_nested will
disable the shadow VMCS execution control in the vmcs01. However, on
the next KVM_RUN, vmx_vcpu_run would nevertheless try to sync the
VMCS12 to the shadow VMCS, which has since been freed.

This causes a vmptrld of a NULL pointer on my machine, but Jan reports
that the host hangs altogether. Let's see how much this trivial patch
fixes.
Reported-by: Jan Kiszka
Cc: Liran Alon
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini
---
 arch/x86/kvm/vmx/nested.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 4f23e34f628b..0f1378789bd0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -194,6 +194,7 @@ static void vmx_disable_shadow_vmcs(struct vcpu_vmx *vmx)
 {
 	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_SHADOW_VMCS);
 	vmcs_write64(VMCS_LINK_POINTER, -1ull);
+	vmx->nested.need_vmcs12_to_shadow_sync = false;
 }
 
 static inline void nested_release_evmcs(struct kvm_vcpu *vcpu)
@@ -1341,6 +1342,9 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 	unsigned long val;
 	int i;
 
+	if (WARN_ON(!shadow_vmcs))
+		return;
+
 	preempt_disable();
 
 	vmcs_load(shadow_vmcs);
@@ -1373,6 +1377,9 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 	unsigned long val;
 	int i, q;
 
+	if (WARN_ON(!shadow_vmcs))
+		return;
+
 	vmcs_load(shadow_vmcs);
 
 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
@@ -4436,7 +4443,6 @@ static inline void nested_release_vmcs12(struct kvm_vcpu *vcpu)
 		/* copy to memory all shadowed fields in case they were modified */
 		copy_shadow_to_vmcs12(vmx);
-		vmx->nested.need_vmcs12_to_shadow_sync = false;
 		vmx_disable_shadow_vmcs(vmx);
 	}
 	vmx->nested.posted_intr_nv = -1;
-- cgit v1.2.3

From 30cd8604323dbaf20a80e797fe7057f5b02e394d Mon Sep 17 00:00:00 2001
From: Eric Hankland
Date: Thu, 18 Jul 2019 11:38:18 -0700
Subject: KVM: x86: Add fixed counters to PMU filter

Updates KVM_CAP_PMU_EVENT_FILTER so it can also whitelist or blacklist
fixed counters.

Signed-off-by: Eric Hankland
[No need to check padding fields for zero. - Paolo]
Signed-off-by: Paolo Bonzini
---
 Documentation/virtual/kvm/api.txt | 15 ++++++++++-----
 arch/x86/include/uapi/asm/kvm.h   |  9 ++++++---
 arch/x86/kvm/pmu.c                | 23 +++++++++++++++++++----
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 2cd6250b2896..e54a3f51ddc5 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -4090,17 +4090,22 @@ Parameters: struct kvm_pmu_event_filter (in)
 Returns: 0 on success, -1 on error
 
 struct kvm_pmu_event_filter {
-	__u32 action;
-	__u32 nevents;
-	__u64 events[0];
+	__u32 action;
+	__u32 nevents;
+	__u32 fixed_counter_bitmap;
+	__u32 flags;
+	__u32 pad[4];
+	__u64 events[0];
 };
 
 This ioctl restricts the set of PMU events that the guest can program.
 The argument holds a list of events which will be allowed or denied.
 The eventsel+umask of each event the guest attempts to program is compared
 against the events field to determine whether the guest should have access.
-This only affects general purpose counters; fixed purpose counters can
-be disabled by changing the perfmon CPUID leaf.
+The events field only controls general purpose counters; fixed purpose
+counters are controlled by the fixed_counter_bitmap.
+
+No flags are defined yet, the field must be zero.
 
 Valid values for 'action':
 
 #define KVM_PMU_EVENT_ALLOW 0
 
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index e901b0ab116f..503d3f42da16 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -435,9 +435,12 @@ struct kvm_nested_state {
 
 /* for KVM_CAP_PMU_EVENT_FILTER */
 struct kvm_pmu_event_filter {
-	__u32 action;
-	__u32 nevents;
-	__u64 events[0];
+	__u32 action;
+	__u32 nevents;
+	__u32 fixed_counter_bitmap;
+	__u32 flags;
+	__u32 pad[4];
+	__u64 events[0];
 };
 
 #define KVM_PMU_EVENT_ALLOW 0
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index cedaa01ceb6f..46875bbd0419 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -19,8 +19,8 @@
 #include "lapic.h"
 #include "pmu.h"
 
-/* This keeps the total size of the filter under 4k. */
-#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 63
+/* This is enough to filter the vast majority of currently defined events. */
+#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300
 
 /* NOTE:
  * - Each perf counter is defined as "struct kvm_pmc";
@@ -206,12 +206,24 @@ void reprogram_fixed_counter(struct kvm_pmc *pmc, u8 ctrl, int idx)
 {
 	unsigned en_field = ctrl & 0x3;
 	bool pmi = ctrl & 0x8;
+	struct kvm_pmu_event_filter *filter;
+	struct kvm *kvm = pmc->vcpu->kvm;
 
 	pmc_stop_counter(pmc);
 
 	if (!en_field || !pmc_is_enabled(pmc))
 		return;
 
+	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
+	if (filter) {
+		if (filter->action == KVM_PMU_EVENT_DENY &&
+		    test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
+			return;
+		if (filter->action == KVM_PMU_EVENT_ALLOW &&
+		    !test_bit(idx, (ulong *)&filter->fixed_counter_bitmap))
+			return;
+	}
+
 	pmc_reprogram_counter(pmc, PERF_TYPE_HARDWARE,
 			      kvm_x86_ops->pmu_ops->find_fixed_event(idx),
 			      !(en_field & 0x2), /* exclude user */
@@ -385,6 +397,9 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 	    tmp.action != KVM_PMU_EVENT_DENY)
 		return -EINVAL;
 
+	if (tmp.flags != 0)
+		return -EINVAL;
+
 	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
 		return -E2BIG;
 
@@ -406,8 +421,8 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
 	mutex_unlock(&kvm->lock);
 	synchronize_srcu_expedited(&kvm->srcu);
-	r = 0;
+	r = 0;
 cleanup:
 	kfree(filter);
-	return r;
+	return r;
 }
-- cgit v1.2.3
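To illustrate the extended API from userspace: the sketch below builds
a whitelist that permits one general-purpose event plus fixed counter 0
and applies it with the VM ioctl the patch documents. It assumes a
kernel new enough to define KVM_SET_PMU_EVENT_FILTER in <linux/kvm.h>
and an already-created VM fd; the 0x003c event encoding (unhalted core
cycles, eventsel+umask) is an example value, not mandated by the API.

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int set_pmu_filter(int vm_fd)
{
	struct kvm_pmu_event_filter *f;
	size_t sz = sizeof(*f) + sizeof(__u64);	/* room for one event */
	int ret;

	f = calloc(1, sz);
	if (!f)
		return -1;

	f->action = KVM_PMU_EVENT_ALLOW;	/* whitelist semantics */
	f->nevents = 1;
	f->events[0] = 0x003c;			/* example eventsel+umask */
	f->fixed_counter_bitmap = 1u << 0;	/* allow only fixed counter 0 */
	f->flags = 0;				/* must be zero, per the patch */

	ret = ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
	free(f);
	return ret;
}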