From a987370f8e7a1677ae385042644326d9cd145a20 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 10 Mar 2015 19:06:59 +0000 Subject: arm64: KVM: Fix stage-2 PGD allocation to have per-page refcounting We're using __get_free_pages with to allocate the guest's stage-2 PGD. The standard behaviour of this function is to return a set of pages where only the head page has a valid refcount. This behaviour gets us into trouble when we're trying to increment the refcount on a non-head page: page:ffff7c00cfb693c0 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x4000000000000000() page dumped because: VM_BUG_ON_PAGE((*({ __attribute__((unused)) typeof((&page->_count)->counter) __var = ( typeof((&page->_count)->counter)) 0; (volatile typeof((&page->_count)->counter) *)&((&page->_count)->counter); })) <= 0) BUG: failure at include/linux/mm.h:548/get_page()! Kernel panic - not syncing: BUG! CPU: 1 PID: 1695 Comm: kvm-vcpu-0 Not tainted 4.0.0-rc1+ #3825 Hardware name: APM X-Gene Mustang board (DT) Call trace: [] dump_backtrace+0x0/0x13c [] show_stack+0x10/0x1c [] dump_stack+0x74/0x94 [] panic+0x100/0x240 [] stage2_get_pmd+0x17c/0x2bc [] kvm_handle_guest_abort+0x4b4/0x6b0 [] handle_exit+0x58/0x180 [] kvm_arch_vcpu_ioctl_run+0x114/0x45c [] kvm_vcpu_ioctl+0x2e0/0x754 [] do_vfs_ioctl+0x424/0x5c8 [] SyS_ioctl+0x40/0x78 CPU0: stopping A possible approach for this is to split the compound page using split_page() at allocation time, and change the teardown path to free one page at a time. It turns out that alloc_pages_exact() and free_pages_exact() does exactly that. While we're at it, the PGD allocation code is reworked to reduce duplication. This has been tested on an X-Gene platform with a 4kB/48bit-VA host kernel, and kvmtool hacked to place memory in the second page of the hardware PGD (PUD for the host kernel). Also regression-tested on a Cubietruck (Cortex-A7). [ Reworked to use alloc_pages_exact() and free_pages_exact() and to return pointers directly instead of by reference as arguments - Christoffer ] Reported-by: Mark Rutland Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_mmu.h | 46 ++++------------------------------------ 1 file changed, 4 insertions(+), 42 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 6458b5373142..a099cd9cdef8 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -171,43 +171,6 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) #define KVM_PREALLOC_LEVEL (0) #endif -/** - * kvm_prealloc_hwpgd - allocate inital table for VTTBR - * @kvm: The KVM struct pointer for the VM. - * @pgd: The kernel pseudo pgd - * - * When the kernel uses more levels of page tables than the guest, we allocate - * a fake PGD and pre-populate it to point to the next-level page table, which - * will be the real initial page table pointed to by the VTTBR. - * - * When KVM_PREALLOC_LEVEL==2, we allocate a single page for the PMD and - * the kernel will use folded pud. When KVM_PREALLOC_LEVEL==1, we - * allocate 2 consecutive PUD pages. - */ -static inline int kvm_prealloc_hwpgd(struct kvm *kvm, pgd_t *pgd) -{ - unsigned int i; - unsigned long hwpgd; - - if (KVM_PREALLOC_LEVEL == 0) - return 0; - - hwpgd = __get_free_pages(GFP_KERNEL | __GFP_ZERO, PTRS_PER_S2_PGD_SHIFT); - if (!hwpgd) - return -ENOMEM; - - for (i = 0; i < PTRS_PER_S2_PGD; i++) { - if (KVM_PREALLOC_LEVEL == 1) - pgd_populate(NULL, pgd + i, - (pud_t *)hwpgd + i * PTRS_PER_PUD); - else if (KVM_PREALLOC_LEVEL == 2) - pud_populate(NULL, pud_offset(pgd, 0) + i, - (pmd_t *)hwpgd + i * PTRS_PER_PMD); - } - - return 0; -} - static inline void *kvm_get_hwpgd(struct kvm *kvm) { pgd_t *pgd = kvm->arch.pgd; @@ -224,12 +187,11 @@ static inline void *kvm_get_hwpgd(struct kvm *kvm) return pmd_offset(pud, 0); } -static inline void kvm_free_hwpgd(struct kvm *kvm) +static inline unsigned int kvm_get_hwpgd_size(void) { - if (KVM_PREALLOC_LEVEL > 0) { - unsigned long hwpgd = (unsigned long)kvm_get_hwpgd(kvm); - free_pages(hwpgd, PTRS_PER_S2_PGD_SHIFT); - } + if (KVM_PREALLOC_LEVEL > 0) + return PTRS_PER_S2_PGD * PAGE_SIZE; + return PTRS_PER_S2_PGD * sizeof(pgd_t); } static inline bool kvm_page_empty(void *ptr) -- cgit v1.2.3 From 04b8dc85bf4a64517e3cf20e409eeaa503b15cc1 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 10 Mar 2015 19:07:00 +0000 Subject: arm64: KVM: Do not use pgd_index to index stage-2 pgd The kernel's pgd_index macro is designed to index a normal, page sized array. KVM is a bit diffferent, as we can use concatenated pages to have a bigger address space (for example 40bit IPA with 4kB pages gives us an 8kB PGD. In the above case, the use of pgd_index will always return an index inside the first 4kB, which makes a guest that has memory above 0x8000000000 rather unhappy, as it spins forever in a page fault, whist the host happilly corrupts the lower pgd. The obvious fix is to get our own kvm_pgd_index that does the right thing(tm). Tested on X-Gene with a hacked kvmtool that put memory at a stupidly high address. Reviewed-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm/include/asm/kvm_mmu.h | 3 ++- arch/arm/kvm/mmu.c | 8 ++++---- arch/arm64/include/asm/kvm_mmu.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index c57c41dc7e87..4cf48c3aca13 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -149,13 +149,14 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) (__boundary - 1 < (end) - 1)? __boundary: (end); \ }) +#define kvm_pgd_index(addr) pgd_index(addr) + static inline bool kvm_page_empty(void *ptr) { struct page *ptr_page = virt_to_page(ptr); return page_count(ptr_page) == 1; } - #define kvm_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) #define kvm_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) #define kvm_pud_table_empty(kvm, pudp) (0) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index a48a73c6b866..5656d79c5a44 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -290,7 +290,7 @@ static void unmap_range(struct kvm *kvm, pgd_t *pgdp, phys_addr_t addr = start, end = start + size; phys_addr_t next; - pgd = pgdp + pgd_index(addr); + pgd = pgdp + kvm_pgd_index(addr); do { next = kvm_pgd_addr_end(addr, end); if (!pgd_none(*pgd)) @@ -355,7 +355,7 @@ static void stage2_flush_memslot(struct kvm *kvm, phys_addr_t next; pgd_t *pgd; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); do { next = kvm_pgd_addr_end(addr, end); stage2_flush_puds(kvm, pgd, addr, next); @@ -830,7 +830,7 @@ static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache pgd_t *pgd; pud_t *pud; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); if (WARN_ON(pgd_none(*pgd))) { if (!cache) return NULL; @@ -1120,7 +1120,7 @@ static void stage2_wp_range(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) pgd_t *pgd; phys_addr_t next; - pgd = kvm->arch.pgd + pgd_index(addr); + pgd = kvm->arch.pgd + kvm_pgd_index(addr); do { /* * Release kvm_mmu_lock periodically if the memory region is diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index a099cd9cdef8..bbfb600fa822 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -158,6 +158,8 @@ static inline bool kvm_s2pmd_readonly(pmd_t *pmd) #define PTRS_PER_S2_PGD (1 << PTRS_PER_S2_PGD_SHIFT) #define S2_PGD_ORDER get_order(PTRS_PER_S2_PGD * sizeof(pgd_t)) +#define kvm_pgd_index(addr) (((addr) >> PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) + /* * If we are concatenating first level stage-2 page tables, we would have less * than or equal to 16 pointers in the fake PGD, because that's what the -- cgit v1.2.3 From 84ed7412b5eee1011579b3db7454b9cb6d26fa65 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 10 Mar 2015 19:07:01 +0000 Subject: arm64: KVM: Fix outdated comment about VTCR_EL2.PS Commit 87366d8cf7b3 ("arm64: Add boot time configuration of Intermediate Physical Address size") removed the hardcoded setting of VTCR_EL2.PS to use ID_AA64MMFR0_EL1.PARange instead, but didn't remove the (now rather misleading) comment. Fix the comments to match reality (at least for the next few minutes). Acked-by: Christoffer Dall Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall --- arch/arm64/include/asm/kvm_arm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/arm64') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 94674eb7e7bb..54bb4ba97441 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -129,6 +129,9 @@ * 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are * not known to exist and will break with this configuration. * + * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time + * (see hyp-init.S). + * * Note that when using 4K pages, we concatenate two first level page tables * together. * @@ -138,7 +141,6 @@ #ifdef CONFIG_ARM64_64K_PAGES /* * Stage2 translation configuration: - * 40bits output (PS = 2) * 40bits input (T0SZ = 24) * 64kB pages (TG0 = 1) * 2 level page tables (SL = 1) @@ -150,7 +152,6 @@ #else /* * Stage2 translation configuration: - * 40bits output (PS = 2) * 40bits input (T0SZ = 24) * 4kB pages (TG0 = 0) * 3 level page tables (SL = 1) -- cgit v1.2.3