author     Paolo Bonzini <pbonzini@redhat.com>   2018-10-19 15:24:24 +0200
committer  Paolo Bonzini <pbonzini@redhat.com>   2018-10-19 15:24:24 +0200
commit     e42b4a507efa19a90c63e7968c93c4f82d3bc805 (patch)
tree       cc7bf34323b26d3f53d4a9ec4ca1b5c52bc361cd /arch
parent     1e58e5e59148916fa43444a406335a990783fb78 (diff)
parent     e4e11cc0f81ee7be17d6f6fb96128a6d51c0e838 (diff)
Merge tag 'kvmarm-for-v4.20' of git://git.kernel.org/pub/scm/linux/kernel/git/kvmarm/kvmarm into HEAD
KVM/arm updates for 4.20

- Improved guest IPA space support (32 to 52 bits)
- RAS event delivery for 32bit
- PMU fixes
- Guest entry hardening
- Various cleanups
Diffstat (limited to 'arch')
-rw-r--r--  arch/arm/include/asm/kvm_arm.h                |   3
-rw-r--r--  arch/arm/include/asm/kvm_host.h               |  13
-rw-r--r--  arch/arm/include/asm/kvm_mmu.h                |  15
-rw-r--r--  arch/arm/include/asm/stage2_pgtable.h         |  54
-rw-r--r--  arch/arm64/include/asm/cpufeature.h           |  20
-rw-r--r--  arch/arm64/include/asm/kvm_arm.h              | 155
-rw-r--r--  arch/arm64/include/asm/kvm_asm.h              |   3
-rw-r--r--  arch/arm64/include/asm/kvm_host.h             |  18
-rw-r--r--  arch/arm64/include/asm/kvm_hyp.h              |  10
-rw-r--r--  arch/arm64/include/asm/kvm_mmu.h              |  42
-rw-r--r--  arch/arm64/include/asm/ptrace.h               |   3
-rw-r--r--  arch/arm64/include/asm/stage2_pgtable-nopmd.h |  42
-rw-r--r--  arch/arm64/include/asm/stage2_pgtable-nopud.h |  39
-rw-r--r--  arch/arm64/include/asm/stage2_pgtable.h       | 236
-rw-r--r--  arch/arm64/kvm/guest.c                        |   6
-rw-r--r--  arch/arm64/kvm/handle_exit.c                  |   7
-rw-r--r--  arch/arm64/kvm/hyp/Makefile                   |   1
-rw-r--r--  arch/arm64/kvm/hyp/hyp-entry.S                |  16
-rw-r--r--  arch/arm64/kvm/hyp/s2-setup.c                 |  90
-rw-r--r--  arch/arm64/kvm/hyp/switch.c                   |   4
-rw-r--r--  arch/arm64/kvm/hyp/sysreg-sr.c                |  19
-rw-r--r--  arch/arm64/kvm/hyp/tlb.c                      |   4
-rw-r--r--  arch/arm64/kvm/reset.c                        | 108
-rw-r--r--  arch/x86/include/asm/fixmap.h                 |  10
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h            |   7
-rw-r--r--  arch/x86/include/asm/pgtable_64.h             |   3
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt.h               |  17
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c   |  27
-rw-r--r--  arch/x86/kernel/cpu/intel_rdt_rdtgroup.c      |  53
-rw-r--r--  arch/x86/kernel/head64.c                      |  20
-rw-r--r--  arch/x86/kernel/head_64.S                     |  16
-rw-r--r--  arch/x86/kernel/kvmclock.c                    |  52
-rw-r--r--  arch/x86/kernel/paravirt.c                    |   4
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S                 |  19
-rw-r--r--  arch/x86/mm/init.c                            |   4
-rw-r--r--  arch/x86/mm/mem_encrypt.c                     |  24
-rw-r--r--  arch/x86/mm/pgtable.c                         |   9
-rw-r--r--  arch/x86/xen/mmu_pv.c                         |   8
-rw-r--r--  arch/x86/xen/pmu.c                            |   2
39 files changed, 814 insertions, 369 deletions
diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index 3ab8b3781bfe..c3f1f9b304b7 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -133,8 +133,7 @@
* space.
*/
#define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE (_AC(1, ULL) << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK (KVM_PHYS_SIZE - _AC(1, ULL))
+
#define PTRS_PER_S2_PGD (_AC(1, ULL) << (KVM_PHYS_SHIFT - 30))
/* Virtualization Translation Control Register (VTCR) bits */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 3ad482d2f1eb..5ca5d9af0c26 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -273,7 +273,7 @@ static inline void __cpu_init_stage2(void)
kvm_call_hyp(__init_stage2_translation);
}
-static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+static inline int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
return 0;
}
@@ -354,4 +354,15 @@ static inline void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu) {}
struct kvm *kvm_arch_alloc_vm(void);
void kvm_arch_free_vm(struct kvm *kvm);
+static inline int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+ /*
+ * On 32bit ARM, VMs get a static 40bit IPA stage2 setup,
+ * so any non-zero value used as type is illegal.
+ */
+ if (type)
+ return -EINVAL;
+ return 0;
+}
+
#endif /* __ARM_KVM_HOST_H__ */
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index 265ea9cf7df7..5ad1a54f98dc 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -35,16 +35,12 @@
addr; \
})
-/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
- */
-#define KVM_MMU_CACHE_MIN_PAGES 2
-
#ifndef __ASSEMBLY__
#include <linux/highmem.h>
#include <asm/cacheflush.h>
#include <asm/cputype.h>
+#include <asm/kvm_arm.h>
#include <asm/kvm_hyp.h>
#include <asm/pgalloc.h>
#include <asm/stage2_pgtable.h>
@@ -52,6 +48,13 @@
/* Ensure compatibility with arm64 */
#define VA_BITS 32
+#define kvm_phys_shift(kvm) KVM_PHYS_SHIFT
+#define kvm_phys_size(kvm) (1ULL << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - 1ULL)
+#define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK
+
+#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t))
+
int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_io_mappings(phys_addr_t phys_addr, size_t size,
void __iomem **kaddr,
@@ -355,6 +358,8 @@ static inline int hyp_map_aux_data(void)
#define kvm_phys_to_vttbr(addr) (addr)
+static inline void kvm_set_ipa_limit(void) {}
+
#endif /* !__ASSEMBLY__ */
#endif /* __ARM_KVM_MMU_H__ */
diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h
index 460d616bb2d6..f6a7ea805232 100644
--- a/arch/arm/include/asm/stage2_pgtable.h
+++ b/arch/arm/include/asm/stage2_pgtable.h
@@ -19,43 +19,53 @@
#ifndef __ARM_S2_PGTABLE_H_
#define __ARM_S2_PGTABLE_H_
-#define stage2_pgd_none(pgd) pgd_none(pgd)
-#define stage2_pgd_clear(pgd) pgd_clear(pgd)
-#define stage2_pgd_present(pgd) pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address) pud_offset(pgd, address)
-#define stage2_pud_free(pud) pud_free(NULL, pud)
-
-#define stage2_pud_none(pud) pud_none(pud)
-#define stage2_pud_clear(pud) pud_clear(pud)
-#define stage2_pud_present(pud) pud_present(pud)
-#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address) pmd_offset(pud, address)
-#define stage2_pmd_free(pmd) pmd_free(NULL, pmd)
-
-#define stage2_pud_huge(pud) pud_huge(pud)
+/*
+ * kvm_mmu_cache_min_pages() is the number of pages required
+ * to install a stage-2 translation. We pre-allocate the entry
+ * level table at VM creation. Since we have a 3 level page-table,
+ * we need only two pages to add a new mapping.
+ */
+#define kvm_mmu_cache_min_pages(kvm) 2
+
+#define stage2_pgd_none(kvm, pgd) pgd_none(pgd)
+#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd)
+#define stage2_pgd_present(kvm, pgd) pgd_present(pgd)
+#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud)
+#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address)
+#define stage2_pud_free(kvm, pud) pud_free(NULL, pud)
+
+#define stage2_pud_none(kvm, pud) pud_none(pud)
+#define stage2_pud_clear(kvm, pud) pud_clear(pud)
+#define stage2_pud_present(kvm, pud) pud_present(pud)
+#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd)
+#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address)
+#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd)
+
+#define stage2_pud_huge(kvm, pud) pud_huge(pud)
/* Open coded p*d_addr_end that can deal with 64bit addresses */
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
phys_addr_t boundary = (addr + PGDIR_SIZE) & PGDIR_MASK;
return (boundary - 1 < end - 1) ? boundary : end;
}
-#define stage2_pud_addr_end(addr, end) (end)
+#define stage2_pud_addr_end(kvm, addr, end) (end)
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
phys_addr_t boundary = (addr + PMD_SIZE) & PMD_MASK;
return (boundary - 1 < end - 1) ? boundary : end;
}
-#define stage2_pgd_index(addr) pgd_index(addr)
+#define stage2_pgd_index(kvm, addr) pgd_index(addr)
-#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep)
-#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
-#define stage2_pud_table_empty(pudp) false
+#define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep)
+#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp)
+#define stage2_pud_table_empty(kvm, pudp) false
#endif /* __ARM_S2_PGTABLE_H_ */
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 1717ba1db35d..072cc1c970c2 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -530,6 +530,26 @@ void arm64_set_ssbd_mitigation(bool state);
static inline void arm64_set_ssbd_mitigation(bool state) {}
#endif
+static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange)
+{
+ switch (parange) {
+ case 0: return 32;
+ case 1: return 36;
+ case 2: return 40;
+ case 3: return 42;
+ case 4: return 44;
+ case 5: return 48;
+ case 6: return 52;
+ /*
+ * A future PE could use a value unknown to the kernel.
+ * However, by the "D10.1.4 Principles of the ID scheme
+ * for fields in ID registers", ARM DDI 0487C.a, any new
+ * value is guaranteed to be higher than what we know already.
+ * As a safe limit, we return the limit supported by the kernel.
+ */
+ default: return CONFIG_ARM64_PA_BITS;
+ }
+}
#endif /* __ASSEMBLY__ */
#endif
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index aa45df752a16..6e324d1f1231 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -107,6 +107,7 @@
#define VTCR_EL2_RES1 (1 << 31)
#define VTCR_EL2_HD (1 << 22)
#define VTCR_EL2_HA (1 << 21)
+#define VTCR_EL2_PS_SHIFT TCR_EL2_PS_SHIFT
#define VTCR_EL2_PS_MASK TCR_EL2_PS_MASK
#define VTCR_EL2_TG0_MASK TCR_TG0_MASK
#define VTCR_EL2_TG0_4K TCR_TG0_4K
@@ -120,62 +121,149 @@
#define VTCR_EL2_IRGN0_WBWA TCR_IRGN0_WBWA
#define VTCR_EL2_SL0_SHIFT 6
#define VTCR_EL2_SL0_MASK (3 << VTCR_EL2_SL0_SHIFT)
-#define VTCR_EL2_SL0_LVL1 (1 << VTCR_EL2_SL0_SHIFT)
#define VTCR_EL2_T0SZ_MASK 0x3f
-#define VTCR_EL2_T0SZ_40B 24
#define VTCR_EL2_VS_SHIFT 19
#define VTCR_EL2_VS_8BIT (0 << VTCR_EL2_VS_SHIFT)
#define VTCR_EL2_VS_16BIT (1 << VTCR_EL2_VS_SHIFT)
+#define VTCR_EL2_T0SZ(x) TCR_T0SZ(x)
+
/*
* We configure the Stage-2 page tables to always restrict the IPA space to be
* 40 bits wide (T0SZ = 24). Systems with a PARange smaller than 40 bits are
* not known to exist and will break with this configuration.
*
- * VTCR_EL2.PS is extracted from ID_AA64MMFR0_EL1.PARange at boot time
- * (see hyp-init.S).
+ * The VTCR_EL2 is configured per VM and is initialised in kvm_arm_setup_stage2().
*
* Note that when using 4K pages, we concatenate two first level page tables
* together. With 16K pages, we concatenate 16 first level page tables.
*
- * The magic numbers used for VTTBR_X in this patch can be found in Tables
- * D4-23 and D4-25 in ARM DDI 0487A.b.
*/
-#define VTCR_EL2_T0SZ_IPA VTCR_EL2_T0SZ_40B
#define VTCR_EL2_COMMON_BITS (VTCR_EL2_SH0_INNER | VTCR_EL2_ORGN0_WBWA | \
VTCR_EL2_IRGN0_WBWA | VTCR_EL2_RES1)
-#ifdef CONFIG_ARM64_64K_PAGES
/*
- * Stage2 translation configuration:
- * 64kB pages (TG0 = 1)
- * 2 level page tables (SL = 1)
+ * VTCR_EL2:SL0 indicates the entry level for Stage2 translation.
+ * Interestingly, it depends on the page size.
+ * See D.10.2.121, VTCR_EL2, in ARM DDI 0487C.a
+ *
+ * -----------------------------------------
+ * | Entry level | 4K | 16K/64K |
+ * ------------------------------------------
+ * | Level: 0 | 2 | - |
+ * ------------------------------------------
+ * | Level: 1 | 1 | 2 |
+ * ------------------------------------------
+ * | Level: 2 | 0 | 1 |
+ * ------------------------------------------
+ * | Level: 3 | - | 0 |
+ * ------------------------------------------
+ *
+ * The table roughly translates to :
+ *
+ * SL0(PAGE_SIZE, Entry_level) = TGRAN_SL0_BASE - Entry_Level
+ *
+ * Where TGRAN_SL0_BASE is a magic number depending on the page size:
+ * TGRAN_SL0_BASE(4K) = 2
+ * TGRAN_SL0_BASE(16K) = 3
+ * TGRAN_SL0_BASE(64K) = 3
+ * provided we take care of ruling out the unsupported cases and
+ * Entry_Level = 4 - Number_of_levels.
+ *
*/
-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_64K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC 38
+#ifdef CONFIG_ARM64_64K_PAGES
+
+#define VTCR_EL2_TGRAN VTCR_EL2_TG0_64K
+#define VTCR_EL2_TGRAN_SL0_BASE 3UL
+
#elif defined(CONFIG_ARM64_16K_PAGES)
-/*
- * Stage2 translation configuration:
- * 16kB pages (TG0 = 2)
- * 2 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_16K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC 42
+
+#define VTCR_EL2_TGRAN VTCR_EL2_TG0_16K
+#define VTCR_EL2_TGRAN_SL0_BASE 3UL
+
#else /* 4K */
-/*
- * Stage2 translation configuration:
- * 4kB pages (TG0 = 0)
- * 3 level page tables (SL = 1)
- */
-#define VTCR_EL2_TGRAN_FLAGS (VTCR_EL2_TG0_4K | VTCR_EL2_SL0_LVL1)
-#define VTTBR_X_TGRAN_MAGIC 37
+
+#define VTCR_EL2_TGRAN VTCR_EL2_TG0_4K
+#define VTCR_EL2_TGRAN_SL0_BASE 2UL
+
#endif
-#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN_FLAGS)
-#define VTTBR_X (VTTBR_X_TGRAN_MAGIC - VTCR_EL2_T0SZ_IPA)
+#define VTCR_EL2_LVLS_TO_SL0(levels) \
+ ((VTCR_EL2_TGRAN_SL0_BASE - (4 - (levels))) << VTCR_EL2_SL0_SHIFT)
+#define VTCR_EL2_SL0_TO_LVLS(sl0) \
+ ((sl0) + 4 - VTCR_EL2_TGRAN_SL0_BASE)
+#define VTCR_EL2_LVLS(vtcr) \
+ VTCR_EL2_SL0_TO_LVLS(((vtcr) & VTCR_EL2_SL0_MASK) >> VTCR_EL2_SL0_SHIFT)
+
+#define VTCR_EL2_FLAGS (VTCR_EL2_COMMON_BITS | VTCR_EL2_TGRAN)
+#define VTCR_EL2_IPA(vtcr) (64 - ((vtcr) & VTCR_EL2_T0SZ_MASK))
+
+/*
+ * ARM VMSAv8-64 defines an algorithm for finding the translation table
+ * descriptors in section D4.2.8 in ARM DDI 0487C.a.
+ *
+ * The algorithm defines the expectations on the translation table
+ * addresses for each level, based on PAGE_SIZE, entry level
+ * and the translation table size (T0SZ). The variable "x" in the
+ * algorithm determines the alignment of a table base address at a given
+ * level and thus determines the alignment of VTTBR:BADDR for stage2
+ * page table entry level.
+ * Since the number of bits resolved at the entry level could vary
+ * depending on the T0SZ, the value of "x" is defined based on a
+ * Magic constant for a given PAGE_SIZE and Entry Level. The
+ * intermediate levels must be always aligned to the PAGE_SIZE (i.e,
+ * x = PAGE_SHIFT).
+ *
+ * The value of "x" for entry level is calculated as :
+ * x = Magic_N - T0SZ
+ *
+ * where Magic_N is an integer depending on the page size and the entry
+ * level of the page table as below:
+ *
+ * --------------------------------------------
+ * | Entry level | 4K 16K 64K |
+ * --------------------------------------------
+ * | Level: 0 (4 levels) | 28 | - | - |
+ * --------------------------------------------
+ * | Level: 1 (3 levels) | 37 | 31 | 25 |
+ * --------------------------------------------
+ * | Level: 2 (2 levels) | 46 | 42 | 38 |
+ * --------------------------------------------
+ * | Level: 3 (1 level) | - | 53 | 51 |
+ * --------------------------------------------
+ *
+ * We have a magic formula for the Magic_N below:
+ *
+ * Magic_N(PAGE_SIZE, Level) = 64 - ((PAGE_SHIFT - 3) * Number_of_levels)
+ *
+ * where Number_of_levels = (4 - Level). We are only interested in the
+ * value for Entry_Level for the stage2 page table.
+ *
+ * So, given that T0SZ = (64 - IPA_SHIFT), we can compute 'x' as follows:
+ *
+ * x = (64 - ((PAGE_SHIFT - 3) * Number_of_levels)) - (64 - IPA_SHIFT)
+ * = IPA_SHIFT - ((PAGE_SHIFT - 3) * Number of levels)
+ *
+ * Here is one way to explain the Magic Formula:
+ *
+ * x = log2(Size_of_Entry_Level_Table)
+ *
+ * Since, we can resolve (PAGE_SHIFT - 3) bits at each level, and another
+ * PAGE_SHIFT bits in the PTE, we have :
+ *
+ * Bits_Entry_level = IPA_SHIFT - ((PAGE_SHIFT - 3) * (n - 1) + PAGE_SHIFT)
+ * = IPA_SHIFT - (PAGE_SHIFT - 3) * n - 3
+ * where n = number of levels, and since each pointer is 8bytes, we have:
+ *
+ * x = Bits_Entry_Level + 3
+ * = IPA_SHIFT - (PAGE_SHIFT - 3) * n
+ *
+ * The only constraint here is that, we have to find the number of page table
+ * levels for a given IPA size (which we do, see stage2_pt_levels())
+ */
+#define ARM64_VTTBR_X(ipa, levels) ((ipa) - ((levels) * (PAGE_SHIFT - 3)))
-#define VTTBR_BADDR_MASK (((UL(1) << (PHYS_MASK_SHIFT - VTTBR_X)) - 1) << VTTBR_X)
#define VTTBR_VMID_SHIFT (UL(48))
#define VTTBR_VMID_MASK(size) (_AT(u64, (1 << size) - 1) << VTTBR_VMID_SHIFT)
@@ -223,6 +311,13 @@
/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~UL(0xf))
+/*
+ * We have
+ * PAR [PA_Shift - 1 : 12] = PA [PA_Shift - 1 : 12]
+ * HPFAR [PA_Shift - 9 : 4] = FIPA [PA_Shift - 1 : 12]
+ */
+#define PAR_TO_HPFAR(par) \
+ (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)
#define kvm_arm_exception_type \
{0, "IRQ" }, \
diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 102b5a5c47b6..aea01a09eb94 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -30,6 +30,7 @@
#define ARM_EXCEPTION_IRQ 0
#define ARM_EXCEPTION_EL1_SERROR 1
#define ARM_EXCEPTION_TRAP 2
+#define ARM_EXCEPTION_IL 3
/* The hyp-stub will return this for any kvm_call_hyp() call */
#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR
@@ -72,8 +73,6 @@ extern void __vgic_v3_init_lrs(void);
extern u32 __kvm_get_mdcr_el2(void);
-extern u32 __init_stage2_translation(void);
-
/* Home-grown __this_cpu_{ptr,read} variants that always work at HYP */
#define __hyp_this_cpu_ptr(sym) \
({ \
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 3d6d7336f871..f84052f306af 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -53,7 +53,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext);
void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
struct kvm_arch {
@@ -61,11 +61,13 @@ struct kvm_arch {
u64 vmid_gen;
u32 vmid;
- /* 1-level 2nd stage table, protected by kvm->mmu_lock */
+ /* stage2 entry level table */
pgd_t *pgd;
/* VTTBR value associated with above pgd and vmid */
u64 vttbr;
+ /* VTCR_EL2 value for this VM */
+ u64 vtcr;
/* The last vcpu id that ran on each physical CPU */
int __percpu *last_vcpu_ran;
@@ -440,13 +442,7 @@ int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr);
-static inline void __cpu_init_stage2(void)
-{
- u32 parange = kvm_call_hyp(__init_stage2_translation);
-
- WARN_ONCE(parange < 40,
- "PARange is %d bits, unsupported configuration!", parange);
-}
+static inline void __cpu_init_stage2(void) {}
/* Guest/host FPSIMD coordination helpers */
int kvm_arch_vcpu_run_map_fp(struct kvm_vcpu *vcpu);
@@ -509,8 +505,12 @@ static inline int kvm_arm_have_ssbd(void)
void kvm_vcpu_load_sysregs(struct kvm_vcpu *vcpu);
void kvm_vcpu_put_sysregs(struct kvm_vcpu *vcpu);
+void kvm_set_ipa_limit(void);
+
#define __KVM_HAVE_ARCH_VM_ALLOC
struct kvm *kvm_arch_alloc_vm(void);
void kvm_arch_free_vm(struct kvm *kvm);
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type);
+
#endif /* __ARM64_KVM_HOST_H__ */
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 384c34397619..23aca66767f9 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -155,5 +155,15 @@ void deactivate_traps_vhe_put(void);
u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
void __noreturn __hyp_do_panic(unsigned long, ...);
+/*
+ * Must be called from hyp code running at EL2 with an updated VTTBR
+ * and interrupts disabled.
+ */
+static __always_inline void __hyp_text __load_guest_stage2(struct kvm *kvm)
+{
+ write_sysreg(kvm->arch.vtcr, vtcr_el2);
+ write_sysreg(kvm->arch.vttbr, vttbr_el2);
+}
+
#endif /* __ARM64_KVM_HYP_H__ */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index d6fff7de5539..77b1af9e64db 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -141,8 +141,16 @@ static inline unsigned long __kern_hyp_va(unsigned long v)
* We currently only support a 40bit IPA.
*/
#define KVM_PHYS_SHIFT (40)
-#define KVM_PHYS_SIZE (1UL << KVM_PHYS_SHIFT)
-#define KVM_PHYS_MASK (KVM_PHYS_SIZE - 1UL)
+
+#define kvm_phys_shift(kvm) VTCR_EL2_IPA(kvm->arch.vtcr)
+#define kvm_phys_size(kvm) (_AC(1, ULL) << kvm_phys_shift(kvm))
+#define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL))
+
+static inline bool kvm_page_empty(void *ptr)
+{
+ struct page *ptr_page = virt_to_page(ptr);
+ return page_count(ptr_page) == 1;
+}
#include <asm/stage2_pgtable.h>
@@ -238,12 +246,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp)
return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN);
}
-static inline bool kvm_page_empty(void *ptr)
-{
- struct page *ptr_page = virt_to_page(ptr);
- return page_count(ptr_page) == 1;
-}
-
#define hyp_pte_table_empty(ptep) kvm_page_empty(ptep)
#ifdef __PAGETABLE_PMD_FOLDED
@@ -517,5 +519,29 @@ static inline int hyp_map_aux_data(void)
#define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr)
+/*
+ * Get the magic number 'x' for VTTBR:BADDR of this KVM instance.
+ * With v8.2 LVA extensions, 'x' should be a minimum of 6 with
+ * 52bit IPS.
+ */
+static inline int arm64_vttbr_x(u32 ipa_shift, u32 levels)
+{
+ int x = ARM64_VTTBR_X(ipa_shift, levels);
+
+ return (IS_ENABLED(CONFIG_ARM64_PA_BITS_52) && x < 6) ? 6 : x;
+}
+
+static inline u64 vttbr_baddr_mask(u32 ipa_shift, u32 levels)
+{
+ unsigned int x = arm64_vttbr_x(ipa_shift, levels);
+
+ return GENMASK_ULL(PHYS_MASK_SHIFT - 1, x);
+}
+
+static inline u64 kvm_vttbr_baddr_mask(struct kvm *kvm)
+{
+ return vttbr_baddr_mask(kvm_phys_shift(kvm), kvm_stage2_levels(kvm));
+}
+
#endif /* __ASSEMBLY__ */
#endif /* __ARM64_KVM_MMU_H__ */
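
[Editor's note: the per-VM VTTBR:BADDR mask also reduces to familiar numbers for the default configuration. The sketch below is a rough userspace re-derivation of vttbr_baddr_mask() for 4K pages, a 40-bit IPA and 3 levels; PAGE_SHIFT, PHYS_MASK_SHIFT and GENMASK_ULL are redefined locally under the assumption of a 48-bit PA kernel, so treat this as an illustration rather than the kernel's own code.]

/* Rough re-derivation of vttbr_baddr_mask() for one configuration. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT      12      /* 4K pages (assumption) */
#define PHYS_MASK_SHIFT 48      /* assumption: CONFIG_ARM64_PA_BITS=48 */

#define GENMASK_ULL(h, l)  ((~0ULL << (l)) & (~0ULL >> (63 - (h))))

int main(void)
{
        unsigned int ipa = 40, levels = 3;
        unsigned int x = ipa - levels * (PAGE_SHIFT - 3);  /* ARM64_VTTBR_X */

        /* With CONFIG_ARM64_PA_BITS_52 the kernel additionally clamps x to
         * a minimum of 6; not needed here since x = 13 (8K alignment for
         * the two concatenated level-1 tables). */
        printf("x = %u, BADDR mask = %#018llx\n", x,
               (unsigned long long)GENMASK_ULL(PHYS_MASK_SHIFT - 1, x));
        return 0;
}
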
diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h
index 177b851ca6d9..ff35ac1258eb 100644
--- a/arch/arm64/include/asm/ptrace.h
+++ b/arch/arm64/include/asm/ptrace.h
@@ -25,6 +25,9 @@
#define CurrentEL_EL1 (1 << 2)
#define CurrentEL_EL2 (2 << 2)
+/* Additional SPSR bits not exposed in the UABI */
+#define PSR_IL_BIT (1 << 20)
+
/* AArch32-specific ptrace requests */
#define COMPAT_PTRACE_GETREGS 12
#define COMPAT_PTRACE_SETREGS 13
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h
deleted file mode 100644
index 2656a0fd05a6..000000000000
--- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPMD_H_
-#define __ARM64_S2_PGTABLE_NOPMD_H_
-
-#include <asm/stage2_pgtable-nopud.h>
-
-#define __S2_PGTABLE_PMD_FOLDED
-
-#define S2_PMD_SHIFT S2_PUD_SHIFT
-#define S2_PTRS_PER_PMD 1
-#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
-#define S2_PMD_MASK (~(S2_PMD_SIZE-1))
-
-#define stage2_pud_none(pud) (0)
-#define stage2_pud_present(pud) (1)
-#define stage2_pud_clear(pud) do { } while (0)
-#define stage2_pud_populate(pud, pmd) do { } while (0)
-#define stage2_pmd_offset(pud, address) ((pmd_t *)(pud))
-
-#define stage2_pmd_free(pmd) do { } while (0)
-
-#define stage2_pmd_addr_end(addr, end) (end)
-
-#define stage2_pud_huge(pud) (0)
-#define stage2_pmd_table_empty(pmdp) (0)
-
-#endif
diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h
deleted file mode 100644
index 5ee87b54ebf3..000000000000
--- a/arch/arm64/include/asm/stage2_pgtable-nopud.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_S2_PGTABLE_NOPUD_H_
-#define __ARM64_S2_PGTABLE_NOPUD_H_
-
-#define __S2_PGTABLE_PUD_FOLDED
-
-#define S2_PUD_SHIFT S2_PGDIR_SHIFT
-#define S2_PTRS_PER_PUD 1
-#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT)
-#define S2_PUD_MASK (~(S2_PUD_SIZE-1))
-
-#define stage2_pgd_none(pgd) (0)
-#define stage2_pgd_present(pgd) (1)
-#define stage2_pgd_clear(pgd) do { } while (0)
-#define stage2_pgd_populate(pgd, pud) do { } while (0)
-
-#define stage2_pud_offset(pgd, address) ((pud_t *)(pgd))
-
-#define stage2_pud_free(x) do { } while (0)
-
-#define stage2_pud_addr_end(addr, end) (end)
-#define stage2_pud_table_empty(pmdp) (0)
-
-#endif
diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h
index 8b68099348e5..d352f6df8d2c 100644
--- a/arch/arm64/include/asm/stage2_pgtable.h
+++ b/arch/arm64/include/asm/stage2_pgtable.h
@@ -19,9 +19,17 @@
#ifndef __ARM64_S2_PGTABLE_H_
#define __ARM64_S2_PGTABLE_H_
+#include <linux/hugetlb.h>
#include <asm/pgtable.h>
/*
+ * PGDIR_SHIFT determines the size a top-level page table entry can map
+ * and depends on the number of levels in the page table. Compute the
+ * PGDIR_SHIFT for a given number of levels.
+ */
+#define pt_levels_pgdir_shift(lvls) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))
+
+/*
* The hardware supports concatenation of up to 16 tables at stage2 entry level
* and we use the feature whenever possible.
*
@@ -29,112 +37,208 @@
* On arm64, the smallest PAGE_SIZE supported is 4k, which means
* (PAGE_SHIFT - 3) > 4 holds for all page sizes.
* This implies, the total number of page table levels at stage2 expected
- * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4)
+ * by the hardware is actually the number of levels required for (IPA_SHIFT - 4)
* in normal translations(e.g, stage1), since we cannot have another level in
- * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4).
+ * the range (IPA_SHIFT, IPA_SHIFT - 4).
*/
-#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4)
+#define stage2_pgtable_levels(ipa) ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
+#define kvm_stage2_levels(kvm) VTCR_EL2_LVLS(kvm->arch.vtcr)
-/*
- * With all the supported VA_BITs and 40bit guest IPA, the following condition
- * is always true:
- *
- * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS
- *
- * We base our stage-2 page table walker helpers on this assumption and
- * fall back to using the host version of the helper wherever possible.
- * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back
- * to using the host version, since it is guaranteed it is not folded at host.
- *
- * If the condition breaks in the future, we can rearrange the host level
- * definitions and reuse them for stage2. Till then...
- */
-#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS
-#error "Unsupported combination of guest IPA and host VA_BITS."
-#endif
-
-/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */
-#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS)
-#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT)
-#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1))
+/* stage2_pgdir_shift() is the size mapped by top-level stage2 entry for the VM */
+#define stage2_pgdir_shift(kvm) pt_levels_pgdir_shift(kvm_stage2_levels(kvm))
+#define stage2_pgdir_size(kvm) (1ULL << stage2_pgdir_shift(kvm))
+#define stage2_pgdir_mask(kvm) ~(stage2_pgdir_size(kvm) - 1)
/*
* The number of PTRS across all concatenated stage2 tables given by the
* number of bits resolved at the initial level.
+ * If we force more levels than necessary, we may have (stage2_pgdir_shift > IPA),
+ * in which case, stage2_pgd_ptrs will have one entry.
*/
-#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT))
+#define pgd_ptrs_shift(ipa, pgdir_shift) \
+ ((ipa) > (pgdir_shift) ? ((ipa) - (pgdir_shift)) : 0)
+#define __s2_pgd_ptrs(ipa, lvls) \
+ (1 << (pgd_ptrs_shift((ipa), pt_levels_pgdir_shift(lvls))))
+#define __s2_pgd_size(ipa, lvls) (__s2_pgd_ptrs((ipa), (lvls)) * sizeof(pgd_t))
+
+#define stage2_pgd_ptrs(kvm) __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
+#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm))
/*
- * KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation
- * levels in addition to the PGD.
+ * kvm_mmu_cache_min_pages() is the number of pages required to install
+ * a stage-2 translation. We pre-allocate the entry level page table at
+ * the VM creation.
*/
-#define KVM_MMU_CACHE_MIN_PAGES (STAGE2_PGTABLE_LEVELS - 1)
+#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1)
-
-#if STAGE2_PGTABLE_LEVELS > 3
+/* Stage2 PUD definitions when the level is present */
+static inline bool kvm_stage2_has_pud(struct kvm *kvm)
+{
+ return (CONFIG_PGTABLE_LEVELS > 3) && (kvm_stage2_levels(kvm) > 3);
+}
#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
-#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT)
+#define S2_PUD_SIZE (1UL << S2_PUD_SHIFT)
#define S2_PUD_MASK (~(S2_PUD_SIZE - 1))
-#define stage2_pgd_none(pgd) pgd_none(pgd)
-#define stage2_pgd_clear(pgd) pgd_clear(pgd)
-#define stage2_pgd_present(pgd) pgd_present(pgd)
-#define stage2_pgd_populate(pgd, pud) pgd_populate(NULL, pgd, pud)
-#define stage2_pud_offset(pgd, address) pud_offset(pgd, address)
-#define stage2_pud_free(pud) pud_free(NULL, pud)
+static inline bool stage2_pgd_none(struct kvm *kvm, pgd_t pgd)
+{
+ if (kvm_stage2_has_pud(kvm))
+ return pgd_none(pgd);
+ else
+ return 0;
+}
-#define stage2_pud_table_empty(pudp) kvm_page_empty(pudp)
+static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp)
+{
+ if (kvm_stage2_has_pud(kvm))
+ pgd_clear(pgdp);
+}
-static inline phys_addr_t stage2_pud_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline bool stage2_pgd_present(struct kvm *kvm, pgd_t pgd)
{
- phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+ if (kvm_stage2_has_pud(kvm))
+ return pgd_present(pgd);
+ else
+ return 1;
+}
- return (boundary - 1 < end - 1) ? boundary : end;
+static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgd, pud_t *pud)
+{
+ if (kvm_stage2_has_pud(kvm))
+ pgd_populate(NULL, pgd, pud);
+}
+
+static inline pud_t *stage2_pud_offset(struct kvm *kvm,
+ pgd_t *pgd, unsigned long address)
+{
+ if (kvm_stage2_has_pud(kvm))
+ return pud_offset(pgd, address);
+ else
+ return (pud_t *)pgd;
}
-#endif /* STAGE2_PGTABLE_LEVELS > 3 */
+static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud)
+{
+ if (kvm_stage2_has_pud(kvm))
+ pud_free(NULL, pud);
+}
+static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp)
+{
+ if (kvm_stage2_has_pud(kvm))
+ return kvm_page_empty(pudp);
+ else
+ return false;
+}
-#if STAGE2_PGTABLE_LEVELS > 2
+static inline phys_addr_t
+stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+ if (kvm_stage2_has_pud(kvm)) {
+ phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK;
+
+ return (boundary - 1 < end - 1) ? boundary : end;
+ } else {
+ return end;
+ }
+}
+
+/* Stage2 PMD definitions when the level is present */
+static inline bool kvm_stage2_has_pmd(struct kvm *kvm)
+{
+ return (CONFIG_PGTABLE_LEVELS > 2) && (kvm_stage2_levels(kvm) > 2);
+}
#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2)
-#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT)
+#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT)
#define S2_PMD_MASK (~(S2_PMD_SIZE - 1))
-#define stage2_pud_none(pud) pud_none(pud)
-#define stage2_pud_clear(pud) pud_clear(pud)
-#define stage2_pud_present(pud) pud_present(pud)
-#define stage2_pud_populate(pud, pmd) pud_populate(NULL, pud, pmd)
-#define stage2_pmd_offset(pud, address) pmd_offset(pud, address)
-#define stage2_pmd_free(pmd) pmd_free(NULL, pmd)
+static inline bool stage2_pud_none(struct kvm *kvm, pud_t pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pud_none(pud);
+ else
+ return 0;
+}
+
+static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ pud_clear(pud);
+}
-#define stage2_pud_huge(pud) pud_huge(pud)
-#define stage2_pmd_table_empty(pmdp) kvm_page_empty(pmdp)
+static inline bool stage2_pud_present(struct kvm *kvm, pud_t pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pud_present(pud);
+ else
+ return 1;
+}
-static inline phys_addr_t stage2_pmd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pud, pmd_t *pmd)
{
- phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
+ if (kvm_stage2_has_pmd(kvm))
+ pud_populate(NULL, pud, pmd);
+}
- return (boundary - 1 < end - 1) ? boundary : end;
+static inline pmd_t *stage2_pmd_offset(struct kvm *kvm,
+ pud_t *pud, unsigned long address)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pmd_offset(pud, address);
+ else
+ return (pmd_t *)pud;
}
-#endif /* STAGE2_PGTABLE_LEVELS > 2 */
+static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ pmd_free(NULL, pmd);
+}
+
+static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return pud_huge(pud);
+ else
+ return 0;
+}
+
+static inline bool stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp)
+{
+ if (kvm_stage2_has_pmd(kvm))
+ return kvm_page_empty(pmdp);
+ else
+ return 0;
+}
-#define stage2_pte_table_empty(ptep) kvm_page_empty(ptep)
+static inline phys_addr_t
+stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
+{
+ if (kvm_stage2_has_pmd(kvm)) {
+ phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK;
-#if STAGE2_PGTABLE_LEVELS == 2
-#include <asm/stage2_pgtable-nopmd.h>
-#elif STAGE2_PGTABLE_LEVELS == 3
-#include <asm/stage2_pgtable-nopud.h>
-#endif
+ return (boundary - 1 < end - 1) ? boundary : end;
+ } else {
+ return end;
+ }
+}
+static inline bool stage2_pte_table_empty(struct kvm *kvm, pte_t *ptep)
+{
+ return kvm_page_empty(ptep);
+}
-#define stage2_pgd_index(addr) (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1))
+static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr)
+{
+ return (((addr) >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1));
+}
-static inline phys_addr_t stage2_pgd_addr_end(phys_addr_t addr, phys_addr_t end)
+static inline phys_addr_t
+stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end)
{
- phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK;
+ phys_addr_t boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm);
return (boundary - 1 < end - 1) ? boundary : end;
}
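
[Editor's note: the per-VM stage2 geometry above can be checked with simple arithmetic. The sketch below mirrors the ARM64_HW_PGTABLE_LEVELS()/ARM64_HW_PGTABLE_LEVEL_SHIFT() helpers (their definitions are assumed from the arm64 pgtable-hwdef.h) for 4K pages and a 40-bit IPA; it is an illustration, not kernel code.]

/* Standalone arithmetic check of the per-VM stage2 geometry (4K pages). */
#include <stdio.h>

#define PAGE_SHIFT      12
/* Assumed mirrors of the arm64 pgtable-hwdef.h helpers: */
#define ARM64_HW_PGTABLE_LEVELS(va_bits) (((va_bits) - 4) / (PAGE_SHIFT - 3))
#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n)  ((PAGE_SHIFT - 3) * (4 - (n)) + 3)

#define stage2_pgtable_levels(ipa)      ARM64_HW_PGTABLE_LEVELS((ipa) - 4)
#define pt_levels_pgdir_shift(lvls)     ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (lvls))

int main(void)
{
        int ipa = 40;
        int lvls = stage2_pgtable_levels(ipa);          /* (36 - 4) / 9 = 3 */
        int shift = pt_levels_pgdir_shift(lvls);        /* 9 * 3 + 3 = 30   */
        int ptrs = 1 << (ipa - shift);                  /* 1024 entries     */

        /* 1024 * 8 bytes = 8K: two concatenated entry-level tables, as the
         * comment on entry-level concatenation describes. */
        printf("levels=%d pgdir_shift=%d pgd_ptrs=%d pgd_size=%d\n",
               lvls, shift, ptrs, ptrs * 8);
        return 0;
}
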
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 07256b08226c..a74f84d09412 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -338,15 +338,15 @@ int __attribute_const__ kvm_target_cpu(void)
return KVM_ARM_TARGET_CORTEX_A53;
case ARM_CPU_PART_CORTEX_A57:
return KVM_ARM_TARGET_CORTEX_A57;
- };
+ }
break;
case ARM_CPU_IMP_APM:
switch (part_number) {
case APM_CPU_PART_POTENZA:
return KVM_ARM_TARGET_XGENE_POTENZA;
- };
+ }
break;
- };
+ }
/* Return a default generic target */
return KVM_ARM_TARGET_GENERIC_V8;
diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index e5e741bfffe1..35a81bebd02b 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -284,6 +284,13 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
*/
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
return 0;
+ case ARM_EXCEPTION_IL:
+ /*
+ * We attempted an illegal exception return. Guest state must
+ * have been corrupted somehow. Give up.
+ */
+ run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+ return -EINVAL;
default:
kvm_pr_unimpl("Unsupported exception type: %d",
exception_index);
diff --git a/arch/arm64/kvm/hyp/Makefile b/arch/arm64/kvm/hyp/Makefile
index 2fabc2dc1966..82d1904328ad 100644
--- a/arch/arm64/kvm/hyp/Makefile
+++ b/arch/arm64/kvm/hyp/Makefile
@@ -19,7 +19,6 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o
obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
-obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o
# KVM code is run at a different exception code with a different map, so
# compiler instrumentation that inserts callbacks or checks into the code may
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 24b4fbafe3e4..b1f14f736962 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -162,6 +162,20 @@ el1_error:
mov x0, #ARM_EXCEPTION_EL1_SERROR
b __guest_exit
+el2_sync:
+ /* Check for illegal exception return, otherwise panic */
+ mrs x0, spsr_el2
+
+ /* if this was something else, then panic! */
+ tst x0, #PSR_IL_BIT
+ b.eq __hyp_panic
+
+ /* Let's attempt a recovery from the illegal exception return */
+ get_vcpu_ptr x1, x0
+ mov x0, #ARM_EXCEPTION_IL
+ b __guest_exit
+
+
el2_error:
ldp x0, x1, [sp], #16
@@ -240,7 +254,7 @@ ENTRY(__kvm_hyp_vector)
invalid_vect el2t_fiq_invalid // FIQ EL2t
invalid_vect el2t_error_invalid // Error EL2t
- invalid_vect el2h_sync_invalid // Synchronous EL2h
+ valid_vect el2_sync // Synchronous EL2h
invalid_vect el2h_irq_invalid // IRQ EL2h
invalid_vect el2h_fiq_invalid // FIQ EL2h
valid_vect el2_error // Error EL2h
diff --git a/arch/arm64/kvm/hyp/s2-setup.c b/arch/arm64/kvm/hyp/s2-setup.c
deleted file mode 100644
index 603e1ee83e89..000000000000
--- a/arch/arm64/kvm/hyp/s2-setup.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2016 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/types.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_hyp.h>
-
-u32 __hyp_text __init_stage2_translation(void)
-{
- u64 val = VTCR_EL2_FLAGS;
- u64 parange;
- u64 tmp;
-
- /*
- * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS
- * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while
- * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2...
- */
- parange = read_sysreg(id_aa64mmfr0_el1) & 7;
- if (parange > ID_AA64MMFR0_PARANGE_MAX)
- parange = ID_AA64MMFR0_PARANGE_MAX;
- val |= parange << 16;
-
- /* Compute the actual PARange... */
- switch (parange) {
- case 0:
- parange = 32;
- break;
- case 1:
- parange = 36;
- break;
- case 2:
- parange = 40;
- break;
- case 3:
- parange = 42;
- break;
- case 4:
- parange = 44;
- break;
- case 5:
- default:
- parange = 48;
- break;
- }
-
- /*
- * ... and clamp it to 40 bits, unless we have some braindead
- * HW that implements less than that. In all cases, we'll
- * return that value for the rest of the kernel to decide what
- * to do.
- */
- val |= 64 - (parange > 40 ? 40 : parange);
-
- /*
- * Check the availability of Hardware Access Flag / Dirty Bit
- * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2.
- */
- tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf;
- if (tmp)
- val |= VTCR_EL2_HA;
-
- /*
- * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
- * bit in VTCR_EL2.
- */
- tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_VMIDBITS_SHIFT) & 0xf;
- val |= (tmp == ID_AA64MMFR1_VMIDBITS_16) ?
- VTCR_EL2_VS_16BIT :
- VTCR_EL2_VS_8BIT;
-
- write_sysreg(val, vtcr_el2);
-
- return parange;
-}
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index ca46153d7915..7cc175c88a37 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -198,7 +198,7 @@ void deactivate_traps_vhe_put(void)
static void __hyp_text __activate_vm(struct kvm *kvm)
{
- write_sysreg(kvm->arch.vttbr, vttbr_el2);
+ __load_guest_stage2(kvm);
}
static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
@@ -263,7 +263,7 @@ static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar)
return false; /* Translation failed, back to guest */
/* Convert PAR to HPFAR format */
- *hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4;
+ *hpfar = PAR_TO_HPFAR(tmp);
return true;
}
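
[Editor's note: the new PAR_TO_HPFAR() macro replaces the open-coded shift/mask in __translate_far_to_hpfar(). The sketch below, which assumes PHYS_MASK_SHIFT == 48 (no 52-bit PA) and redefines GENMASK_ULL locally, shows the two forms produce the same HPFAR value for an arbitrary PAR; it is an illustration, not kernel code.]

/* Quick equivalence check of PAR_TO_HPFAR vs the old open-coded version. */
#include <stdio.h>
#include <stdint.h>

#define PHYS_MASK_SHIFT 48
#define GENMASK_ULL(h, l) ((~0ULL << (l)) & (~0ULL >> (63 - (h))))
#define PAR_TO_HPFAR(par) \
        (((par) & GENMASK_ULL(PHYS_MASK_SHIFT - 1, 12)) >> 8)

int main(void)
{
        uint64_t par = 0x0000abcde1234c00ULL;   /* arbitrary PAR_EL1 value */
        uint64_t old = ((par >> 12) & ((1ULL << 36) - 1)) << 4;

        /* Both forms move PA[47:12] into the HPFAR/FIPA position [39:4]. */
        printf("new=%#llx old=%#llx %s\n",
               (unsigned long long)PAR_TO_HPFAR(par),
               (unsigned long long)old,
               PAR_TO_HPFAR(par) == old ? "match" : "MISMATCH");
        return 0;
}
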
diff --git a/arch/arm64/kvm/hyp/sysreg-sr.c b/arch/arm64/kvm/hyp/sysreg-sr.c
index 9ce223944983..8dc285318204 100644
--- a/arch/arm64/kvm/hyp/sysreg-sr.c
+++ b/arch/arm64/kvm/hyp/sysreg-sr.c
@@ -152,8 +152,25 @@ static void __hyp_text __sysreg_restore_el1_state(struct kvm_cpu_context *ctxt)
static void __hyp_text
__sysreg_restore_el2_return_state(struct kvm_cpu_context *ctxt)
{
+ u64 pstate = ctxt->gp_regs.regs.pstate;
+ u64 mode = pstate & PSR_AA32_MODE_MASK;
+
+ /*
+ * Safety check to ensure we're setting the CPU up to enter the guest
+ * in a less privileged mode.
+ *
+ * If we are attempting a return to EL2 or higher in AArch64 state,
+ * program SPSR_EL2 with M=EL2h and the IL bit set which ensures that
+ * we'll take an illegal exception state exception immediately after
+ * the ERET to the guest. Attempts to return to AArch32 Hyp will
+ * result in an illegal exception return because EL2's execution state
+ * is determined by SCR_EL3.RW.
+ */
+ if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
+ pstate = PSR_MODE_EL2h | PSR_IL_BIT;
+
write_sysreg_el2(ctxt->gp_regs.regs.pc, elr);
- write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr);
+ write_sysreg_el2(pstate, spsr);
if (cpus_have_const_cap(ARM64_HAS_RAS_EXTN))
write_sysreg_s(ctxt->sys_regs[DISR_EL1], SYS_VDISR_EL2);
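
[Editor's note: the guest-entry hardening above is easier to see with the PSR constants written out. The sketch below re-implements the mode check in userspace with values assumed from the arm64 uapi headers (PSR_MODE_EL2t = 0x8, PSR_MODE_EL2h = 0x9, PSR_MODE32_BIT = 0x10) plus the PSR_IL_BIT added in this series; it is an illustration of the check, not the kernel function.]

/* Sketch of the EL2-return sanity check with explicit PSR values. */
#include <stdio.h>
#include <stdint.h>

#define PSR_AA32_MODE_MASK      0x0000001f
#define PSR_MODE32_BIT          0x00000010
#define PSR_MODE_EL1h           0x00000005
#define PSR_MODE_EL2t           0x00000008
#define PSR_MODE_EL2h           0x00000009
#define PSR_IL_BIT              (1 << 20)

static uint64_t sanitise_spsr(uint64_t pstate)
{
        uint64_t mode = pstate & PSR_AA32_MODE_MASK;

        /* AArch64 EL2 or higher: force an illegal exception return. */
        if (!(mode & PSR_MODE32_BIT) && mode >= PSR_MODE_EL2t)
                return PSR_MODE_EL2h | PSR_IL_BIT;
        return pstate;
}

int main(void)
{
        printf("EL1h guest: %#llx\n",
               (unsigned long long)sanitise_spsr(PSR_MODE_EL1h));
        printf("EL2h guest: %#llx\n",
               (unsigned long long)sanitise_spsr(PSR_MODE_EL2h));
        return 0;
}
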
diff --git a/arch/arm64/kvm/hyp/tlb.c b/arch/arm64/kvm/hyp/tlb.c
index 131c7772703c..4dbd9c69a96d 100644
--- a/arch/arm64/kvm/hyp/tlb.c
+++ b/arch/arm64/kvm/hyp/tlb.c
@@ -30,7 +30,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
* bits. Changing E2H is impossible (goodbye TTBR1_EL2), so
* let's flip TGE before executing the TLB operation.
*/
- write_sysreg(kvm->arch.vttbr, vttbr_el2);
+ __load_guest_stage2(kvm);
val = read_sysreg(hcr_el2);
val &= ~HCR_TGE;
write_sysreg(val, hcr_el2);
@@ -39,7 +39,7 @@ static void __hyp_text __tlb_switch_to_guest_vhe(struct kvm *kvm)
static void __hyp_text __tlb_switch_to_guest_nvhe(struct kvm *kvm)
{
- write_sysreg(kvm->arch.vttbr, vttbr_el2);
+ __load_guest_stage2(kvm);
isb();
}
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index e37c78bbe1ca..b72a3dd56204 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -26,6 +26,7 @@
#include <kvm/arm_arch_timer.h>
+#include <asm/cpufeature.h>
#include <asm/cputype.h>
#include <asm/ptrace.h>
#include <asm/kvm_arm.h>
@@ -33,6 +34,9 @@
#include <asm/kvm_coproc.h>
#include <asm/kvm_mmu.h>
+/* Maximum phys_shift supported for any VM on this host */
+static u32 kvm_ipa_limit;
+
/*
* ARMv8 Reset Values
*/
@@ -55,12 +59,12 @@ static bool cpu_has_32bit_el1(void)
}
/**
- * kvm_arch_dev_ioctl_check_extension
+ * kvm_arch_vm_ioctl_check_extension
*
* We currently assume that the number of HW registers is uniform
* across all CPUs (see cpuinfo_sanity_check).
*/
-int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
+int kvm_arch_vm_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -82,9 +86,11 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_SET_GUEST_DEBUG:
case KVM_CAP_VCPU_ATTRIBUTES:
- case KVM_CAP_VCPU_EVENTS:
r = 1;
break;
+ case KVM_CAP_ARM_VM_IPA_SIZE:
+ r = kvm_ipa_limit;
+ break;
default:
r = 0;
}
@@ -133,3 +139,99 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
/* Reset timer */
return kvm_timer_vcpu_reset(vcpu);
}
+
+void kvm_set_ipa_limit(void)
+{
+ unsigned int ipa_max, pa_max, va_max, parange;
+
+ parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 0x7;
+ pa_max = id_aa64mmfr0_parange_to_phys_shift(parange);
+
+ /* Clamp the IPA limit to the PA size supported by the kernel */
+ ipa_max = (pa_max > PHYS_MASK_SHIFT) ? PHYS_MASK_SHIFT : pa_max;
+ /*
+ * Since our stage2 table is dependent on the stage1 page table code,
+ * we must always honor the following condition:
+ *
+ * Number of levels in Stage1 >= Number of levels in Stage2.
+ *
+ * So clamp the ipa limit further down to limit the number of levels.
+ * Since we can concatenate up to 16 tables at entry level, we could
+ * go up to 4 bits above the maximum VA addressable with the current
+ * number of levels.
+ */
+ va_max = PGDIR_SHIFT + PAGE_SHIFT - 3;
+ va_max += 4;
+
+ if (va_max < ipa_max)
+ ipa_max = va_max;
+
+ /*
+ * If the final limit is lower than the real physical address
+ * limit of the CPUs, report the reason.
+ */
+ if (ipa_max < pa_max)
+ pr_info("kvm: Limiting the IPA size due to kernel %s Address limit\n",
+ (va_max < pa_max) ? "Virtual" : "Physical");
+
+ WARN(ipa_max < KVM_PHYS_SHIFT,
+ "KVM IPA limit (%d bit) is smaller than default size\n", ipa_max);
+ kvm_ipa_limit = ipa_max;
+ kvm_info("IPA Size Limit: %dbits\n", kvm_ipa_limit);
+}
+
+/*
+ * Configure the VTCR_EL2 for this VM. The VTCR value is common
+ * across all the physical CPUs on the system. We use system wide
+ * sanitised values to fill in different fields, except for Hardware
+ * Management of Access Flags. HA Flag is set unconditionally on
+ * all CPUs, as it is safe to run with or without the feature and
+ * the bit is RES0 on CPUs that don't support it.
+ */
+int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type)
+{
+ u64 vtcr = VTCR_EL2_FLAGS;
+ u32 parange, phys_shift;
+ u8 lvls;
+
+ if (type & ~KVM_VM_TYPE_ARM_IPA_SIZE_MASK)
+ return -EINVAL;
+
+ phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type);
+ if (phys_shift) {
+ if (phys_shift > kvm_ipa_limit ||
+ phys_shift < 32)
+ return -EINVAL;
+ } else {
+ phys_shift = KVM_PHYS_SHIFT;
+ }
+
+ parange = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1) & 7;
+ if (parange > ID_AA64MMFR0_PARANGE_MAX)
+ parange = ID_AA64MMFR0_PARANGE_MAX;
+ vtcr |= parange << VTCR_EL2_PS_SHIFT;
+
+ vtcr |= VTCR_EL2_T0SZ(phys_shift);
+ /*
+ * Use a minimum 2 level page table to prevent splitting
+ * host PMD huge pages at stage2.
+ */
+ lvls = stage2_pgtable_levels(phys_shift);
+ if (lvls < 2)
+ lvls = 2;
+ vtcr |= VTCR_EL2_LVLS_TO_SL0(lvls);
+
+ /*
+ * Enable the Hardware Access Flag management, unconditionally
+ * on all CPUs. The feature is RES0 on CPUs without the support
+ * and must be ignored by those CPUs.
+ */
+ vtcr |= VTCR_EL2_HA;
+
+ /* Set the vmid bits */
+ vtcr |= (kvm_get_vmid_bits() == 16) ?
+ VTCR_EL2_VS_16BIT :
+ VTCR_EL2_VS_8BIT;
+ kvm->arch.vtcr = vtcr;
+ return 0;
+}
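
[Editor's note: the user-visible side of kvm_arm_setup_stage2() is the VM "type" argument. The userspace sketch below shows one way to probe the limit and request a 48-bit IPA space; it assumes a <linux/kvm.h> new enough to define KVM_CAP_ARM_VM_IPA_SIZE and KVM_VM_TYPE_ARM_IPA_SIZE(), and is a minimal illustration rather than production VMM code.]

/* Userspace sketch: create a VM with a 48-bit IPA space if advertised. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
        int kvm = open("/dev/kvm", O_RDWR);
        int ipa_limit, vm, type = 0;

        if (kvm < 0)
                return 1;

        /* 0 means the cap is absent; otherwise it is the max IPA bits. */
        ipa_limit = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_ARM_VM_IPA_SIZE);
        if (ipa_limit >= 48)
                type = KVM_VM_TYPE_ARM_IPA_SIZE(48);

        vm = ioctl(kvm, KVM_CREATE_VM, type);   /* type 0 keeps the 40-bit default */
        printf("ipa_limit=%d vm_fd=%d\n", ipa_limit, vm);
        return vm < 0;
}
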
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index e203169931c7..6390bd8c141b 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -14,6 +14,16 @@
#ifndef _ASM_X86_FIXMAP_H
#define _ASM_X86_FIXMAP_H
+/*
+ * Exposed to assembly code for setting up initial page tables. Cannot be
+ * calculated in assembly code (fixmap entries are an enum), but is sanity
+ * checked in the actual fixmap C code to make sure that the fixmap is
+ * covered fully.
+ */
+#define FIXMAP_PMD_NUM 2
+/* fixmap starts downwards from the 507th entry in level2_fixmap_pgt */
+#define FIXMAP_PMD_TOP 507
+
#ifndef __ASSEMBLY__
#include <linux/kernel.h>
#include <asm/acpi.h>
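
[Editor's note: the two new constants pin down which level2_fixmap_pgt slots the early boot code patches. The sketch below just replays the arithmetic with the values from this hunk, assuming a 2MB PMD_SIZE; it mirrors the loop shape used in the head64.c change later in this diff and is purely illustrative.]

/* Which level2_fixmap_pgt slots the new constants cover (2MB PMDs assumed). */
#include <stdio.h>

#define FIXMAP_PMD_NUM  2
#define FIXMAP_PMD_TOP  507
#define PMD_SIZE        (2UL << 20)

int main(void)
{
        int i;

        /* Same loop shape as the head64.c fixup: entries 507 and 506. */
        for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
                printf("pmd[%d] -> one level1 fixmap page table\n", i);

        printf("fixmap window: %lu MB\n", (FIXMAP_PMD_NUM * PMD_SIZE) >> 20);
        return 0;
}
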
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index c0643831706e..616f8e637bc3 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -48,10 +48,13 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
/* Architecture __weak replacement functions */
void __init mem_encrypt_init(void);
+void __init mem_encrypt_free_decrypted_mem(void);
bool sme_active(void);
bool sev_active(void);
+#define __bss_decrypted __attribute__((__section__(".bss..decrypted")))
+
#else /* !CONFIG_AMD_MEM_ENCRYPT */
#define sme_me_mask 0ULL
@@ -77,6 +80,8 @@ early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0;
static inline int __init
early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+#define __bss_decrypted
+
#endif /* CONFIG_AMD_MEM_ENCRYPT */
/*
@@ -88,6 +93,8 @@ early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0;
#define __sme_pa(x) (__pa(x) | sme_me_mask)
#define __sme_pa_nodebug(x) (__pa_nodebug(x) | sme_me_mask)
+extern char __start_bss_decrypted[], __end_bss_decrypted[], __start_bss_decrypted_unused[];
+
#endif /* __ASSEMBLY__ */
#endif /* __X86_MEM_ENCRYPT_H__ */
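
[Editor's note: the new __bss_decrypted attribute exists so data that must remain shared with the hypervisor (the kvmclock pages in this diffstat are the first user) can live in a section whose encryption mask is cleared at boot. The fragment below is a hypothetical, in-kernel style declaration showing the intended usage pattern; the variable name and size are made up, and it is not a standalone program.]

/* Kernel-side fragment (illustrative only, not from this patch). */
#include <linux/mm.h>
#include <asm/mem_encrypt.h>

/* Page-aligned so the whole object sits inside .bss..decrypted PMDs,
 * letting the hypervisor read/write it even when SEV/SME is active. */
static struct my_shared_page {
        unsigned long data[512];
} my_shared __bss_decrypted __aligned(PAGE_SIZE);
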
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index ce2b59047cb8..9c85b54bf03c 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -14,6 +14,7 @@
#include <asm/processor.h>
#include <linux/bitops.h>
#include <linux/threads.h>
+#include <asm/fixmap.h>
extern p4d_t level4_kernel_pgt[512];
extern p4d_t level4_ident_pgt[512];
@@ -22,7 +23,7 @@ extern pud_t level3_ident_pgt[512];
extern pmd_t level2_kernel_pgt[512];
extern pmd_t level2_fixmap_pgt[512];
extern pmd_t level2_ident_pgt[512];
-extern pte_t level1_fixmap_pgt[512];
+extern pte_t level1_fixmap_pgt[512 * FIXMAP_PMD_NUM];
extern pgd_t init_top_pgt[];
#define swapper_pg_dir init_top_pgt
diff --git a/arch/x86/kernel/cpu/intel_rdt.h b/arch/x86/kernel/cpu/intel_rdt.h
index 4e588f36228f..285eb3ec4200 100644
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -382,6 +382,11 @@ static inline bool is_mbm_event(int e)
e <= QOS_L3_MBM_LOCAL_EVENT_ID);
}
+struct rdt_parse_data {
+ struct rdtgroup *rdtgrp;
+ char *buf;
+};
+
/**
* struct rdt_resource - attributes of an RDT resource
* @rid: The index of the resource
@@ -423,16 +428,19 @@ struct rdt_resource {
struct rdt_cache cache;
struct rdt_membw membw;
const char *format_str;
- int (*parse_ctrlval) (void *data, struct rdt_resource *r,
- struct rdt_domain *d);
+ int (*parse_ctrlval)(struct rdt_parse_data *data,
+ struct rdt_resource *r,
+ struct rdt_domain *d);
struct list_head evt_list;
int num_rmid;
unsigned int mon_scale;
unsigned long fflags;
};
-int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d);
-int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d);
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d);
extern struct mutex rdtgroup_mutex;
@@ -536,6 +544,7 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
int update_domains(struct rdt_resource *r, int closid);
+int closids_supported(void);
void closid_free(int closid);
int alloc_rmid(void);
void free_rmid(u32 rmid);
diff --git a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
index af358ca05160..0f53049719cd 100644
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -64,19 +64,19 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
return true;
}
-int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d)
+int parse_bw(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
{
- unsigned long data;
- char *buf = _buf;
+ unsigned long bw_val;
if (d->have_new_ctrl) {
rdt_last_cmd_printf("duplicate domain %d\n", d->id);
return -EINVAL;
}
- if (!bw_validate(buf, &data, r))
+ if (!bw_validate(data->buf, &bw_val, r))
return -EINVAL;
- d->new_ctrl = data;
+ d->new_ctrl = bw_val;
d->have_new_ctrl = true;
return 0;
@@ -123,18 +123,13 @@ static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
return true;
}
-struct rdt_cbm_parse_data {
- struct rdtgroup *rdtgrp;
- char *buf;
-};
-
/*
* Read one cache bit mask (hex). Check that it is valid for the current
* resource type.
*/
-int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
+int parse_cbm(struct rdt_parse_data *data, struct rdt_resource *r,
+ struct rdt_domain *d)
{
- struct rdt_cbm_parse_data *data = _data;
struct rdtgroup *rdtgrp = data->rdtgrp;
u32 cbm_val;
@@ -195,11 +190,17 @@ int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
static int parse_line(char *line, struct rdt_resource *r,
struct rdtgroup *rdtgrp)
{
- struct rdt_cbm_parse_data data;
+ struct rdt_parse_data data;
char *dom = NULL, *id;
struct rdt_domain *d;
unsigned long dom_id;
+ if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
+ r->rid == RDT_RESOURCE_MBA) {
+ rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n");
+ return -EINVAL;
+ }
+
next:
if (!line || line[0] == '\0')
return 0;
diff --git a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
index b799c00bef09..1b8e86a5d5e1 100644
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -97,6 +97,12 @@ void rdt_last_cmd_printf(const char *fmt, ...)
* limited as the number of resources grows.
*/
static int closid_free_map;
+static int closid_free_map_len;
+
+int closids_supported(void)
+{
+ return closid_free_map_len;
+}
static void closid_init(void)
{
@@ -111,6 +117,7 @@ static void closid_init(void)
/* CLOSID 0 is always reserved for the default group */
closid_free_map &= ~1;
+ closid_free_map_len = rdt_min_closid;
}
static int closid_alloc(void)
@@ -802,7 +809,7 @@ static int rdt_bit_usage_show(struct kernfs_open_file *of,
sw_shareable = 0;
exclusive = 0;
seq_printf(seq, "%d=", dom->id);
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
if (!closid_allocated(i))
continue;
mode = rdtgroup_mode_by_closid(i);
@@ -989,7 +996,7 @@ bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
/* Check for overlap with other resource groups */
ctrl = d->ctrl_val;
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
ctrl_b = (unsigned long *)ctrl;
mode = rdtgroup_mode_by_closid(i);
if (closid_allocated(i) && i != closid &&
@@ -1024,16 +1031,27 @@ static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
{
int closid = rdtgrp->closid;
struct rdt_resource *r;
+ bool has_cache = false;
struct rdt_domain *d;
for_each_alloc_enabled_rdt_resource(r) {
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
+ has_cache = true;
list_for_each_entry(d, &r->domains, list) {
if (rdtgroup_cbm_overlaps(r, d, d->ctrl_val[closid],
- rdtgrp->closid, false))
+ rdtgrp->closid, false)) {
+ rdt_last_cmd_puts("schemata overlaps\n");
return false;
+ }
}
}
+ if (!has_cache) {
+ rdt_last_cmd_puts("cannot be exclusive without CAT/CDP\n");
+ return false;
+ }
+
return true;
}
@@ -1085,7 +1103,6 @@ static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
rdtgrp->mode = RDT_MODE_SHAREABLE;
} else if (!strcmp(buf, "exclusive")) {
if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
- rdt_last_cmd_printf("schemata overlaps\n");
ret = -EINVAL;
goto out;
}
@@ -1155,8 +1172,8 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
struct rdt_resource *r;
struct rdt_domain *d;
unsigned int size;
- bool sep = false;
- u32 cbm;
+ bool sep;
+ u32 ctrl;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
if (!rdtgrp) {
@@ -1174,6 +1191,7 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
}
for_each_alloc_enabled_rdt_resource(r) {
+ sep = false;
seq_printf(s, "%*s:", max_name_width, r->name);
list_for_each_entry(d, &r->domains, list) {
if (sep)
@@ -1181,8 +1199,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
size = 0;
} else {
- cbm = d->ctrl_val[rdtgrp->closid];
- size = rdtgroup_cbm_to_size(r, d, cbm);
+ ctrl = (!is_mba_sc(r) ?
+ d->ctrl_val[rdtgrp->closid] :
+ d->mbps_val[rdtgrp->closid]);
+ if (r->rid == RDT_RESOURCE_MBA)
+ size = ctrl;
+ else
+ size = rdtgroup_cbm_to_size(r, d, ctrl);
}
seq_printf(s, "%d=%u", d->id, size);
sep = true;
@@ -2336,12 +2359,18 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
u32 *ctrl;
for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
list_for_each_entry(d, &r->domains, list) {
d->have_new_ctrl = false;
d->new_ctrl = r->cache.shareable_bits;
used_b = r->cache.shareable_bits;
ctrl = d->ctrl_val;
- for (i = 0; i < r->num_closid; i++, ctrl++) {
+ for (i = 0; i < closids_supported(); i++, ctrl++) {
if (closid_allocated(i) && i != closid) {
mode = rdtgroup_mode_by_closid(i);
if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
@@ -2373,6 +2402,12 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
}
for_each_alloc_enabled_rdt_resource(r) {
+ /*
+ * Only initialize default allocations for CBM cache
+ * resources
+ */
+ if (r->rid == RDT_RESOURCE_MBA)
+ continue;
ret = update_domains(r, rdtgrp->closid);
if (ret < 0) {
rdt_last_cmd_puts("failed to initialize allocations\n");
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 8047379e575a..ddee1f0870c4 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -35,6 +35,7 @@
#include <asm/bootparam_utils.h>
#include <asm/microcode.h>
#include <asm/kasan.h>
+#include <asm/fixmap.h>
/*
* Manage page tables very early on.
@@ -112,6 +113,7 @@ static bool __head check_la57_support(unsigned long physaddr)
unsigned long __head __startup_64(unsigned long physaddr,
struct boot_params *bp)
{
+ unsigned long vaddr, vaddr_end;
unsigned long load_delta, *p;
unsigned long pgtable_flags;
pgdval_t *pgd;
@@ -165,7 +167,8 @@ unsigned long __head __startup_64(unsigned long physaddr,
pud[511] += load_delta;
pmd = fixup_pointer(level2_fixmap_pgt, physaddr);
- pmd[506] += load_delta;
+ for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--)
+ pmd[i] += load_delta;
/*
* Set up the identity mapping for the switchover. These
@@ -235,6 +238,21 @@ unsigned long __head __startup_64(unsigned long physaddr,
sme_encrypt_kernel(bp);
/*
+ * Clear the memory encryption mask from the .bss..decrypted section.
+ * The bss section will be memset to zero later in the initialization so
+ * there is no need to zero it after changing the memory encryption
+ * attribute.
+ */
+ if (mem_encrypt_active()) {
+ vaddr = (unsigned long)__start_bss_decrypted;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+ i = pmd_index(vaddr);
+ pmd[i] -= sme_get_me_mask();
+ }
+ }
+
+ /*
* Return the SME encryption mask (if SME is active) to be used as a
* modifier for the initial pgdir entry programmed into CR3.
*/
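
The __startup_64() change above walks the .bss..decrypted virtual range in PMD_SIZE steps and subtracts the SME mask from each covering PMD entry, so the range is mapped with the encryption bit clear before C code touches it. A rough user-space model of that walk, with placeholder sizes, addresses and mask value (none of them the kernel's real numbers):

/*
 * Toy model of the mask-clearing loop above.  PMD_SIZE, the pretend C-bit
 * and the addresses are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

#define PMD_SHIFT	21
#define PMD_SIZE	(1UL << PMD_SHIFT)	/* 2 MiB per PMD entry */
#define PTRS_PER_PMD	512
#define SME_MASK	(1UL << 47)		/* pretend encryption bit */

static unsigned long pmd_index(unsigned long vaddr)
{
	return (vaddr >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}

int main(void)
{
	uint64_t pmd[PTRS_PER_PMD] = { 0 };
	unsigned long start = 0xffffffff82000000UL;	/* pretend __start_bss_decrypted */
	unsigned long end = start + 2 * PMD_SIZE;	/* pretend __end_bss_decrypted */
	unsigned long vaddr;

	for (vaddr = start; vaddr < end; vaddr += PMD_SIZE)
		pmd[pmd_index(vaddr)] |= SME_MASK;	/* mapped encrypted by default */

	for (vaddr = start; vaddr < end; vaddr += PMD_SIZE)
		pmd[pmd_index(vaddr)] -= SME_MASK;	/* what the hunk does: map decrypted */

	printf("PMD entries %lu..%lu now have the mask cleared\n",
	       pmd_index(start), pmd_index(end - 1));
	return 0;
}
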
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 15ebc2fc166e..a3618cf04cf6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -24,6 +24,7 @@
#include "../entry/calling.h"
#include <asm/export.h>
#include <asm/nospec-branch.h>
+#include <asm/fixmap.h>
#ifdef CONFIG_PARAVIRT
#include <asm/asm-offsets.h>
@@ -445,13 +446,20 @@ NEXT_PAGE(level2_kernel_pgt)
KERNEL_IMAGE_SIZE/PMD_SIZE)
NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
+ .fill (512 - 4 - FIXMAP_PMD_NUM),8,0
+ pgtno = 0
+ .rept (FIXMAP_PMD_NUM)
+ .quad level1_fixmap_pgt + (pgtno << PAGE_SHIFT) - __START_KERNEL_map \
+ + _PAGE_TABLE_NOENC;
+ pgtno = pgtno + 1
+ .endr
+ /* 6 MB reserved space + a 2MB hole */
+ .fill 4,8,0
NEXT_PAGE(level1_fixmap_pgt)
+ .rept (FIXMAP_PMD_NUM)
.fill 512,8,0
+ .endr
#undef PMDS
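
The rewritten level2_fixmap_pgt still holds exactly 512 eight-byte entries: (512 - 4 - FIXMAP_PMD_NUM) zero-filled slots, FIXMAP_PMD_NUM pointers to consecutive 4 KiB level-1 tables, and 4 reserved slots; with FIXMAP_PMD_NUM equal to 2 that is 506 + 2 + 4, matching the size of the old 506 + 1 + 5 layout being removed. The .rept block offsets each pointer by pgtno << PAGE_SHIFT, one page per level-1 table. A small sketch of that address arithmetic (the FIXMAP_PMD_NUM value and base address are assumptions, not taken from the headers):

/* Illustration of the .rept address arithmetic above: each PMD slot points
 * at the next 4 KiB level-1 table. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define FIXMAP_PMD_NUM	2		/* assumed value for this sketch */

int main(void)
{
	unsigned long level1_fixmap_pgt = 0x2c0e000UL;	/* made-up base address */
	int pgtno;

	for (pgtno = 0; pgtno < FIXMAP_PMD_NUM; pgtno++)
		printf("level2_fixmap_pgt slot %d -> level-1 table at %#lx\n",
		       pgtno, level1_fixmap_pgt + ((unsigned long)pgtno << PAGE_SHIFT));
	return 0;
}
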
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 1e6764648af3..013fe3d21dbb 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -28,6 +28,7 @@
#include <linux/sched/clock.h>
#include <linux/mm.h>
#include <linux/slab.h>
+#include <linux/set_memory.h>
#include <asm/hypervisor.h>
#include <asm/mem_encrypt.h>
@@ -61,9 +62,10 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
static struct pvclock_vsyscall_time_info
- hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE);
-static struct pvclock_wall_clock wall_clock;
+ hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
+static struct pvclock_wall_clock wall_clock __bss_decrypted;
static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
+static struct pvclock_vsyscall_time_info *hvclock_mem;
static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
{
@@ -236,6 +238,45 @@ static void kvm_shutdown(void)
native_machine_shutdown();
}
+static void __init kvmclock_init_mem(void)
+{
+ unsigned long ncpus;
+ unsigned int order;
+ struct page *p;
+ int r;
+
+ if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
+ return;
+
+ ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
+ order = get_order(ncpus * sizeof(*hvclock_mem));
+
+ p = alloc_pages(GFP_KERNEL, order);
+ if (!p) {
+ pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
+ return;
+ }
+
+ hvclock_mem = page_address(p);
+
+ /*
+ * hvclock is shared between the guest and the hypervisor, must
+ * be mapped decrypted.
+ */
+ if (sev_active()) {
+ r = set_memory_decrypted((unsigned long) hvclock_mem,
+ 1UL << order);
+ if (r) {
+ __free_pages(p, order);
+ hvclock_mem = NULL;
+ pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
+ return;
+ }
+ }
+
+ memset(hvclock_mem, 0, PAGE_SIZE << order);
+}
+
static int __init kvm_setup_vsyscall_timeinfo(void)
{
#ifdef CONFIG_X86_64
@@ -250,6 +291,9 @@ static int __init kvm_setup_vsyscall_timeinfo(void)
kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
#endif
+
+ kvmclock_init_mem();
+
return 0;
}
early_initcall(kvm_setup_vsyscall_timeinfo);
@@ -269,8 +313,10 @@ static int kvmclock_setup_percpu(unsigned int cpu)
/* Use the static page for the first CPUs, allocate otherwise */
if (cpu < HVC_BOOT_ARRAY_SIZE)
p = &hv_clock_boot[cpu];
+ else if (hvclock_mem)
+ p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
else
- p = kzalloc(sizeof(*p), GFP_KERNEL);
+ return -ENOMEM;
per_cpu(hv_clock_per_cpu, cpu) = p;
return p ? 0 : -ENOMEM;
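
The kvmclock hunks depend on two things this excerpt does not show: the __bss_decrypted marker (added to arch/x86/include/asm/mem_encrypt.h per the diffstat) that places hv_clock_boot and wall_clock into the .bss..decrypted section, and the fact that kvmclock_init_mem() has already mapped the hvclock_mem pool decrypted when SEV is active. A sketch of what such a section marker looks like, offered as an assumption rather than a quote of the header:

/* Assumed shape of the __bss_decrypted marker; the real definition is in
 * <asm/mem_encrypt.h>, which this excerpt does not include. */
#define __bss_decrypted __attribute__((__section__(".bss..decrypted")))

/* Usage mirrors the kvmclock hunk: ordinary zero-initialized data, but placed
 * in the section that early boot maps with the encryption bit cleared. */
static unsigned long guest_host_shared_counter __bss_decrypted;

unsigned long *shared_counter_addr(void)
{
	return &guest_host_shared_counter;
}
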
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index afdb303285f8..8dc69d82567e 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -91,7 +91,7 @@ unsigned paravirt_patch_call(void *insnbuf,
if (len < 5) {
#ifdef CONFIG_RETPOLINE
- WARN_ONCE("Failing to patch indirect CALL in %ps\n", (void *)addr);
+ WARN_ONCE(1, "Failing to patch indirect CALL in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
@@ -111,7 +111,7 @@ unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
if (len < 5) {
#ifdef CONFIG_RETPOLINE
- WARN_ONCE("Failing to patch indirect JMP in %ps\n", (void *)addr);
+ WARN_ONCE(1, "Failing to patch indirect JMP in %ps\n", (void *)addr);
#endif
return len; /* call too long for patch site */
}
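
WARN_ONCE() takes the warning condition as its first argument and the format string as its second, so the old calls passed the message as an always-true condition and the code address as the format. The fix supplies an explicit 1, since the enclosing len < 5 test has already decided a warning is due. A minimal kernel-context sketch of the call shape (the helper function name is invented):

#include <linux/bug.h>

/* WARN_ONCE(condition, fmt, ...) only fires, at most once, when its first
 * argument evaluates true; the format string comes second. */
static unsigned int check_patch_len(unsigned int len, void *addr)
{
	if (len < 5)
		WARN_ONCE(1, "patch site at %ps too short (%u bytes)\n", addr, len);
	return len;
}
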
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8bde0a419f86..5dd3317d761f 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -65,6 +65,23 @@ jiffies_64 = jiffies;
#define ALIGN_ENTRY_TEXT_BEGIN . = ALIGN(PMD_SIZE);
#define ALIGN_ENTRY_TEXT_END . = ALIGN(PMD_SIZE);
+/*
+ * This section contains data which will be mapped as decrypted. Memory
+ * encryption operates on a page basis. Make this section PMD-aligned
+ * to avoid splitting the pages while mapping the section early.
+ *
+ * Note: We use a separate section so that only this section gets
+ * decrypted to avoid exposing more than we wish.
+ */
+#define BSS_DECRYPTED \
+ . = ALIGN(PMD_SIZE); \
+ __start_bss_decrypted = .; \
+ *(.bss..decrypted); \
+ . = ALIGN(PAGE_SIZE); \
+ __start_bss_decrypted_unused = .; \
+ . = ALIGN(PMD_SIZE); \
+ __end_bss_decrypted = .; \
+
#else
#define X86_ALIGN_RODATA_BEGIN
@@ -74,6 +91,7 @@ jiffies_64 = jiffies;
#define ALIGN_ENTRY_TEXT_BEGIN
#define ALIGN_ENTRY_TEXT_END
+#define BSS_DECRYPTED
#endif
@@ -355,6 +373,7 @@ SECTIONS
__bss_start = .;
*(.bss..page_aligned)
*(.bss)
+ BSS_DECRYPTED
. = ALIGN(PAGE_SIZE);
__bss_stop = .;
}
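
BSS_DECRYPTED brackets the new section with three linker symbols and PMD-aligns both ends so early boot can remap it with 2 MiB entries without splitting pages; the gap between __start_bss_decrypted_unused and __end_bss_decrypted is alignment padding that mem_encrypt_free_decrypted_mem() later re-encrypts and frees. C code sees linker-script symbols only as addresses; a sketch of how they are typically declared and consumed (the declarations are an assumption, not quoted from a header in this diff):

/* Linker-script symbols have no storage of their own; C declares them as
 * arrays and only ever takes their addresses. */
extern char __start_bss_decrypted[], __start_bss_decrypted_unused[],
	    __end_bss_decrypted[];

/* Bytes actually occupied by __bss_decrypted variables; the tail up to
 * __end_bss_decrypted is PMD-alignment padding. */
static unsigned long bss_decrypted_used_bytes(void)
{
	return (unsigned long)(__start_bss_decrypted_unused - __start_bss_decrypted);
}
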
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 7a8fc26c1115..faca978ebf9d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -815,10 +815,14 @@ void free_kernel_image_pages(void *begin, void *end)
set_memory_np_noalias(begin_ul, len_pages);
}
+void __weak mem_encrypt_free_decrypted_mem(void) { }
+
void __ref free_initmem(void)
{
e820__reallocate_tables();
+ mem_encrypt_free_decrypted_mem();
+
free_kernel_image_pages(&__init_begin, &__init_end);
}
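
free_initmem() gains a hook declared __weak with an empty body, so architectures with nothing to free pay nothing, while arch/x86/mm/mem_encrypt.c below supplies a strong definition that the linker picks instead. A stand-alone illustration of that weak/strong override pattern (the function name is invented for the example):

#include <stdio.h>

/* Generic code: an empty default.  Any other object file that defines a
 * non-weak arch_free_hook() replaces this at link time. */
void __attribute__((weak)) arch_free_hook(void)
{
	puts("weak default: nothing to free");
}

int main(void)
{
	arch_free_hook();	/* runs the strong override if one was linked in */
	return 0;
}
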
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index b2de398d1fd3..006f373f54ab 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -348,6 +348,30 @@ bool sev_active(void)
EXPORT_SYMBOL(sev_active);
/* Architecture __weak replacement functions */
+void __init mem_encrypt_free_decrypted_mem(void)
+{
+ unsigned long vaddr, vaddr_end, npages;
+ int r;
+
+ vaddr = (unsigned long)__start_bss_decrypted_unused;
+ vaddr_end = (unsigned long)__end_bss_decrypted;
+ npages = (vaddr_end - vaddr) >> PAGE_SHIFT;
+
+ /*
+ * The unused memory range was mapped decrypted, change the encryption
+ * attribute from decrypted to encrypted before freeing it.
+ */
+ if (mem_encrypt_active()) {
+ r = set_memory_encrypted(vaddr, npages);
+ if (r) {
+ pr_warn("failed to free unused decrypted pages\n");
+ return;
+ }
+ }
+
+ free_init_pages("unused decrypted", vaddr, vaddr_end);
+}
+
void __init mem_encrypt_init(void)
{
if (!sme_me_mask)
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ae394552fb94..089e78c4effd 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -637,6 +637,15 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
{
unsigned long address = __fix_to_virt(idx);
+#ifdef CONFIG_X86_64
+ /*
+ * Ensure that the static initial page tables are covering the
+ * fixmap completely.
+ */
+ BUILD_BUG_ON(__end_of_permanent_fixed_addresses >
+ (FIXMAP_PMD_NUM * PTRS_PER_PTE));
+#endif
+
if (idx >= __end_of_fixed_addresses) {
BUG();
return;
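
The BUILD_BUG_ON() added above turns "the static initial page tables must cover every permanent fixmap slot" into a compile-time failure: FIXMAP_PMD_NUM level-1 tables provide FIXMAP_PMD_NUM * PTRS_PER_PTE entries. A user-space equivalent using C11 _Static_assert, with placeholder constants rather than the kernel's values:

/* Placeholder values; the real ones come from the kernel's fixmap and
 * page-table headers. */
#define END_OF_PERMANENT_FIXED_ADDRESSES	600
#define FIXMAP_PMD_NUM				2
#define PTRS_PER_PTE				512

/* Mirrors the BUILD_BUG_ON(): fail the build if the fixmap outgrows the
 * statically reserved PTE space (2 * 512 = 1024 slots here). */
_Static_assert(END_OF_PERMANENT_FIXED_ADDRESSES <= FIXMAP_PMD_NUM * PTRS_PER_PTE,
	       "static initial page tables do not cover the fixmap");
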
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 2fe5c9b1816b..dd461c0167ef 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -1907,7 +1907,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
/* L3_k[511] -> level2_fixmap_pgt */
convert_pfn_mfn(level3_kernel_pgt);
- /* L3_k[511][506] -> level1_fixmap_pgt */
+ /* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */
convert_pfn_mfn(level2_fixmap_pgt);
/* We get [511][511] and have Xen's version of level2_kernel_pgt */
@@ -1952,7 +1952,11 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
- set_page_prot(level1_fixmap_pgt, PAGE_KERNEL_RO);
+
+ for (i = 0; i < FIXMAP_PMD_NUM; i++) {
+ set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE,
+ PAGE_KERNEL_RO);
+ }
/* Pin down new L4 */
pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
diff --git a/arch/x86/xen/pmu.c b/arch/x86/xen/pmu.c
index 7d00d4ad44d4..95997e6c0696 100644
--- a/arch/x86/xen/pmu.c
+++ b/arch/x86/xen/pmu.c
@@ -478,7 +478,7 @@ static void xen_convert_regs(const struct xen_pmu_regs *xen_regs,
irqreturn_t xen_pmu_irq_handler(int irq, void *dev_id)
{
int err, ret = IRQ_NONE;
- struct pt_regs regs;
+ struct pt_regs regs = {0};
const struct xen_pmu_data *xenpmu_data = get_xenpmu_data();
uint8_t xenpmu_flags = get_xenpmu_flags();
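
Initializing regs with {0} zeroes every member, so whatever fields xen_convert_regs() leaves untouched no longer feed stale stack contents into the perf interrupt path. A small demonstration of the difference between a zero-initialized and a merely declared automatic struct (struct and field names invented):

#include <stdio.h>
#include <string.h>

struct regs_model { unsigned long ip, sp, flags; };

int main(void)
{
	struct regs_model zeroed = {0};			/* every member is 0 */
	struct regs_model cleared;

	memset(&cleared, 0, sizeof(cleared));		/* equivalent effect at run time */

	/* Without either of the above, the members would hold indeterminate
	 * stack contents, which is what the one-line Xen fix avoids. */
	printf("zeroed.flags=%lu cleared.flags=%lu\n", zeroed.flags, cleared.flags);
	return 0;
}
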