diff options
author | Paolo Bonzini <pbonzini@redhat.com> | 2022-05-25 05:09:49 -0400 |
---|---|---|
committer | Paolo Bonzini <pbonzini@redhat.com> | 2022-05-25 05:09:49 -0400 |
commit | b699da3dc27959091fafbd906853a38cfc2887a7 (patch) | |
tree | e8e0d4b274876d23ba8b2ec63a12388395fe5246 /arch | |
parent | 47e8eec83262083c7da220446551eaad614218ea (diff) | |
parent | fed9b26b2501ea0ce41ae3a788bcc498440589c6 (diff) | |
download | linux-b699da3dc27959091fafbd906853a38cfc2887a7.tar.bz2 |
Merge tag 'kvm-riscv-5.19-1' of https://github.com/kvm-riscv/linux into HEAD
KVM/riscv changes for 5.19
- Added Sv57x4 support for G-stage page table
- Added range based local HFENCE functions
- Added remote HFENCE functions based on VCPU requests
- Added ISA extension registers in ONE_REG interface
- Updated KVM RISC-V maintainers entry to cover selftests support
Diffstat (limited to 'arch')
53 files changed, 1181 insertions, 429 deletions
diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 0c70eb688a00..2a0739a2350b 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -440,6 +440,9 @@ extern void pci_iounmap(struct pci_dev *dev, void __iomem *addr); #define ARCH_HAS_VALID_PHYS_ADDR_RANGE extern int valid_phys_addr_range(phys_addr_t addr, size_t size); extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); +extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap #endif /* diff --git a/arch/arm/mm/ioremap.c b/arch/arm/mm/ioremap.c index aa08bcb72db9..290702328a33 100644 --- a/arch/arm/mm/ioremap.c +++ b/arch/arm/mm/ioremap.c @@ -493,3 +493,11 @@ void __init early_ioremap_init(void) { early_ioremap_setup(); } + +bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + return memblock_is_map_memory(pfn); +} diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 7fd836bea7eb..3995652daf81 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -192,4 +192,8 @@ extern void __iomem *ioremap_cache(phys_addr_t phys_addr, size_t size); extern int valid_phys_addr_range(phys_addr_t addr, size_t size); extern int valid_mmap_phys_addr_range(unsigned long pfn, size_t size); +extern bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags); +#define arch_memremap_can_ram_remap arch_memremap_can_ram_remap + #endif /* __ASM_IO_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 986837d7ec82..fa7981d0d917 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -75,6 +75,10 @@ obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso-wrap.o obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o +# Force dependency (vdso*-wrap.S includes vdso.so through incbin) +$(obj)/vdso-wrap.o: $(obj)/vdso/vdso.so +$(obj)/vdso32-wrap.o: $(obj)/vdso32/vdso.so + obj-y += probes/ head-y := head.o extra-y += $(head-y) vmlinux.lds diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 4c9b5b4b7a0b..a0f3d0aaa3c5 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -208,6 +208,8 @@ static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = { #ifdef CONFIG_ARM64_ERRATUM_1286807 { ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 3, 0), + /* Kryo4xx Gold (rcpe to rfpe) => (r0p0 to r3p0) */ + ERRATA_MIDR_RANGE(MIDR_QCOM_KRYO_4XX_GOLD, 0xc, 0xe, 0xf, 0xe), }, #endif {}, diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 78ed5fc762eb..2ffc299c2375 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -682,7 +682,6 @@ static const struct __ftr_reg_entry { ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1, &id_aa64isar1_override), - ARM64_FTR_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2), ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2, &id_aa64isar2_override), @@ -839,7 +838,7 @@ static void __init sort_ftr_regs(void) * to sys_id for subsequent binary search in get_arm64_ftr_reg() * to work correctly. */ - BUG_ON(arm64_ftr_regs[i].sys_id < arm64_ftr_regs[i - 1].sys_id); + BUG_ON(arm64_ftr_regs[i].sys_id <= arm64_ftr_regs[i - 1].sys_id); } } diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index 172452f79e46..ac1964ebed1e 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -52,9 +52,6 @@ GCOV_PROFILE := n targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -# Force dependency (incbin is bad) -$(obj)/vdso.o : $(obj)/vdso.so - # Link rule for the .so file, .lds has to be first $(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE $(call if_changed,vdsold_and_vdso_check) diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index ed181bedbffc..05ba1aae1b6f 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -131,9 +131,6 @@ obj-vdso := $(c-obj-vdso) $(c-obj-vdso-gettimeofday) $(asm-obj-vdso) targets += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) -# Force dependency (vdso.s includes vdso.so through incbin) -$(obj)/vdso.o: $(obj)/vdso.so - include/generated/vdso32-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) diff --git a/arch/arm64/mm/ioremap.c b/arch/arm64/mm/ioremap.c index b7c81dacabf0..b21f91cd830d 100644 --- a/arch/arm64/mm/ioremap.c +++ b/arch/arm64/mm/ioremap.c @@ -99,3 +99,11 @@ void __init early_ioremap_init(void) { early_ioremap_setup(); } + +bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + return pfn_is_map_memory(pfn); +} diff --git a/arch/mips/include/asm/timex.h b/arch/mips/include/asm/timex.h index b05bb70a2e46..8026baf46e72 100644 --- a/arch/mips/include/asm/timex.h +++ b/arch/mips/include/asm/timex.h @@ -40,9 +40,9 @@ typedef unsigned int cycles_t; /* - * On R4000/R4400 before version 5.0 an erratum exists such that if the - * cycle counter is read in the exact moment that it is matching the - * compare register, no interrupt will be generated. + * On R4000/R4400 an erratum exists such that if the cycle counter is + * read in the exact moment that it is matching the compare register, + * no interrupt will be generated. * * There is a suggested workaround and also the erratum can't strike if * the compare interrupt isn't being used as the clock source device. @@ -63,7 +63,7 @@ static inline int can_use_mips_counter(unsigned int prid) if (!__builtin_constant_p(cpu_has_counter)) asm volatile("" : "=m" (cpu_data[0].options)); if (likely(cpu_has_counter && - prid >= (PRID_IMP_R4000 | PRID_REV_ENCODE_44(5, 0)))) + prid > (PRID_IMP_R4000 | PRID_REV_ENCODE_44(15, 15)))) return 1; else return 0; diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index caa01457dce6..ed339d7979f3 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -141,15 +141,10 @@ static __init int cpu_has_mfc0_count_bug(void) case CPU_R4400MC: /* * The published errata for the R4400 up to 3.0 say the CPU - * has the mfc0 from count bug. + * has the mfc0 from count bug. This seems the last version + * produced. */ - if ((current_cpu_data.processor_id & 0xff) <= 0x30) - return 1; - - /* - * we assume newer revisions are ok - */ - return 0; + return 1; } return 0; diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 52e550b45692..bd22578859d0 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -38,6 +38,7 @@ config PARISC select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_SMP_IDLE_THREAD select GENERIC_ARCH_TOPOLOGY if SMP + select GENERIC_CPU_DEVICES if !SMP select GENERIC_LIB_DEVMEM_IS_ALLOWED select SYSCTL_ARCH_UNALIGN_ALLOW select SYSCTL_EXCEPTION_TRACE diff --git a/arch/parisc/configs/generic-32bit_defconfig b/arch/parisc/configs/generic-32bit_defconfig index a5fee10d76ee..8ce0ae370680 100644 --- a/arch/parisc/configs/generic-32bit_defconfig +++ b/arch/parisc/configs/generic-32bit_defconfig @@ -6,6 +6,9 @@ CONFIG_BSD_PROCESS_ACCT=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=16 +CONFIG_CGROUPS=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y CONFIG_PERF_EVENTS=y @@ -47,7 +50,6 @@ CONFIG_PARPORT=y CONFIG_PARPORT_PC=m CONFIG_PARPORT_1284=y CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_CRYPTOLOOP=y CONFIG_BLK_DEV_RAM=y CONFIG_BLK_DEV_RAM_SIZE=6144 CONFIG_BLK_DEV_SD=y diff --git a/arch/parisc/configs/generic-64bit_defconfig b/arch/parisc/configs/generic-64bit_defconfig index 1b8fd80cbe7f..57501b0aed92 100644 --- a/arch/parisc/configs/generic-64bit_defconfig +++ b/arch/parisc/configs/generic-64bit_defconfig @@ -16,6 +16,7 @@ CONFIG_CGROUPS=y CONFIG_MEMCG=y CONFIG_CGROUP_PIDS=y CONFIG_CPUSETS=y +CONFIG_USER_NS=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y @@ -267,9 +268,9 @@ CONFIG_CRYPTO_DEFLATE=m CONFIG_CRC_CCITT=m CONFIG_LIBCRC32C=y CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_KERNEL=y CONFIG_STRIP_ASM_SYMS=y CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_FS=y -CONFIG_DEBUG_KERNEL=y CONFIG_DEBUG_STACKOVERFLOW=y # CONFIG_SCHED_DEBUG is not set diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index 939db6fe620b..69765a6dbe89 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -160,7 +160,7 @@ extern void __update_cache(pte_t pte); #define SPACEID_SHIFT (MAX_ADDRBITS - 32) #else #define MAX_ADDRBITS (BITS_PER_LONG) -#define MAX_ADDRESS (1UL << MAX_ADDRBITS) +#define MAX_ADDRESS (1ULL << MAX_ADDRBITS) #define SPACEID_SHIFT 0 #endif diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 23348199f3f8..e7911225a4f8 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -403,7 +403,7 @@ void __init parisc_setup_cache_timing(void) { unsigned long rangetime, alltime; unsigned long size; - unsigned long threshold, threshold2; + unsigned long threshold; alltime = mfctl(16); flush_data_cache(); @@ -418,20 +418,8 @@ void __init parisc_setup_cache_timing(void) alltime, size, rangetime); threshold = L1_CACHE_ALIGN(size * alltime / rangetime); - - /* - * The threshold computed above isn't very reliable since the - * flush times depend greatly on the percentage of dirty lines - * in the flush range. Further, the whole cache time doesn't - * include the time to refill lines that aren't in the mm/vma - * being flushed. By timing glibc build and checks on mako cpus, - * the following formula seems to work reasonably well. The - * value from the timing calculation is too small, and increases - * build and check times by almost a factor two. - */ - threshold2 = cache_info.dc_size * num_online_cpus(); - if (threshold2 > threshold) - threshold = threshold2; + if (threshold > cache_info.dc_size) + threshold = cache_info.dc_size; if (threshold) parisc_cache_flush_threshold = threshold; printk(KERN_INFO "Cache flush threshold set to %lu KiB\n", diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 3343d2fb7889..6e0b86652f30 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -152,7 +152,7 @@ int __kprobes parisc_kprobe_ss_handler(struct pt_regs *regs) /* for absolute branch instructions we can copy iaoq_b. for relative * branch instructions we need to calculate the new address based on the * difference between iaoq_f and iaoq_b. We cannot use iaoq_b without - * modificationt because it's based on our ainsn.insn address. + * modifications because it's based on our ainsn.insn address. */ if (p->post_handler) diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index e59574f65e64..80a0ab372802 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -40,7 +40,10 @@ static void __kprobes *patch_map(void *addr, int fixmap, unsigned long *flags, *need_unmap = 1; set_fixmap(fixmap, page_to_phys(page)); - raw_spin_lock_irqsave(&patch_lock, *flags); + if (flags) + raw_spin_lock_irqsave(&patch_lock, *flags); + else + __acquire(&patch_lock); return (void *) (__fix_to_virt(fixmap) + (uintaddr & ~PAGE_MASK)); } @@ -49,7 +52,10 @@ static void __kprobes patch_unmap(int fixmap, unsigned long *flags) { clear_fixmap(fixmap); - raw_spin_unlock_irqrestore(&patch_lock, *flags); + if (flags) + raw_spin_unlock_irqrestore(&patch_lock, *flags); + else + __release(&patch_lock); } void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) @@ -61,9 +67,8 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) int mapped; /* Make sure we don't have any aliases in cache */ - flush_kernel_dcache_range_asm(start, end); - flush_kernel_icache_range_asm(start, end); - flush_tlb_kernel_range(start, end); + flush_kernel_vmap_range(addr, len); + flush_icache_range(start, end); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, &mapped); @@ -76,10 +81,8 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) * We're crossing a page boundary, so * need to remap */ - flush_kernel_dcache_range_asm((unsigned long)fixmap, - (unsigned long)p); - flush_tlb_kernel_range((unsigned long)fixmap, - (unsigned long)p); + flush_kernel_vmap_range((void *)fixmap, + (p-fixmap) * sizeof(*p)); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); p = fixmap = patch_map(addr, FIX_TEXT_POKE0, &flags, @@ -87,10 +90,10 @@ void __kprobes __patch_text_multiple(void *addr, u32 *insn, unsigned int len) } } - flush_kernel_dcache_range_asm((unsigned long)fixmap, (unsigned long)p); - flush_tlb_kernel_range((unsigned long)fixmap, (unsigned long)p); + flush_kernel_vmap_range((void *)fixmap, (p-fixmap) * sizeof(*p)); if (mapped) patch_unmap(FIX_TEXT_POKE0, &flags); + flush_icache_range(start, end); } void __kprobes __patch_text(void *addr, u32 insn) diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index d98692115221..26eb568f8b96 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -171,6 +171,7 @@ static int __init processor_probe(struct parisc_device *dev) p->cpu_num = cpu_info.cpu_num; p->cpu_loc = cpu_info.cpu_loc; + set_cpu_possible(cpuid, true); store_cpu_topology(cpuid); #ifdef CONFIG_SMP @@ -419,8 +420,7 @@ show_cpuinfo (struct seq_file *m, void *v) } seq_printf(m, " (0x%02lx)\n", boot_cpu_data.pdc.capabilities); - seq_printf(m, "model\t\t: %s\n" - "model name\t: %s\n", + seq_printf(m, "model\t\t: %s - %s\n", boot_cpu_data.pdc.sys_model_name, cpuinfo->dev ? cpuinfo->dev->name : "Unknown"); @@ -461,6 +461,13 @@ static struct parisc_driver cpu_driver __refdata = { */ void __init processor_init(void) { + unsigned int cpu; + reset_cpu_topology(); + + /* reset possible mask. We will mark those which are possible. */ + for_each_possible_cpu(cpu) + set_cpu_possible(cpu, false); + register_parisc_driver(&cpu_driver); } diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index b91cb45ffd4e..f005ddedb50e 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -161,6 +161,8 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_PA11 dma_ops_init(); #endif + + clear_sched_clock_stable(); } /* diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index bb27dfeeddfc..9714fbd7c42d 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -251,13 +251,9 @@ void __init time_init(void) static int __init init_cr16_clocksource(void) { /* - * The cr16 interval timers are not syncronized across CPUs, even if - * they share the same socket. + * The cr16 interval timers are not synchronized across CPUs. */ if (num_online_cpus() > 1 && !running_on_qemu) { - /* mark sched_clock unstable */ - clear_sched_clock_stable(); - clocksource_cr16.name = "cr16_unstable"; clocksource_cr16.flags = CLOCK_SOURCE_UNSTABLE; clocksource_cr16.rating = 0; diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index a6e61cf2cad0..b78f1b9d45c1 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -469,7 +469,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o * panic notifiers, and we should call panic * directly from the location that we wish. * e.g. We should not call panic from - * parisc_terminate, but rather the oter way around. + * parisc_terminate, but rather the other way around. * This hack works, prints the panic message twice, * and it enables reboot timers! */ diff --git a/arch/parisc/math-emu/dfadd.c b/arch/parisc/math-emu/dfadd.c index ec487e07f004..00e561d4aa55 100644 --- a/arch/parisc/math-emu/dfadd.c +++ b/arch/parisc/math-emu/dfadd.c @@ -253,7 +253,7 @@ dbl_fadd( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/dfsub.c b/arch/parisc/math-emu/dfsub.c index c4f30acf2d48..4f03782284bd 100644 --- a/arch/parisc/math-emu/dfsub.c +++ b/arch/parisc/math-emu/dfsub.c @@ -256,7 +256,7 @@ dbl_fsub( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/sfadd.c b/arch/parisc/math-emu/sfadd.c index 838758279d5b..9b98c874dfac 100644 --- a/arch/parisc/math-emu/sfadd.c +++ b/arch/parisc/math-emu/sfadd.c @@ -249,7 +249,7 @@ sgl_fadd( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/parisc/math-emu/sfsub.c b/arch/parisc/math-emu/sfsub.c index 583d3ace4634..29d9eed09d12 100644 --- a/arch/parisc/math-emu/sfsub.c +++ b/arch/parisc/math-emu/sfsub.c @@ -252,7 +252,7 @@ sgl_fsub( return(NOEXCEPTION); } right_exponent = 1; /* Set exponent to reflect different bias - * with denomalized numbers. */ + * with denormalized numbers. */ } else { diff --git a/arch/powerpc/kernel/vdso/gettimeofday.S b/arch/powerpc/kernel/vdso/gettimeofday.S index eb9c81e1c218..0c4ecc8fec5a 100644 --- a/arch/powerpc/kernel/vdso/gettimeofday.S +++ b/arch/powerpc/kernel/vdso/gettimeofday.S @@ -22,12 +22,15 @@ .macro cvdso_call funct call_time=0 .cfi_startproc PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM mflr r0 - .cfi_register lr, r0 PPC_STLU r1, -PPC_MIN_STKFRM(r1) + .cfi_adjust_cfa_offset PPC_MIN_STKFRM PPC_STL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) + .cfi_rel_offset lr, PPC_MIN_STKFRM + PPC_LR_STKOFF #ifdef __powerpc64__ PPC_STL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_rel_offset r2, PPC_MIN_STKFRM + STK_GOT #endif get_datapage r5 .ifeq \call_time @@ -39,13 +42,15 @@ PPC_LL r0, PPC_MIN_STKFRM + PPC_LR_STKOFF(r1) #ifdef __powerpc64__ PPC_LL r2, PPC_MIN_STKFRM + STK_GOT(r1) + .cfi_restore r2 #endif .ifeq \call_time cmpwi r3, 0 .endif mtlr r0 - .cfi_restore lr addi r1, r1, 2 * PPC_MIN_STKFRM + .cfi_restore lr + .cfi_def_cfa_offset 0 crclr so .ifeq \call_time beqlr+ diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S index e3ab9df6cf19..6cfcd20d4668 100644 --- a/arch/powerpc/kvm/book3s_32_sr.S +++ b/arch/powerpc/kvm/book3s_32_sr.S @@ -122,11 +122,27 @@ /* 0x0 - 0xb */ - /* 'current->mm' needs to be in r4 */ - tophys(r4, r2) - lwz r4, MM(r4) - tophys(r4, r4) - /* This only clobbers r0, r3, r4 and r5 */ + /* switch_mmu_context() needs paging, let's enable it */ + mfmsr r9 + ori r11, r9, MSR_DR + mtmsr r11 + sync + + /* switch_mmu_context() clobbers r12, rescue it */ + SAVE_GPR(12, r1) + + /* Calling switch_mmu_context(<inv>, current->mm, <inv>); */ + lwz r4, MM(r2) bl switch_mmu_context + /* restore r12 */ + REST_GPR(12, r1) + + /* Disable paging again */ + mfmsr r9 + li r6, MSR_DR + andc r9, r9, r6 + mtmsr r9 + sync + .endm diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index f58728d5f10d..39962c905542 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -462,7 +462,6 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu { struct papr_scm_perf_stat *stat; struct papr_scm_perf_stats *stats; - char *statid; int index, rc, count; u32 available_events; @@ -493,14 +492,12 @@ static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu for (index = 0, stat = stats->scm_statistic, count = 0; index < available_events; index++, ++stat) { - statid = kzalloc(strlen(stat->stat_id) + 1, GFP_KERNEL); - if (!statid) { + p->nvdimm_events_map[count] = kmemdup_nul(stat->stat_id, 8, GFP_KERNEL); + if (!p->nvdimm_events_map[count]) { rc = -ENOMEM; goto out_nvdimm_events_map; } - strcpy(statid, stat->stat_id); - p->nvdimm_events_map[count] = statid; count++; } p->nvdimm_events_map[count] = NULL; diff --git a/arch/powerpc/platforms/pseries/vas-sysfs.c b/arch/powerpc/platforms/pseries/vas-sysfs.c index 909535ca513a..ec65586cbeb3 100644 --- a/arch/powerpc/platforms/pseries/vas-sysfs.c +++ b/arch/powerpc/platforms/pseries/vas-sysfs.c @@ -27,22 +27,31 @@ struct vas_caps_entry { /* * This function is used to get the notification from the drmgr when - * QoS credits are changed. Though receiving the target total QoS - * credits here, get the official QoS capabilities from the hypervisor. + * QoS credits are changed. */ -static ssize_t update_total_credits_trigger(struct vas_cop_feat_caps *caps, +static ssize_t update_total_credits_store(struct vas_cop_feat_caps *caps, const char *buf, size_t count) { int err; u16 creds; err = kstrtou16(buf, 0, &creds); + /* + * The user space interface from the management console + * notifies OS with the new QoS credits and then the + * hypervisor. So OS has to use this new credits value + * and reconfigure VAS windows (close or reopen depends + * on the credits available) instead of depending on VAS + * QoS capabilities from the hypervisor. + */ if (!err) - err = vas_reconfig_capabilties(caps->win_type); + err = vas_reconfig_capabilties(caps->win_type, creds); if (err) return -EINVAL; + pr_info("Set QoS total credits %u\n", creds); + return count; } @@ -92,7 +101,7 @@ VAS_ATTR_RO(nr_total_credits); VAS_ATTR_RO(nr_used_credits); static struct vas_sysfs_entry update_total_credits_attribute = - __ATTR(update_total_credits, 0200, NULL, update_total_credits_trigger); + __ATTR(update_total_credits, 0200, NULL, update_total_credits_store); static struct attribute *vas_def_capab_attrs[] = { &nr_total_credits_attribute.attr, diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 1f59d78c77a1..ec643bbdb67f 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -779,10 +779,10 @@ static int reconfig_close_windows(struct vas_caps *vcap, int excess_creds, * changes. Reconfig window configurations based on the credits * availability from this new capabilities. */ -int vas_reconfig_capabilties(u8 type) +int vas_reconfig_capabilties(u8 type, int new_nr_creds) { struct vas_cop_feat_caps *caps; - int old_nr_creds, new_nr_creds; + int old_nr_creds; struct vas_caps *vcaps; int rc = 0, nr_active_wins; @@ -795,12 +795,6 @@ int vas_reconfig_capabilties(u8 type) caps = &vcaps->caps; mutex_lock(&vas_pseries_mutex); - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, vcaps->feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (rc) - goto out; - - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); old_nr_creds = atomic_read(&caps->nr_total_credits); @@ -832,7 +826,6 @@ int vas_reconfig_capabilties(u8 type) false); } -out: mutex_unlock(&vas_pseries_mutex); return rc; } @@ -850,7 +843,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int len, rc = 0; + int new_nr_creds, len, rc = 0; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -862,7 +855,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE); + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, + new_nr_creds); + } + if (rc) pr_err("Failed reconfig VAS capabilities with DLPAR\n"); diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 34177881e998..333ffa2f9f42 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -135,7 +135,7 @@ struct pseries_vas_window { }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); -int vas_reconfig_capabilties(u8 type); +int vas_reconfig_capabilties(u8 type, int new_nr_creds); int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h index e935f27b10fd..cc40521e438b 100644 --- a/arch/riscv/include/asm/csr.h +++ b/arch/riscv/include/asm/csr.h @@ -117,6 +117,7 @@ #define HGATP_MODE_SV32X4 _AC(1, UL) #define HGATP_MODE_SV39X4 _AC(8, UL) #define HGATP_MODE_SV48X4 _AC(9, UL) +#define HGATP_MODE_SV57X4 _AC(10, UL) #define HGATP32_MODE_SHIFT 31 #define HGATP32_VMID_SHIFT 22 diff --git a/arch/riscv/include/asm/kvm_host.h b/arch/riscv/include/asm/kvm_host.h index cd4bbcecb0fb..319c8aeb42af 100644 --- a/arch/riscv/include/asm/kvm_host.h +++ b/arch/riscv/include/asm/kvm_host.h @@ -12,12 +12,12 @@ #include <linux/types.h> #include <linux/kvm.h> #include <linux/kvm_types.h> +#include <linux/spinlock.h> #include <asm/csr.h> #include <asm/kvm_vcpu_fp.h> #include <asm/kvm_vcpu_timer.h> -#define KVM_MAX_VCPUS \ - ((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1) +#define KVM_MAX_VCPUS 1024 #define KVM_HALT_POLL_NS_DEFAULT 500000 @@ -27,6 +27,31 @@ KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_VCPU_RESET KVM_ARCH_REQ(1) #define KVM_REQ_UPDATE_HGATP KVM_ARCH_REQ(2) +#define KVM_REQ_FENCE_I \ + KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HFENCE_GVMA_VMID_ALL KVM_REQ_TLB_FLUSH +#define KVM_REQ_HFENCE_VVMA_ALL \ + KVM_ARCH_REQ_FLAGS(4, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_HFENCE \ + KVM_ARCH_REQ_FLAGS(5, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) + +enum kvm_riscv_hfence_type { + KVM_RISCV_HFENCE_UNKNOWN = 0, + KVM_RISCV_HFENCE_GVMA_VMID_GPA, + KVM_RISCV_HFENCE_VVMA_ASID_GVA, + KVM_RISCV_HFENCE_VVMA_ASID_ALL, + KVM_RISCV_HFENCE_VVMA_GVA, +}; + +struct kvm_riscv_hfence { + enum kvm_riscv_hfence_type type; + unsigned long asid; + unsigned long order; + gpa_t addr; + gpa_t size; +}; + +#define KVM_RISCV_VCPU_MAX_HFENCE 64 struct kvm_vm_stat { struct kvm_vm_stat_generic generic; @@ -54,10 +79,10 @@ struct kvm_vmid { }; struct kvm_arch { - /* stage2 vmid */ + /* G-stage vmid */ struct kvm_vmid vmid; - /* stage2 page table */ + /* G-stage page table */ pgd_t *pgd; phys_addr_t pgd_phys; @@ -141,6 +166,9 @@ struct kvm_vcpu_arch { /* VCPU ran at least once */ bool ran_atleast_once; + /* Last Host CPU on which Guest VCPU exited */ + int last_exit_cpu; + /* ISA feature bits (similar to MISA) */ unsigned long isa; @@ -179,6 +207,12 @@ struct kvm_vcpu_arch { /* VCPU Timer */ struct kvm_vcpu_timer timer; + /* HFENCE request queue */ + spinlock_t hfence_lock; + unsigned long hfence_head; + unsigned long hfence_tail; + struct kvm_riscv_hfence hfence_queue[KVM_RISCV_VCPU_MAX_HFENCE]; + /* MMIO instruction details */ struct kvm_mmio_decode mmio_decode; @@ -201,27 +235,71 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} #define KVM_ARCH_WANT_MMU_NOTIFIER -void __kvm_riscv_hfence_gvma_vmid_gpa(unsigned long gpa_divby_4, - unsigned long vmid); -void __kvm_riscv_hfence_gvma_vmid(unsigned long vmid); -void __kvm_riscv_hfence_gvma_gpa(unsigned long gpa_divby_4); -void __kvm_riscv_hfence_gvma_all(void); - -int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, +#define KVM_RISCV_GSTAGE_TLB_MIN_ORDER 12 + +void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, + gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid); +void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_local_hfence_gvma_all(void); +void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, + unsigned long asid, + unsigned long gva, + unsigned long gvsz, + unsigned long order); +void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid, + unsigned long asid); +void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, + unsigned long gva, unsigned long gvsz, + unsigned long order); +void kvm_riscv_local_hfence_vvma_all(unsigned long vmid); + +void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu); + +void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu); +void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu); + +void kvm_riscv_fence_i(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); +void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + gpa_t gpa, gpa_t gpsz, + unsigned long order); +void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); +void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order, unsigned long asid); +void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long asid); +void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order); +void kvm_riscv_hfence_vvma_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask); + +int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write); -int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm); -void kvm_riscv_stage2_free_pgd(struct kvm *kvm); -void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu); -void kvm_riscv_stage2_mode_detect(void); -unsigned long kvm_riscv_stage2_mode(void); -int kvm_riscv_stage2_gpa_bits(void); - -void kvm_riscv_stage2_vmid_detect(void); -unsigned long kvm_riscv_stage2_vmid_bits(void); -int kvm_riscv_stage2_vmid_init(struct kvm *kvm); -bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid); -void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu); +int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm); +void kvm_riscv_gstage_free_pgd(struct kvm *kvm); +void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu); +void kvm_riscv_gstage_mode_detect(void); +unsigned long kvm_riscv_gstage_mode(void); +int kvm_riscv_gstage_gpa_bits(void); + +void kvm_riscv_gstage_vmid_detect(void); +unsigned long kvm_riscv_gstage_vmid_bits(void); +int kvm_riscv_gstage_vmid_init(struct kvm *kvm); +bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid); +void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu); void __kvm_riscv_unpriv_trap(void); diff --git a/arch/riscv/include/uapi/asm/kvm.h b/arch/riscv/include/uapi/asm/kvm.h index f808ad1ce500..6119368ba6d5 100644 --- a/arch/riscv/include/uapi/asm/kvm.h +++ b/arch/riscv/include/uapi/asm/kvm.h @@ -82,6 +82,23 @@ struct kvm_riscv_timer { __u64 state; }; +/* + * ISA extension IDs specific to KVM. This is not the same as the host ISA + * extension IDs as that is internal to the host and should not be exposed + * to the guest. This should always be contiguous to keep the mapping simple + * in KVM implementation. + */ +enum KVM_RISCV_ISA_EXT_ID { + KVM_RISCV_ISA_EXT_A = 0, + KVM_RISCV_ISA_EXT_C, + KVM_RISCV_ISA_EXT_D, + KVM_RISCV_ISA_EXT_F, + KVM_RISCV_ISA_EXT_H, + KVM_RISCV_ISA_EXT_I, + KVM_RISCV_ISA_EXT_M, + KVM_RISCV_ISA_EXT_MAX, +}; + /* Possible states for kvm_riscv_timer */ #define KVM_RISCV_TIMER_STATE_OFF 0 #define KVM_RISCV_TIMER_STATE_ON 1 @@ -123,6 +140,9 @@ struct kvm_riscv_timer { #define KVM_REG_RISCV_FP_D_REG(name) \ (offsetof(struct __riscv_d_ext_state, name) / sizeof(__u64)) +/* ISA Extension registers are mapped as type 7 */ +#define KVM_REG_RISCV_ISA_EXT (0x07 << KVM_REG_RISCV_TYPE_SHIFT) + #endif #endif /* __LINUX_KVM_RISCV_H */ diff --git a/arch/riscv/kvm/main.c b/arch/riscv/kvm/main.c index 2e5ca43c8c49..1549205fe5fe 100644 --- a/arch/riscv/kvm/main.c +++ b/arch/riscv/kvm/main.c @@ -89,13 +89,13 @@ int kvm_arch_init(void *opaque) return -ENODEV; } - kvm_riscv_stage2_mode_detect(); + kvm_riscv_gstage_mode_detect(); - kvm_riscv_stage2_vmid_detect(); + kvm_riscv_gstage_vmid_detect(); kvm_info("hypervisor extension available\n"); - switch (kvm_riscv_stage2_mode()) { + switch (kvm_riscv_gstage_mode()) { case HGATP_MODE_SV32X4: str = "Sv32x4"; break; @@ -105,12 +105,15 @@ int kvm_arch_init(void *opaque) case HGATP_MODE_SV48X4: str = "Sv48x4"; break; + case HGATP_MODE_SV57X4: + str = "Sv57x4"; + break; default: return -ENODEV; } kvm_info("using %s G-stage page table format\n", str); - kvm_info("VMID %ld bits available\n", kvm_riscv_stage2_vmid_bits()); + kvm_info("VMID %ld bits available\n", kvm_riscv_gstage_vmid_bits()); return 0; } diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c index f80a34fbf102..1c00695ebee7 100644 --- a/arch/riscv/kvm/mmu.c +++ b/arch/riscv/kvm/mmu.c @@ -18,53 +18,52 @@ #include <asm/csr.h> #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/sbi.h> #ifdef CONFIG_64BIT -static unsigned long stage2_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); -static unsigned long stage2_pgd_levels = 3; -#define stage2_index_bits 9 +static unsigned long gstage_mode = (HGATP_MODE_SV39X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels = 3; +#define gstage_index_bits 9 #else -static unsigned long stage2_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); -static unsigned long stage2_pgd_levels = 2; -#define stage2_index_bits 10 +static unsigned long gstage_mode = (HGATP_MODE_SV32X4 << HGATP_MODE_SHIFT); +static unsigned long gstage_pgd_levels = 2; +#define gstage_index_bits 10 #endif -#define stage2_pgd_xbits 2 -#define stage2_pgd_size (1UL << (HGATP_PAGE_SHIFT + stage2_pgd_xbits)) -#define stage2_gpa_bits (HGATP_PAGE_SHIFT + \ - (stage2_pgd_levels * stage2_index_bits) + \ - stage2_pgd_xbits) -#define stage2_gpa_size ((gpa_t)(1ULL << stage2_gpa_bits)) +#define gstage_pgd_xbits 2 +#define gstage_pgd_size (1UL << (HGATP_PAGE_SHIFT + gstage_pgd_xbits)) +#define gstage_gpa_bits (HGATP_PAGE_SHIFT + \ + (gstage_pgd_levels * gstage_index_bits) + \ + gstage_pgd_xbits) +#define gstage_gpa_size ((gpa_t)(1ULL << gstage_gpa_bits)) -#define stage2_pte_leaf(__ptep) \ +#define gstage_pte_leaf(__ptep) \ (pte_val(*(__ptep)) & (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)) -static inline unsigned long stage2_pte_index(gpa_t addr, u32 level) +static inline unsigned long gstage_pte_index(gpa_t addr, u32 level) { unsigned long mask; - unsigned long shift = HGATP_PAGE_SHIFT + (stage2_index_bits * level); + unsigned long shift = HGATP_PAGE_SHIFT + (gstage_index_bits * level); - if (level == (stage2_pgd_levels - 1)) - mask = (PTRS_PER_PTE * (1UL << stage2_pgd_xbits)) - 1; + if (level == (gstage_pgd_levels - 1)) + mask = (PTRS_PER_PTE * (1UL << gstage_pgd_xbits)) - 1; else mask = PTRS_PER_PTE - 1; return (addr >> shift) & mask; } -static inline unsigned long stage2_pte_page_vaddr(pte_t pte) +static inline unsigned long gstage_pte_page_vaddr(pte_t pte) { return (unsigned long)pfn_to_virt(pte_val(pte) >> _PAGE_PFN_SHIFT); } -static int stage2_page_size_to_level(unsigned long page_size, u32 *out_level) +static int gstage_page_size_to_level(unsigned long page_size, u32 *out_level) { u32 i; unsigned long psz = 1UL << 12; - for (i = 0; i < stage2_pgd_levels; i++) { - if (page_size == (psz << (i * stage2_index_bits))) { + for (i = 0; i < gstage_pgd_levels; i++) { + if (page_size == (psz << (i * gstage_index_bits))) { *out_level = i; return 0; } @@ -73,27 +72,39 @@ static int stage2_page_size_to_level(unsigned long page_size, u32 *out_level) return -EINVAL; } -static int stage2_level_to_page_size(u32 level, unsigned long *out_pgsize) +static int gstage_level_to_page_order(u32 level, unsigned long *out_pgorder) { - if (stage2_pgd_levels < level) + if (gstage_pgd_levels < level) return -EINVAL; - *out_pgsize = 1UL << (12 + (level * stage2_index_bits)); + *out_pgorder = 12 + (level * gstage_index_bits); + return 0; +} + +static int gstage_level_to_page_size(u32 level, unsigned long *out_pgsize) +{ + int rc; + unsigned long page_order = PAGE_SHIFT; + + rc = gstage_level_to_page_order(level, &page_order); + if (rc) + return rc; + *out_pgsize = BIT(page_order); return 0; } -static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, +static bool gstage_get_leaf_entry(struct kvm *kvm, gpa_t addr, pte_t **ptepp, u32 *ptep_level) { pte_t *ptep; - u32 current_level = stage2_pgd_levels - 1; + u32 current_level = gstage_pgd_levels - 1; *ptep_level = current_level; ptep = (pte_t *)kvm->arch.pgd; - ptep = &ptep[stage2_pte_index(addr, current_level)]; + ptep = &ptep[gstage_pte_index(addr, current_level)]; while (ptep && pte_val(*ptep)) { - if (stage2_pte_leaf(ptep)) { + if (gstage_pte_leaf(ptep)) { *ptep_level = current_level; *ptepp = ptep; return true; @@ -102,8 +113,8 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, if (current_level) { current_level--; *ptep_level = current_level; - ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); - ptep = &ptep[stage2_pte_index(addr, current_level)]; + ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); + ptep = &ptep[gstage_pte_index(addr, current_level)]; } else { ptep = NULL; } @@ -112,38 +123,30 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr, return false; } -static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) +static void gstage_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr) { - unsigned long size = PAGE_SIZE; - struct kvm_vmid *vmid = &kvm->arch.vmid; + unsigned long order = PAGE_SHIFT; - if (stage2_level_to_page_size(level, &size)) + if (gstage_level_to_page_order(level, &order)) return; - addr &= ~(size - 1); + addr &= ~(BIT(order) - 1); - /* - * TODO: Instead of cpu_online_mask, we should only target CPUs - * where the Guest/VM is running. - */ - preempt_disable(); - sbi_remote_hfence_gvma_vmid(cpu_online_mask, addr, size, - READ_ONCE(vmid->vmid)); - preempt_enable(); + kvm_riscv_hfence_gvma_vmid_gpa(kvm, -1UL, 0, addr, BIT(order), order); } -static int stage2_set_pte(struct kvm *kvm, u32 level, +static int gstage_set_pte(struct kvm *kvm, u32 level, struct kvm_mmu_memory_cache *pcache, gpa_t addr, const pte_t *new_pte) { - u32 current_level = stage2_pgd_levels - 1; + u32 current_level = gstage_pgd_levels - 1; pte_t *next_ptep = (pte_t *)kvm->arch.pgd; - pte_t *ptep = &next_ptep[stage2_pte_index(addr, current_level)]; + pte_t *ptep = &next_ptep[gstage_pte_index(addr, current_level)]; if (current_level < level) return -EINVAL; while (current_level != level) { - if (stage2_pte_leaf(ptep)) + if (gstage_pte_leaf(ptep)) return -EEXIST; if (!pte_val(*ptep)) { @@ -155,23 +158,23 @@ static int stage2_set_pte(struct kvm *kvm, u32 level, *ptep = pfn_pte(PFN_DOWN(__pa(next_ptep)), __pgprot(_PAGE_TABLE)); } else { - if (stage2_pte_leaf(ptep)) + if (gstage_pte_leaf(ptep)) return -EEXIST; - next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); + next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); } current_level--; - ptep = &next_ptep[stage2_pte_index(addr, current_level)]; + ptep = &next_ptep[gstage_pte_index(addr, current_level)]; } *ptep = *new_pte; - if (stage2_pte_leaf(ptep)) - stage2_remote_tlb_flush(kvm, current_level, addr); + if (gstage_pte_leaf(ptep)) + gstage_remote_tlb_flush(kvm, current_level, addr); return 0; } -static int stage2_map_page(struct kvm *kvm, +static int gstage_map_page(struct kvm *kvm, struct kvm_mmu_memory_cache *pcache, gpa_t gpa, phys_addr_t hpa, unsigned long page_size, @@ -182,7 +185,7 @@ static int stage2_map_page(struct kvm *kvm, pte_t new_pte; pgprot_t prot; - ret = stage2_page_size_to_level(page_size, &level); + ret = gstage_page_size_to_level(page_size, &level); if (ret) return ret; @@ -193,9 +196,9 @@ static int stage2_map_page(struct kvm *kvm, * PTE so that software can update these bits. * * We support both options mentioned above. To achieve this, we - * always set 'A' and 'D' PTE bits at time of creating stage2 + * always set 'A' and 'D' PTE bits at time of creating G-stage * mapping. To support KVM dirty page logging with both options - * mentioned above, we will write-protect stage2 PTEs to track + * mentioned above, we will write-protect G-stage PTEs to track * dirty pages. */ @@ -213,24 +216,24 @@ static int stage2_map_page(struct kvm *kvm, new_pte = pfn_pte(PFN_DOWN(hpa), prot); new_pte = pte_mkdirty(new_pte); - return stage2_set_pte(kvm, level, pcache, gpa, &new_pte); + return gstage_set_pte(kvm, level, pcache, gpa, &new_pte); } -enum stage2_op { - STAGE2_OP_NOP = 0, /* Nothing */ - STAGE2_OP_CLEAR, /* Clear/Unmap */ - STAGE2_OP_WP, /* Write-protect */ +enum gstage_op { + GSTAGE_OP_NOP = 0, /* Nothing */ + GSTAGE_OP_CLEAR, /* Clear/Unmap */ + GSTAGE_OP_WP, /* Write-protect */ }; -static void stage2_op_pte(struct kvm *kvm, gpa_t addr, - pte_t *ptep, u32 ptep_level, enum stage2_op op) +static void gstage_op_pte(struct kvm *kvm, gpa_t addr, + pte_t *ptep, u32 ptep_level, enum gstage_op op) { int i, ret; pte_t *next_ptep; u32 next_ptep_level; unsigned long next_page_size, page_size; - ret = stage2_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(ptep_level, &page_size); if (ret) return; @@ -239,31 +242,31 @@ static void stage2_op_pte(struct kvm *kvm, gpa_t addr, if (!pte_val(*ptep)) return; - if (ptep_level && !stage2_pte_leaf(ptep)) { - next_ptep = (pte_t *)stage2_pte_page_vaddr(*ptep); + if (ptep_level && !gstage_pte_leaf(ptep)) { + next_ptep = (pte_t *)gstage_pte_page_vaddr(*ptep); next_ptep_level = ptep_level - 1; - ret = stage2_level_to_page_size(next_ptep_level, + ret = gstage_level_to_page_size(next_ptep_level, &next_page_size); if (ret) return; - if (op == STAGE2_OP_CLEAR) + if (op == GSTAGE_OP_CLEAR) set_pte(ptep, __pte(0)); for (i = 0; i < PTRS_PER_PTE; i++) - stage2_op_pte(kvm, addr + i * next_page_size, + gstage_op_pte(kvm, addr + i * next_page_size, &next_ptep[i], next_ptep_level, op); - if (op == STAGE2_OP_CLEAR) + if (op == GSTAGE_OP_CLEAR) put_page(virt_to_page(next_ptep)); } else { - if (op == STAGE2_OP_CLEAR) + if (op == GSTAGE_OP_CLEAR) set_pte(ptep, __pte(0)); - else if (op == STAGE2_OP_WP) + else if (op == GSTAGE_OP_WP) set_pte(ptep, __pte(pte_val(*ptep) & ~_PAGE_WRITE)); - stage2_remote_tlb_flush(kvm, ptep_level, addr); + gstage_remote_tlb_flush(kvm, ptep_level, addr); } } -static void stage2_unmap_range(struct kvm *kvm, gpa_t start, +static void gstage_unmap_range(struct kvm *kvm, gpa_t start, gpa_t size, bool may_block) { int ret; @@ -274,9 +277,9 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start, gpa_t addr = start, end = start + size; while (addr < end) { - found_leaf = stage2_get_leaf_entry(kvm, addr, + found_leaf = gstage_get_leaf_entry(kvm, addr, &ptep, &ptep_level); - ret = stage2_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(ptep_level, &page_size); if (ret) break; @@ -284,8 +287,8 @@ static void stage2_unmap_range(struct kvm *kvm, gpa_t start, goto next; if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - stage2_op_pte(kvm, addr, ptep, - ptep_level, STAGE2_OP_CLEAR); + gstage_op_pte(kvm, addr, ptep, + ptep_level, GSTAGE_OP_CLEAR); next: addr += page_size; @@ -299,7 +302,7 @@ next: } } -static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) +static void gstage_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) { int ret; pte_t *ptep; @@ -309,9 +312,9 @@ static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) unsigned long page_size; while (addr < end) { - found_leaf = stage2_get_leaf_entry(kvm, addr, + found_leaf = gstage_get_leaf_entry(kvm, addr, &ptep, &ptep_level); - ret = stage2_level_to_page_size(ptep_level, &page_size); + ret = gstage_level_to_page_size(ptep_level, &page_size); if (ret) break; @@ -319,15 +322,15 @@ static void stage2_wp_range(struct kvm *kvm, gpa_t start, gpa_t end) goto next; if (!(addr & (page_size - 1)) && ((end - addr) >= page_size)) - stage2_op_pte(kvm, addr, ptep, - ptep_level, STAGE2_OP_WP); + gstage_op_pte(kvm, addr, ptep, + ptep_level, GSTAGE_OP_WP); next: addr += page_size; } } -static void stage2_wp_memory_region(struct kvm *kvm, int slot) +static void gstage_wp_memory_region(struct kvm *kvm, int slot) { struct kvm_memslots *slots = kvm_memslots(kvm); struct kvm_memory_slot *memslot = id_to_memslot(slots, slot); @@ -335,12 +338,12 @@ static void stage2_wp_memory_region(struct kvm *kvm, int slot) phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); - stage2_wp_range(kvm, start, end); + gstage_wp_range(kvm, start, end); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); } -static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, +static int gstage_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, unsigned long size, bool writable) { pte_t pte; @@ -361,12 +364,12 @@ static int stage2_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa, if (!writable) pte = pte_wrprotect(pte); - ret = kvm_mmu_topup_memory_cache(&pcache, stage2_pgd_levels); + ret = kvm_mmu_topup_memory_cache(&pcache, gstage_pgd_levels); if (ret) goto out; spin_lock(&kvm->mmu_lock); - ret = stage2_set_pte(kvm, 0, &pcache, addr, &pte); + ret = gstage_set_pte(kvm, 0, &pcache, addr, &pte); spin_unlock(&kvm->mmu_lock); if (ret) goto out; @@ -388,7 +391,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, phys_addr_t start = (base_gfn + __ffs(mask)) << PAGE_SHIFT; phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT; - stage2_wp_range(kvm, start, end); + gstage_wp_range(kvm, start, end); } void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) @@ -411,7 +414,7 @@ void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen) void kvm_arch_flush_shadow_all(struct kvm *kvm) { - kvm_riscv_stage2_free_pgd(kvm); + kvm_riscv_gstage_free_pgd(kvm); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -421,7 +424,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm, phys_addr_t size = slot->npages << PAGE_SHIFT; spin_lock(&kvm->mmu_lock); - stage2_unmap_range(kvm, gpa, size, false); + gstage_unmap_range(kvm, gpa, size, false); spin_unlock(&kvm->mmu_lock); } @@ -436,7 +439,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * the memory slot is write protected. */ if (change != KVM_MR_DELETE && new->flags & KVM_MEM_LOG_DIRTY_PAGES) - stage2_wp_memory_region(kvm, new->id); + gstage_wp_memory_region(kvm, new->id); } int kvm_arch_prepare_memory_region(struct kvm *kvm, @@ -458,7 +461,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, * space addressable by the KVM guest GPA space. */ if ((new->base_gfn + new->npages) >= - (stage2_gpa_size >> PAGE_SHIFT)) + (gstage_gpa_size >> PAGE_SHIFT)) return -EFAULT; hva = new->userspace_addr; @@ -514,7 +517,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, goto out; } - ret = stage2_ioremap(kvm, gpa, pa, + ret = gstage_ioremap(kvm, gpa, pa, vm_end - vm_start, writable); if (ret) break; @@ -527,7 +530,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, spin_lock(&kvm->mmu_lock); if (ret) - stage2_unmap_range(kvm, base_gpa, size, false); + gstage_unmap_range(kvm, base_gpa, size, false); spin_unlock(&kvm->mmu_lock); out: @@ -540,7 +543,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range) if (!kvm->arch.pgd) return false; - stage2_unmap_range(kvm, range->start << PAGE_SHIFT, + gstage_unmap_range(kvm, range->start << PAGE_SHIFT, (range->end - range->start) << PAGE_SHIFT, range->may_block); return false; @@ -556,10 +559,10 @@ bool kvm_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(range->end - range->start != 1); - ret = stage2_map_page(kvm, NULL, range->start << PAGE_SHIFT, + ret = gstage_map_page(kvm, NULL, range->start << PAGE_SHIFT, __pfn_to_phys(pfn), PAGE_SIZE, true, true); if (ret) { - kvm_debug("Failed to map stage2 page (error %d)\n", ret); + kvm_debug("Failed to map G-stage page (error %d)\n", ret); return true; } @@ -577,7 +580,7 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); - if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; @@ -595,14 +598,14 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range) WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PGDIR_SIZE); - if (!stage2_get_leaf_entry(kvm, range->start << PAGE_SHIFT, + if (!gstage_get_leaf_entry(kvm, range->start << PAGE_SHIFT, &ptep, &ptep_level)) return false; return pte_young(*ptep); } -int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, +int kvm_riscv_gstage_map(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, gpa_t gpa, unsigned long hva, bool is_write) { @@ -648,9 +651,9 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, } /* We need minimum second+third level pages */ - ret = kvm_mmu_topup_memory_cache(pcache, stage2_pgd_levels); + ret = kvm_mmu_topup_memory_cache(pcache, gstage_pgd_levels); if (ret) { - kvm_err("Failed to topup stage2 cache\n"); + kvm_err("Failed to topup G-stage cache\n"); return ret; } @@ -680,15 +683,15 @@ int kvm_riscv_stage2_map(struct kvm_vcpu *vcpu, if (writeable) { kvm_set_pfn_dirty(hfn); mark_page_dirty(kvm, gfn); - ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, + ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, vma_pagesize, false, true); } else { - ret = stage2_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, + ret = gstage_map_page(kvm, pcache, gpa, hfn << PAGE_SHIFT, vma_pagesize, true, true); } if (ret) - kvm_err("Failed to map in stage2\n"); + kvm_err("Failed to map in G-stage\n"); out_unlock: spin_unlock(&kvm->mmu_lock); @@ -697,7 +700,7 @@ out_unlock: return ret; } -int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm) +int kvm_riscv_gstage_alloc_pgd(struct kvm *kvm) { struct page *pgd_page; @@ -707,7 +710,7 @@ int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm) } pgd_page = alloc_pages(GFP_KERNEL | __GFP_ZERO, - get_order(stage2_pgd_size)); + get_order(gstage_pgd_size)); if (!pgd_page) return -ENOMEM; kvm->arch.pgd = page_to_virt(pgd_page); @@ -716,13 +719,13 @@ int kvm_riscv_stage2_alloc_pgd(struct kvm *kvm) return 0; } -void kvm_riscv_stage2_free_pgd(struct kvm *kvm) +void kvm_riscv_gstage_free_pgd(struct kvm *kvm) { void *pgd = NULL; spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { - stage2_unmap_range(kvm, 0UL, stage2_gpa_size, false); + gstage_unmap_range(kvm, 0UL, gstage_gpa_size, false); pgd = READ_ONCE(kvm->arch.pgd); kvm->arch.pgd = NULL; kvm->arch.pgd_phys = 0; @@ -730,12 +733,12 @@ void kvm_riscv_stage2_free_pgd(struct kvm *kvm) spin_unlock(&kvm->mmu_lock); if (pgd) - free_pages((unsigned long)pgd, get_order(stage2_pgd_size)); + free_pages((unsigned long)pgd, get_order(gstage_pgd_size)); } -void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu) +void kvm_riscv_gstage_update_hgatp(struct kvm_vcpu *vcpu) { - unsigned long hgatp = stage2_mode; + unsigned long hgatp = gstage_mode; struct kvm_arch *k = &vcpu->kvm->arch; hgatp |= (READ_ONCE(k->vmid.vmid) << HGATP_VMID_SHIFT) & @@ -744,31 +747,40 @@ void kvm_riscv_stage2_update_hgatp(struct kvm_vcpu *vcpu) csr_write(CSR_HGATP, hgatp); - if (!kvm_riscv_stage2_vmid_bits()) - __kvm_riscv_hfence_gvma_all(); + if (!kvm_riscv_gstage_vmid_bits()) + kvm_riscv_local_hfence_gvma_all(); } -void kvm_riscv_stage2_mode_detect(void) +void kvm_riscv_gstage_mode_detect(void) { #ifdef CONFIG_64BIT - /* Try Sv48x4 stage2 mode */ + /* Try Sv57x4 G-stage mode */ + csr_write(CSR_HGATP, HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); + if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV57X4) { + gstage_mode = (HGATP_MODE_SV57X4 << HGATP_MODE_SHIFT); + gstage_pgd_levels = 5; + goto skip_sv48x4_test; + } + + /* Try Sv48x4 G-stage mode */ csr_write(CSR_HGATP, HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); if ((csr_read(CSR_HGATP) >> HGATP_MODE_SHIFT) == HGATP_MODE_SV48X4) { - stage2_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); - stage2_pgd_levels = 4; + gstage_mode = (HGATP_MODE_SV48X4 << HGATP_MODE_SHIFT); + gstage_pgd_levels = 4; } - csr_write(CSR_HGATP, 0); +skip_sv48x4_test: - __kvm_riscv_hfence_gvma_all(); + csr_write(CSR_HGATP, 0); + kvm_riscv_local_hfence_gvma_all(); #endif } -unsigned long kvm_riscv_stage2_mode(void) +unsigned long kvm_riscv_gstage_mode(void) { - return stage2_mode >> HGATP_MODE_SHIFT; + return gstage_mode >> HGATP_MODE_SHIFT; } -int kvm_riscv_stage2_gpa_bits(void) +int kvm_riscv_gstage_gpa_bits(void) { - return stage2_gpa_bits; + return gstage_gpa_bits; } diff --git a/arch/riscv/kvm/tlb.S b/arch/riscv/kvm/tlb.S deleted file mode 100644 index 899f75d60bad..000000000000 --- a/arch/riscv/kvm/tlb.S +++ /dev/null @@ -1,74 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2019 Western Digital Corporation or its affiliates. - * - * Authors: - * Anup Patel <anup.patel@wdc.com> - */ - -#include <linux/linkage.h> -#include <asm/asm.h> - - .text - .altmacro - .option norelax - - /* - * Instruction encoding of hfence.gvma is: - * HFENCE.GVMA rs1, rs2 - * HFENCE.GVMA zero, rs2 - * HFENCE.GVMA rs1 - * HFENCE.GVMA - * - * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2 - * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2 - * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1 - * rs1==zero and rs2==zero ==> HFENCE.GVMA - * - * Instruction encoding of HFENCE.GVMA is: - * 0110001 rs2(5) rs1(5) 000 00000 1110011 - */ - -ENTRY(__kvm_riscv_hfence_gvma_vmid_gpa) - /* - * rs1 = a0 (GPA >> 2) - * rs2 = a1 (VMID) - * HFENCE.GVMA a0, a1 - * 0110001 01011 01010 000 00000 1110011 - */ - .word 0x62b50073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_vmid_gpa) - -ENTRY(__kvm_riscv_hfence_gvma_vmid) - /* - * rs1 = zero - * rs2 = a0 (VMID) - * HFENCE.GVMA zero, a0 - * 0110001 01010 00000 000 00000 1110011 - */ - .word 0x62a00073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_vmid) - -ENTRY(__kvm_riscv_hfence_gvma_gpa) - /* - * rs1 = a0 (GPA >> 2) - * rs2 = zero - * HFENCE.GVMA a0 - * 0110001 00000 01010 000 00000 1110011 - */ - .word 0x62050073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_gpa) - -ENTRY(__kvm_riscv_hfence_gvma_all) - /* - * rs1 = zero - * rs2 = zero - * HFENCE.GVMA - * 0110001 00000 00000 000 00000 1110011 - */ - .word 0x62000073 - ret -ENDPROC(__kvm_riscv_hfence_gvma_all) diff --git a/arch/riscv/kvm/tlb.c b/arch/riscv/kvm/tlb.c new file mode 100644 index 000000000000..1a76d0b1907d --- /dev/null +++ b/arch/riscv/kvm/tlb.c @@ -0,0 +1,461 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022 Ventana Micro Systems Inc. + */ + +#include <linux/bitmap.h> +#include <linux/cpumask.h> +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/smp.h> +#include <linux/kvm_host.h> +#include <asm/cacheflush.h> +#include <asm/csr.h> + +/* + * Instruction encoding of hfence.gvma is: + * HFENCE.GVMA rs1, rs2 + * HFENCE.GVMA zero, rs2 + * HFENCE.GVMA rs1 + * HFENCE.GVMA + * + * rs1!=zero and rs2!=zero ==> HFENCE.GVMA rs1, rs2 + * rs1==zero and rs2!=zero ==> HFENCE.GVMA zero, rs2 + * rs1!=zero and rs2==zero ==> HFENCE.GVMA rs1 + * rs1==zero and rs2==zero ==> HFENCE.GVMA + * + * Instruction encoding of HFENCE.GVMA is: + * 0110001 rs2(5) rs1(5) 000 00000 1110011 + */ + +void kvm_riscv_local_hfence_gvma_vmid_gpa(unsigned long vmid, + gpa_t gpa, gpa_t gpsz, + unsigned long order) +{ + gpa_t pos; + + if (PTRS_PER_PTE < (gpsz >> order)) { + kvm_riscv_local_hfence_gvma_vmid_all(vmid); + return; + } + + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) { + /* + * rs1 = a0 (GPA >> 2) + * rs2 = a1 (VMID) + * HFENCE.GVMA a0, a1 + * 0110001 01011 01010 000 00000 1110011 + */ + asm volatile ("srli a0, %0, 2\n" + "add a1, %1, zero\n" + ".word 0x62b50073\n" + :: "r" (pos), "r" (vmid) + : "a0", "a1", "memory"); + } +} + +void kvm_riscv_local_hfence_gvma_vmid_all(unsigned long vmid) +{ + /* + * rs1 = zero + * rs2 = a0 (VMID) + * HFENCE.GVMA zero, a0 + * 0110001 01010 00000 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + ".word 0x62a00073\n" + :: "r" (vmid) : "a0", "memory"); +} + +void kvm_riscv_local_hfence_gvma_gpa(gpa_t gpa, gpa_t gpsz, + unsigned long order) +{ + gpa_t pos; + + if (PTRS_PER_PTE < (gpsz >> order)) { + kvm_riscv_local_hfence_gvma_all(); + return; + } + + for (pos = gpa; pos < (gpa + gpsz); pos += BIT(order)) { + /* + * rs1 = a0 (GPA >> 2) + * rs2 = zero + * HFENCE.GVMA a0 + * 0110001 00000 01010 000 00000 1110011 + */ + asm volatile ("srli a0, %0, 2\n" + ".word 0x62050073\n" + :: "r" (pos) : "a0", "memory"); + } +} + +void kvm_riscv_local_hfence_gvma_all(void) +{ + /* + * rs1 = zero + * rs2 = zero + * HFENCE.GVMA + * 0110001 00000 00000 000 00000 1110011 + */ + asm volatile (".word 0x62000073" ::: "memory"); +} + +/* + * Instruction encoding of hfence.gvma is: + * HFENCE.VVMA rs1, rs2 + * HFENCE.VVMA zero, rs2 + * HFENCE.VVMA rs1 + * HFENCE.VVMA + * + * rs1!=zero and rs2!=zero ==> HFENCE.VVMA rs1, rs2 + * rs1==zero and rs2!=zero ==> HFENCE.VVMA zero, rs2 + * rs1!=zero and rs2==zero ==> HFENCE.VVMA rs1 + * rs1==zero and rs2==zero ==> HFENCE.VVMA + * + * Instruction encoding of HFENCE.VVMA is: + * 0010001 rs2(5) rs1(5) 000 00000 1110011 + */ + +void kvm_riscv_local_hfence_vvma_asid_gva(unsigned long vmid, + unsigned long asid, + unsigned long gva, + unsigned long gvsz, + unsigned long order) +{ + unsigned long pos, hgatp; + + if (PTRS_PER_PTE < (gvsz >> order)) { + kvm_riscv_local_hfence_vvma_asid_all(vmid, asid); + return; + } + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) { + /* + * rs1 = a0 (GVA) + * rs2 = a1 (ASID) + * HFENCE.VVMA a0, a1 + * 0010001 01011 01010 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + "add a1, %1, zero\n" + ".word 0x22b50073\n" + :: "r" (pos), "r" (asid) + : "a0", "a1", "memory"); + } + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_hfence_vvma_asid_all(unsigned long vmid, + unsigned long asid) +{ + unsigned long hgatp; + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + /* + * rs1 = zero + * rs2 = a0 (ASID) + * HFENCE.VVMA zero, a0 + * 0010001 01010 00000 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + ".word 0x22a00073\n" + :: "r" (asid) : "a0", "memory"); + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_hfence_vvma_gva(unsigned long vmid, + unsigned long gva, unsigned long gvsz, + unsigned long order) +{ + unsigned long pos, hgatp; + + if (PTRS_PER_PTE < (gvsz >> order)) { + kvm_riscv_local_hfence_vvma_all(vmid); + return; + } + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + for (pos = gva; pos < (gva + gvsz); pos += BIT(order)) { + /* + * rs1 = a0 (GVA) + * rs2 = zero + * HFENCE.VVMA a0 + * 0010001 00000 01010 000 00000 1110011 + */ + asm volatile ("add a0, %0, zero\n" + ".word 0x22050073\n" + :: "r" (pos) : "a0", "memory"); + } + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_hfence_vvma_all(unsigned long vmid) +{ + unsigned long hgatp; + + hgatp = csr_swap(CSR_HGATP, vmid << HGATP_VMID_SHIFT); + + /* + * rs1 = zero + * rs2 = zero + * HFENCE.VVMA + * 0010001 00000 00000 000 00000 1110011 + */ + asm volatile (".word 0x22000073" ::: "memory"); + + csr_write(CSR_HGATP, hgatp); +} + +void kvm_riscv_local_tlb_sanitize(struct kvm_vcpu *vcpu) +{ + unsigned long vmid; + + if (!kvm_riscv_gstage_vmid_bits() || + vcpu->arch.last_exit_cpu == vcpu->cpu) + return; + + /* + * On RISC-V platforms with hardware VMID support, we share same + * VMID for all VCPUs of a particular Guest/VM. This means we might + * have stale G-stage TLB entries on the current Host CPU due to + * some other VCPU of the same Guest which ran previously on the + * current Host CPU. + * + * To cleanup stale TLB entries, we simply flush all G-stage TLB + * entries by VMID whenever underlying Host CPU changes for a VCPU. + */ + + vmid = READ_ONCE(vcpu->kvm->arch.vmid.vmid); + kvm_riscv_local_hfence_gvma_vmid_all(vmid); +} + +void kvm_riscv_fence_i_process(struct kvm_vcpu *vcpu) +{ + local_flush_icache_all(); +} + +void kvm_riscv_hfence_gvma_vmid_all_process(struct kvm_vcpu *vcpu) +{ + struct kvm_vmid *vmid; + + vmid = &vcpu->kvm->arch.vmid; + kvm_riscv_local_hfence_gvma_vmid_all(READ_ONCE(vmid->vmid)); +} + +void kvm_riscv_hfence_vvma_all_process(struct kvm_vcpu *vcpu) +{ + struct kvm_vmid *vmid; + + vmid = &vcpu->kvm->arch.vmid; + kvm_riscv_local_hfence_vvma_all(READ_ONCE(vmid->vmid)); +} + +static bool vcpu_hfence_dequeue(struct kvm_vcpu *vcpu, + struct kvm_riscv_hfence *out_data) +{ + bool ret = false; + struct kvm_vcpu_arch *varch = &vcpu->arch; + + spin_lock(&varch->hfence_lock); + + if (varch->hfence_queue[varch->hfence_head].type) { + memcpy(out_data, &varch->hfence_queue[varch->hfence_head], + sizeof(*out_data)); + varch->hfence_queue[varch->hfence_head].type = 0; + + varch->hfence_head++; + if (varch->hfence_head == KVM_RISCV_VCPU_MAX_HFENCE) + varch->hfence_head = 0; + + ret = true; + } + + spin_unlock(&varch->hfence_lock); + + return ret; +} + +static bool vcpu_hfence_enqueue(struct kvm_vcpu *vcpu, + const struct kvm_riscv_hfence *data) +{ + bool ret = false; + struct kvm_vcpu_arch *varch = &vcpu->arch; + + spin_lock(&varch->hfence_lock); + + if (!varch->hfence_queue[varch->hfence_tail].type) { + memcpy(&varch->hfence_queue[varch->hfence_tail], + data, sizeof(*data)); + + varch->hfence_tail++; + if (varch->hfence_tail == KVM_RISCV_VCPU_MAX_HFENCE) + varch->hfence_tail = 0; + + ret = true; + } + + spin_unlock(&varch->hfence_lock); + + return ret; +} + +void kvm_riscv_hfence_process(struct kvm_vcpu *vcpu) +{ + struct kvm_riscv_hfence d = { 0 }; + struct kvm_vmid *v = &vcpu->kvm->arch.vmid; + + while (vcpu_hfence_dequeue(vcpu, &d)) { + switch (d.type) { + case KVM_RISCV_HFENCE_UNKNOWN: + break; + case KVM_RISCV_HFENCE_GVMA_VMID_GPA: + kvm_riscv_local_hfence_gvma_vmid_gpa( + READ_ONCE(v->vmid), + d.addr, d.size, d.order); + break; + case KVM_RISCV_HFENCE_VVMA_ASID_GVA: + kvm_riscv_local_hfence_vvma_asid_gva( + READ_ONCE(v->vmid), d.asid, + d.addr, d.size, d.order); + break; + case KVM_RISCV_HFENCE_VVMA_ASID_ALL: + kvm_riscv_local_hfence_vvma_asid_all( + READ_ONCE(v->vmid), d.asid); + break; + case KVM_RISCV_HFENCE_VVMA_GVA: + kvm_riscv_local_hfence_vvma_gva( + READ_ONCE(v->vmid), + d.addr, d.size, d.order); + break; + default: + break; + } + } +} + +static void make_xfence_request(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned int req, unsigned int fallback_req, + const struct kvm_riscv_hfence *data) +{ + unsigned long i; + struct kvm_vcpu *vcpu; + unsigned int actual_req = req; + DECLARE_BITMAP(vcpu_mask, KVM_MAX_VCPUS); + + bitmap_clear(vcpu_mask, 0, KVM_MAX_VCPUS); + kvm_for_each_vcpu(i, vcpu, kvm) { + if (hbase != -1UL) { + if (vcpu->vcpu_id < hbase) + continue; + if (!(hmask & (1UL << (vcpu->vcpu_id - hbase)))) + continue; + } + + bitmap_set(vcpu_mask, i, 1); + + if (!data || !data->type) + continue; + + /* + * Enqueue hfence data to VCPU hfence queue. If we don't + * have space in the VCPU hfence queue then fallback to + * a more conservative hfence request. + */ + if (!vcpu_hfence_enqueue(vcpu, data)) + actual_req = fallback_req; + } + + kvm_make_vcpus_request_mask(kvm, actual_req, vcpu_mask); +} + +void kvm_riscv_fence_i(struct kvm *kvm, + unsigned long hbase, unsigned long hmask) +{ + make_xfence_request(kvm, hbase, hmask, KVM_REQ_FENCE_I, + KVM_REQ_FENCE_I, NULL); +} + +void kvm_riscv_hfence_gvma_vmid_gpa(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + gpa_t gpa, gpa_t gpsz, + unsigned long order) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_GVMA_VMID_GPA; + data.asid = 0; + data.addr = gpa; + data.size = gpsz; + data.order = order; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_GVMA_VMID_ALL, &data); +} + +void kvm_riscv_hfence_gvma_vmid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask) +{ + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_GVMA_VMID_ALL, + KVM_REQ_HFENCE_GVMA_VMID_ALL, NULL); +} + +void kvm_riscv_hfence_vvma_asid_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order, unsigned long asid) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_VVMA_ASID_GVA; + data.asid = asid; + data.addr = gva; + data.size = gvsz; + data.order = order; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); +} + +void kvm_riscv_hfence_vvma_asid_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long asid) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_VVMA_ASID_ALL; + data.asid = asid; + data.addr = data.size = data.order = 0; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); +} + +void kvm_riscv_hfence_vvma_gva(struct kvm *kvm, + unsigned long hbase, unsigned long hmask, + unsigned long gva, unsigned long gvsz, + unsigned long order) +{ + struct kvm_riscv_hfence data; + + data.type = KVM_RISCV_HFENCE_VVMA_GVA; + data.asid = 0; + data.addr = gva; + data.size = gvsz; + data.order = order; + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE, + KVM_REQ_HFENCE_VVMA_ALL, &data); +} + +void kvm_riscv_hfence_vvma_all(struct kvm *kvm, + unsigned long hbase, unsigned long hmask) +{ + make_xfence_request(kvm, hbase, hmask, KVM_REQ_HFENCE_VVMA_ALL, + KVM_REQ_HFENCE_VVMA_ALL, NULL); +} diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c index 7461f964d20a..7f4ad5e4373a 100644 --- a/arch/riscv/kvm/vcpu.c +++ b/arch/riscv/kvm/vcpu.c @@ -67,6 +67,8 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) if (loaded) kvm_arch_vcpu_put(vcpu); + vcpu->arch.last_exit_cpu = -1; + memcpy(csr, reset_csr, sizeof(*csr)); memcpy(cntx, reset_cntx, sizeof(*cntx)); @@ -78,6 +80,10 @@ static void kvm_riscv_reset_vcpu(struct kvm_vcpu *vcpu) WRITE_ONCE(vcpu->arch.irqs_pending, 0); WRITE_ONCE(vcpu->arch.irqs_pending_mask, 0); + vcpu->arch.hfence_head = 0; + vcpu->arch.hfence_tail = 0; + memset(vcpu->arch.hfence_queue, 0, sizeof(vcpu->arch.hfence_queue)); + /* Reset the guest CSRs for hotplug usecase */ if (loaded) kvm_arch_vcpu_load(vcpu, smp_processor_id()); @@ -101,6 +107,9 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) /* Setup ISA features available to VCPU */ vcpu->arch.isa = riscv_isa_extension_base(NULL) & KVM_RISCV_ISA_ALLOWED; + /* Setup VCPU hfence queue */ + spin_lock_init(&vcpu->arch.hfence_lock); + /* Setup reset state of shadow SSTATUS and HSTATUS CSRs */ cntx = &vcpu->arch.guest_reset_context; cntx->sstatus = SR_SPP | SR_SPIE; @@ -137,7 +146,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) /* Cleanup VCPU timer */ kvm_riscv_vcpu_timer_deinit(vcpu); - /* Free unused pages pre-allocated for Stage2 page table mappings */ + /* Free unused pages pre-allocated for G-stage page table mappings */ kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache); } @@ -365,6 +374,101 @@ static int kvm_riscv_vcpu_set_reg_csr(struct kvm_vcpu *vcpu, return 0; } +/* Mapping between KVM ISA Extension ID & Host ISA extension ID */ +static unsigned long kvm_isa_ext_arr[] = { + RISCV_ISA_EXT_a, + RISCV_ISA_EXT_c, + RISCV_ISA_EXT_d, + RISCV_ISA_EXT_f, + RISCV_ISA_EXT_h, + RISCV_ISA_EXT_i, + RISCV_ISA_EXT_m, +}; + +static int kvm_riscv_vcpu_get_reg_isa_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_ISA_EXT); + unsigned long reg_val = 0; + unsigned long host_isa_ext; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) + return -EINVAL; + + host_isa_ext = kvm_isa_ext_arr[reg_num]; + if (__riscv_isa_extension_available(&vcpu->arch.isa, host_isa_ext)) + reg_val = 1; /* Mark the given extension as available */ + + if (copy_to_user(uaddr, ®_val, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + return 0; +} + +static int kvm_riscv_vcpu_set_reg_isa_ext(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg) +{ + unsigned long __user *uaddr = + (unsigned long __user *)(unsigned long)reg->addr; + unsigned long reg_num = reg->id & ~(KVM_REG_ARCH_MASK | + KVM_REG_SIZE_MASK | + KVM_REG_RISCV_ISA_EXT); + unsigned long reg_val; + unsigned long host_isa_ext; + unsigned long host_isa_ext_mask; + + if (KVM_REG_SIZE(reg->id) != sizeof(unsigned long)) + return -EINVAL; + + if (reg_num >= KVM_RISCV_ISA_EXT_MAX || reg_num >= ARRAY_SIZE(kvm_isa_ext_arr)) + return -EINVAL; + + if (copy_from_user(®_val, uaddr, KVM_REG_SIZE(reg->id))) + return -EFAULT; + + host_isa_ext = kvm_isa_ext_arr[reg_num]; + if (!__riscv_isa_extension_available(NULL, host_isa_ext)) + return -EOPNOTSUPP; + + if (host_isa_ext >= RISCV_ISA_EXT_BASE && + host_isa_ext < RISCV_ISA_EXT_MAX) { + /* + * Multi-letter ISA extension. Currently there is no provision + * to enable/disable the multi-letter ISA extensions for guests. + * Return success if the request is to enable any ISA extension + * that is available in the hardware. + * Return -EOPNOTSUPP otherwise. + */ + if (!reg_val) + return -EOPNOTSUPP; + else + return 0; + } + + /* Single letter base ISA extension */ + if (!vcpu->arch.ran_atleast_once) { + host_isa_ext_mask = BIT_MASK(host_isa_ext); + if (!reg_val && (host_isa_ext_mask & KVM_RISCV_ISA_DISABLE_ALLOWED)) + vcpu->arch.isa &= ~host_isa_ext_mask; + else + vcpu->arch.isa |= host_isa_ext_mask; + vcpu->arch.isa &= riscv_isa_extension_base(NULL); + vcpu->arch.isa &= KVM_RISCV_ISA_ALLOWED; + kvm_riscv_vcpu_fp_reset(vcpu); + } else { + return -EOPNOTSUPP; + } + + return 0; +} + static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg) { @@ -382,6 +486,8 @@ static int kvm_riscv_vcpu_set_reg(struct kvm_vcpu *vcpu, else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D) return kvm_riscv_vcpu_set_reg_fp(vcpu, reg, KVM_REG_RISCV_FP_D); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_ISA_EXT) + return kvm_riscv_vcpu_set_reg_isa_ext(vcpu, reg); return -EINVAL; } @@ -403,6 +509,8 @@ static int kvm_riscv_vcpu_get_reg(struct kvm_vcpu *vcpu, else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_FP_D) return kvm_riscv_vcpu_get_reg_fp(vcpu, reg, KVM_REG_RISCV_FP_D); + else if ((reg->id & KVM_REG_RISCV_TYPE_MASK) == KVM_REG_RISCV_ISA_EXT) + return kvm_riscv_vcpu_get_reg_isa_ext(vcpu, reg); return -EINVAL; } @@ -635,7 +743,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) csr_write(CSR_HVIP, csr->hvip); csr_write(CSR_VSATP, csr->vsatp); - kvm_riscv_stage2_update_hgatp(vcpu); + kvm_riscv_gstage_update_hgatp(vcpu); kvm_riscv_vcpu_timer_restore(vcpu); @@ -690,10 +798,23 @@ static void kvm_riscv_check_vcpu_requests(struct kvm_vcpu *vcpu) kvm_riscv_reset_vcpu(vcpu); if (kvm_check_request(KVM_REQ_UPDATE_HGATP, vcpu)) - kvm_riscv_stage2_update_hgatp(vcpu); + kvm_riscv_gstage_update_hgatp(vcpu); - if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) - __kvm_riscv_hfence_gvma_all(); + if (kvm_check_request(KVM_REQ_FENCE_I, vcpu)) + kvm_riscv_fence_i_process(vcpu); + + /* + * The generic KVM_REQ_TLB_FLUSH is same as + * KVM_REQ_HFENCE_GVMA_VMID_ALL + */ + if (kvm_check_request(KVM_REQ_HFENCE_GVMA_VMID_ALL, vcpu)) + kvm_riscv_hfence_gvma_vmid_all_process(vcpu); + + if (kvm_check_request(KVM_REQ_HFENCE_VVMA_ALL, vcpu)) + kvm_riscv_hfence_vvma_all_process(vcpu); + + if (kvm_check_request(KVM_REQ_HFENCE, vcpu)) + kvm_riscv_hfence_process(vcpu); } } @@ -715,6 +836,7 @@ static void noinstr kvm_riscv_vcpu_enter_exit(struct kvm_vcpu *vcpu) { guest_state_enter_irqoff(); __kvm_riscv_switch_to(&vcpu->arch); + vcpu->arch.last_exit_cpu = vcpu->cpu; guest_state_exit_irqoff(); } @@ -762,7 +884,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Check conditions before entering the guest */ cond_resched(); - kvm_riscv_stage2_vmid_update(vcpu); + kvm_riscv_gstage_vmid_update(vcpu); kvm_riscv_check_vcpu_requests(vcpu); @@ -800,7 +922,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_riscv_update_hvip(vcpu); if (ret <= 0 || - kvm_riscv_stage2_vmid_ver_changed(&vcpu->kvm->arch.vmid) || + kvm_riscv_gstage_vmid_ver_changed(&vcpu->kvm->arch.vmid) || kvm_request_pending(vcpu)) { vcpu->mode = OUTSIDE_GUEST_MODE; local_irq_enable(); @@ -809,6 +931,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) continue; } + /* + * Cleanup stale TLB enteries + * + * Note: This should be done after G-stage VMID has been + * updated using kvm_riscv_gstage_vmid_ver_changed() + */ + kvm_riscv_local_tlb_sanitize(vcpu); + guest_timing_enter_irqoff(); kvm_riscv_vcpu_enter_exit(vcpu); diff --git a/arch/riscv/kvm/vcpu_exit.c b/arch/riscv/kvm/vcpu_exit.c index a72c15d4b42a..dbb09afd7546 100644 --- a/arch/riscv/kvm/vcpu_exit.c +++ b/arch/riscv/kvm/vcpu_exit.c @@ -412,7 +412,7 @@ static int emulate_store(struct kvm_vcpu *vcpu, struct kvm_run *run, return 0; } -static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, +static int gstage_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, struct kvm_cpu_trap *trap) { struct kvm_memory_slot *memslot; @@ -440,7 +440,7 @@ static int stage2_page_fault(struct kvm_vcpu *vcpu, struct kvm_run *run, }; } - ret = kvm_riscv_stage2_map(vcpu, memslot, fault_addr, hva, + ret = kvm_riscv_gstage_map(vcpu, memslot, fault_addr, hva, (trap->scause == EXC_STORE_GUEST_PAGE_FAULT) ? true : false); if (ret < 0) return ret; @@ -686,7 +686,7 @@ int kvm_riscv_vcpu_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case EXC_LOAD_GUEST_PAGE_FAULT: case EXC_STORE_GUEST_PAGE_FAULT: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) - ret = stage2_page_fault(vcpu, run, trap); + ret = gstage_page_fault(vcpu, run, trap); break; case EXC_SUPERVISOR_SYSCALL: if (vcpu->arch.guest_context.hstatus & HSTATUS_SPV) diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c index 0f217365c287..4c034d8a606a 100644 --- a/arch/riscv/kvm/vcpu_sbi_replace.c +++ b/arch/riscv/kvm/vcpu_sbi_replace.c @@ -81,43 +81,41 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run struct kvm_cpu_trap *utrap, bool *exit) { int ret = 0; - unsigned long i; - struct cpumask cm; - struct kvm_vcpu *tmp; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; unsigned long hmask = cp->a0; unsigned long hbase = cp->a1; unsigned long funcid = cp->a6; - cpumask_clear(&cm); - kvm_for_each_vcpu(i, tmp, vcpu->kvm) { - if (hbase != -1UL) { - if (tmp->vcpu_id < hbase) - continue; - if (!(hmask & (1UL << (tmp->vcpu_id - hbase)))) - continue; - } - if (tmp->cpu < 0) - continue; - cpumask_set_cpu(tmp->cpu, &cm); - } - switch (funcid) { case SBI_EXT_RFENCE_REMOTE_FENCE_I: - ret = sbi_remote_fence_i(&cm); + kvm_riscv_fence_i(vcpu->kvm, hbase, hmask); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA: - ret = sbi_remote_hfence_vvma(&cm, cp->a2, cp->a3); + if (cp->a2 == 0 && cp->a3 == 0) + kvm_riscv_hfence_vvma_all(vcpu->kvm, hbase, hmask); + else + kvm_riscv_hfence_vvma_gva(vcpu->kvm, hbase, hmask, + cp->a2, cp->a3, PAGE_SHIFT); break; case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID: - ret = sbi_remote_hfence_vvma_asid(&cm, cp->a2, - cp->a3, cp->a4); + if (cp->a2 == 0 && cp->a3 == 0) + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, + hbase, hmask, cp->a4); + else + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, + hbase, hmask, + cp->a2, cp->a3, + PAGE_SHIFT, cp->a4); break; case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA: case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID: case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA: case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID: - /* TODO: implement for nested hypervisor case */ + /* + * Until nested virtualization is implemented, the + * SBI HFENCE calls should be treated as NOPs + */ + break; default: ret = -EOPNOTSUPP; } diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c index da4d6c99c2cf..8a91a14e7139 100644 --- a/arch/riscv/kvm/vcpu_sbi_v01.c +++ b/arch/riscv/kvm/vcpu_sbi_v01.c @@ -23,7 +23,6 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, int i, ret = 0; u64 next_cycle; struct kvm_vcpu *rvcpu; - struct cpumask cm; struct kvm *kvm = vcpu->kvm; struct kvm_cpu_context *cp = &vcpu->arch.guest_context; @@ -80,19 +79,29 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run, if (utrap->scause) break; - cpumask_clear(&cm); - for_each_set_bit(i, &hmask, BITS_PER_LONG) { - rvcpu = kvm_get_vcpu_by_id(vcpu->kvm, i); - if (rvcpu->cpu < 0) - continue; - cpumask_set_cpu(rvcpu->cpu, &cm); - } if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I) - ret = sbi_remote_fence_i(&cm); - else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) - ret = sbi_remote_hfence_vvma(&cm, cp->a1, cp->a2); - else - ret = sbi_remote_hfence_vvma_asid(&cm, cp->a1, cp->a2, cp->a3); + kvm_riscv_fence_i(vcpu->kvm, 0, hmask); + else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA) { + if (cp->a1 == 0 && cp->a2 == 0) + kvm_riscv_hfence_vvma_all(vcpu->kvm, + 0, hmask); + else + kvm_riscv_hfence_vvma_gva(vcpu->kvm, + 0, hmask, + cp->a1, cp->a2, + PAGE_SHIFT); + } else { + if (cp->a1 == 0 && cp->a2 == 0) + kvm_riscv_hfence_vvma_asid_all(vcpu->kvm, + 0, hmask, + cp->a3); + else + kvm_riscv_hfence_vvma_asid_gva(vcpu->kvm, + 0, hmask, + cp->a1, cp->a2, + PAGE_SHIFT, + cp->a3); + } break; default: ret = -EINVAL; diff --git a/arch/riscv/kvm/vm.c b/arch/riscv/kvm/vm.c index c768f75279ef..945a2bf5e3f6 100644 --- a/arch/riscv/kvm/vm.c +++ b/arch/riscv/kvm/vm.c @@ -31,13 +31,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { int r; - r = kvm_riscv_stage2_alloc_pgd(kvm); + r = kvm_riscv_gstage_alloc_pgd(kvm); if (r) return r; - r = kvm_riscv_stage2_vmid_init(kvm); + r = kvm_riscv_gstage_vmid_init(kvm); if (r) { - kvm_riscv_stage2_free_pgd(kvm); + kvm_riscv_gstage_free_pgd(kvm); return r; } @@ -75,7 +75,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = KVM_USER_MEM_SLOTS; break; case KVM_CAP_VM_GPA_BITS: - r = kvm_riscv_stage2_gpa_bits(); + r = kvm_riscv_gstage_gpa_bits(); break; default: r = 0; diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c index 2fa4f7b1813d..9f764df125db 100644 --- a/arch/riscv/kvm/vmid.c +++ b/arch/riscv/kvm/vmid.c @@ -11,16 +11,16 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/smp.h> #include <linux/kvm_host.h> #include <asm/csr.h> -#include <asm/sbi.h> static unsigned long vmid_version = 1; static unsigned long vmid_next; static unsigned long vmid_bits; static DEFINE_SPINLOCK(vmid_lock); -void kvm_riscv_stage2_vmid_detect(void) +void kvm_riscv_gstage_vmid_detect(void) { unsigned long old; @@ -33,19 +33,19 @@ void kvm_riscv_stage2_vmid_detect(void) csr_write(CSR_HGATP, old); /* We polluted local TLB so flush all guest TLB */ - __kvm_riscv_hfence_gvma_all(); + kvm_riscv_local_hfence_gvma_all(); /* We don't use VMID bits if they are not sufficient */ if ((1UL << vmid_bits) < num_possible_cpus()) vmid_bits = 0; } -unsigned long kvm_riscv_stage2_vmid_bits(void) +unsigned long kvm_riscv_gstage_vmid_bits(void) { return vmid_bits; } -int kvm_riscv_stage2_vmid_init(struct kvm *kvm) +int kvm_riscv_gstage_vmid_init(struct kvm *kvm) { /* Mark the initial VMID and VMID version invalid */ kvm->arch.vmid.vmid_version = 0; @@ -54,7 +54,7 @@ int kvm_riscv_stage2_vmid_init(struct kvm *kvm) return 0; } -bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid) +bool kvm_riscv_gstage_vmid_ver_changed(struct kvm_vmid *vmid) { if (!vmid_bits) return false; @@ -63,13 +63,18 @@ bool kvm_riscv_stage2_vmid_ver_changed(struct kvm_vmid *vmid) READ_ONCE(vmid_version)); } -void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) +static void __local_hfence_gvma_all(void *info) +{ + kvm_riscv_local_hfence_gvma_all(); +} + +void kvm_riscv_gstage_vmid_update(struct kvm_vcpu *vcpu) { unsigned long i; struct kvm_vcpu *v; struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid; - if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) + if (!kvm_riscv_gstage_vmid_ver_changed(vmid)) return; spin_lock(&vmid_lock); @@ -78,7 +83,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) * We need to re-check the vmid_version here to ensure that if * another vcpu already allocated a valid vmid for this vm. */ - if (!kvm_riscv_stage2_vmid_ver_changed(vmid)) { + if (!kvm_riscv_gstage_vmid_ver_changed(vmid)) { spin_unlock(&vmid_lock); return; } @@ -96,12 +101,13 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) * instances is invalid and we have force VMID re-assignement * for all Guest instances. The Guest instances that were not * running will automatically pick-up new VMIDs because will - * call kvm_riscv_stage2_vmid_update() whenever they enter + * call kvm_riscv_gstage_vmid_update() whenever they enter * in-kernel run loop. For Guest instances that are already * running, we force VM exits on all host CPUs using IPI and * flush all Guest TLBs. */ - sbi_remote_hfence_gvma(cpu_online_mask, 0, 0); + on_each_cpu_mask(cpu_online_mask, __local_hfence_gvma_all, + NULL, 1); } vmid->vmid = vmid_next; @@ -112,7 +118,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu) spin_unlock(&vmid_lock); - /* Request stage2 page table update for all VCPUs */ + /* Request G-stage page table update for all VCPUs */ kvm_for_each_vcpu(i, v, vcpu->kvm) kvm_make_request(KVM_REQ_UPDATE_HGATP, v); } diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index b0793dc0c291..05ed641a1134 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -208,8 +208,25 @@ static void __init setup_bootmem(void) * early_init_fdt_reserve_self() since __pa() does * not work for DTB pointers that are fixmap addresses */ - if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) - memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + if (!IS_ENABLED(CONFIG_BUILTIN_DTB)) { + /* + * In case the DTB is not located in a memory region we won't + * be able to locate it later on via the linear mapping and + * get a segfault when accessing it via __va(dtb_early_pa). + * To avoid this situation copy DTB to a memory region. + * Note that memblock_phys_alloc will also reserve DTB region. + */ + if (!memblock_is_memory(dtb_early_pa)) { + size_t fdt_size = fdt_totalsize(dtb_early_va); + phys_addr_t new_dtb_early_pa = memblock_phys_alloc(fdt_size, PAGE_SIZE); + void *new_dtb_early_va = early_memremap(new_dtb_early_pa, fdt_size); + + memcpy(new_dtb_early_va, dtb_early_va, fdt_size); + early_memunmap(new_dtb_early_va, fdt_size); + _dtb_early_pa = new_dtb_early_pa; + } else + memblock_reserve(dtb_early_pa, fdt_totalsize(dtb_early_va)); + } early_init_fdt_scan_reserved_mem(); dma_contiguous_reserve(dma32_phys_limit); diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e441b60b1812..df325eacf62d 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -30,6 +30,16 @@ KBUILD_CFLAGS_DECOMPRESSOR += -fno-stack-protector KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),-g) KBUILD_CFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO_DWARF4), $(call cc-option, -gdwarf-4,)) + +ifdef CONFIG_CC_IS_GCC + ifeq ($(call cc-ifversion, -ge, 1200, y), y) + ifeq ($(call cc-ifversion, -lt, 1300, y), y) + KBUILD_CFLAGS += $(call cc-disable-warning, array-bounds) + KBUILD_CFLAGS_DECOMPRESSOR += $(call cc-disable-warning, array-bounds) + endif + endif +endif + UTS_MACHINE := s390x STACK_SIZE := $(if $(CONFIG_KASAN),65536,16384) CHECKFLAGS += -D__s390__ -D__s390x__ diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index da3dabda1a12..76ad6408cb2c 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -2384,7 +2384,16 @@ static int kvm_s390_vm_mem_op(struct kvm *kvm, struct kvm_s390_mem_op *mop) return -EINVAL; if (mop->size > MEM_OP_MAX_SIZE) return -E2BIG; - if (kvm_s390_pv_is_protected(kvm)) + /* + * This is technically a heuristic only, if the kvm->lock is not + * taken, it is not guaranteed that the vm is/remains non-protected. + * This is ok from a kernel perspective, wrongdoing is detected + * on the access, -EFAULT is returned and the vm may crash the + * next time it accesses the memory in question. + * There is no sane usecase to do switching and a memop on two + * different CPUs at the same time. + */ + if (kvm_s390_pv_get_handle(kvm)) return -EINVAL; if (mop->flags & KVM_S390_MEMOP_F_SKEY_PROTECTION) { if (access_key_invalid(mop->key)) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index af03cacf34ec..1ac73917a8d3 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1183,6 +1183,7 @@ EXPORT_SYMBOL_GPL(gmap_read_table); static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, struct gmap_rmap *rmap) { + struct gmap_rmap *temp; void __rcu **slot; BUG_ON(!gmap_is_shadow(sg)); @@ -1190,6 +1191,12 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); + for (temp = rmap->next; temp; temp = temp->next) { + if (temp->raddr == rmap->raddr) { + kfree(rmap); + return; + } + } radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index c049561f373a..e28ab0ecc537 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -41,17 +41,7 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* - * Track whether the kernel is using the FPU state - * currently. - * - * This flag is used: - * - * - by IRQ context code to potentially use the FPU - * if it's unused. - * - * - to debug kernel_fpu_begin()/end() correctness - */ +/* Track in-kernel FPU usage */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* @@ -59,42 +49,37 @@ static DEFINE_PER_CPU(bool, in_kernel_fpu); */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); -static bool kernel_fpu_disabled(void) -{ - return this_cpu_read(in_kernel_fpu); -} - -static bool interrupted_kernel_fpu_idle(void) -{ - return !kernel_fpu_disabled(); -} - -/* - * Were we in user mode (or vm86 mode) when we were - * interrupted? - * - * Doing kernel_fpu_begin/end() is ok if we are running - * in an interrupt context from user mode - we'll just - * save the FPU state as required. - */ -static bool interrupted_user_mode(void) -{ - struct pt_regs *regs = get_irq_regs(); - return regs && user_mode(regs); -} - /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? - * - * It's always ok in process context (ie "not interrupt") - * but it is sometimes ok even from an irq. */ bool irq_fpu_usable(void) { - return !in_interrupt() || - interrupted_user_mode() || - interrupted_kernel_fpu_idle(); + if (WARN_ON_ONCE(in_nmi())) + return false; + + /* In kernel FPU usage already active? */ + if (this_cpu_read(in_kernel_fpu)) + return false; + + /* + * When not in NMI or hard interrupt context, FPU can be used in: + * + * - Task context except from within fpregs_lock()'ed critical + * regions. + * + * - Soft interrupt processing context which cannot happen + * while in a fpregs_lock()'ed critical region. + */ + if (!in_hardirq()) + return true; + + /* + * In hard interrupt context it's safe when soft interrupts + * are enabled, which means the interrupt did not hit in + * a fpregs_lock()'ed critical region. + */ + return !softirq_count(); } EXPORT_SYMBOL(irq_fpu_usable); diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 94d62c9958b9..655770522471 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1594,24 +1594,51 @@ static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) atomic_set_release(&src_sev->migration_in_progress, 0); } +/* vCPU mutex subclasses. */ +enum sev_migration_role { + SEV_MIGRATION_SOURCE = 0, + SEV_MIGRATION_TARGET, + SEV_NR_MIGRATION_ROLES, +}; -static int sev_lock_vcpus_for_migration(struct kvm *kvm) +static int sev_lock_vcpus_for_migration(struct kvm *kvm, + enum sev_migration_role role) { struct kvm_vcpu *vcpu; unsigned long i, j; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { - if (mutex_lock_killable(&vcpu->mutex)) + if (mutex_lock_killable_nested(&vcpu->mutex, role)) goto out_unlock; + + if (first) { + /* + * Reset the role to one that avoids colliding with + * the role used for the first vcpu mutex. + */ + role = SEV_NR_MIGRATION_ROLES; + first = false; + } else { + mutex_release(&vcpu->mutex.dep_map, _THIS_IP_); + } } return 0; out_unlock: + + first = true; kvm_for_each_vcpu(j, vcpu, kvm) { if (i == j) break; + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, role, 0, _THIS_IP_); + + mutex_unlock(&vcpu->mutex); } return -EINTR; @@ -1621,8 +1648,15 @@ static void sev_unlock_vcpus_for_migration(struct kvm *kvm) { struct kvm_vcpu *vcpu; unsigned long i; + bool first = true; kvm_for_each_vcpu(i, vcpu, kvm) { + if (first) + first = false; + else + mutex_acquire(&vcpu->mutex.dep_map, + SEV_NR_MIGRATION_ROLES, 0, _THIS_IP_); + mutex_unlock(&vcpu->mutex); } } @@ -1748,10 +1782,10 @@ int sev_vm_move_enc_context_from(struct kvm *kvm, unsigned int source_fd) charged = true; } - ret = sev_lock_vcpus_for_migration(kvm); + ret = sev_lock_vcpus_for_migration(kvm, SEV_MIGRATION_SOURCE); if (ret) goto out_dst_cgroup; - ret = sev_lock_vcpus_for_migration(source_kvm); + ret = sev_lock_vcpus_for_migration(source_kvm, SEV_MIGRATION_TARGET); if (ret) goto out_dst_vcpu; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index cbbcf97d9e66..3e271495181e 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5470,7 +5470,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); return vmx->emulation_required && !vmx->rmode.vm86_active && - vcpu->arch.exception.pending; + (vcpu->arch.exception.pending || vcpu->arch.exception.injected); } static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 96d34ebb20a9..e2942335d143 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -902,6 +902,8 @@ static void __meminit vmemmap_use_sub_pmd(unsigned long start, unsigned long end static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) { + const unsigned long page = ALIGN_DOWN(start, PMD_SIZE); + vmemmap_flush_unused_pmd(); /* @@ -914,8 +916,7 @@ static void __meminit vmemmap_use_new_sub_pmd(unsigned long start, unsigned long * Mark with PAGE_UNUSED the unused parts of the new memmap range */ if (!IS_ALIGNED(start, PMD_SIZE)) - memset((void *)start, PAGE_UNUSED, - start - ALIGN_DOWN(start, PMD_SIZE)); + memset((void *)page, PAGE_UNUSED, start - page); /* * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of |