summaryrefslogtreecommitdiffstats
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/alpha/include/asm/bitops.h2
-rw-r--r--arch/alpha/include/uapi/asm/socket.h2
-rw-r--r--arch/alpha/kernel/rtc.c7
-rw-r--r--arch/alpha/kernel/srm_env.c4
-rw-r--r--arch/arc/Kconfig1
-rw-r--r--arch/arc/include/asm/bitops.h1
-rw-r--r--arch/arm/Kconfig1
-rw-r--r--arch/arm/configs/bcm2835_defconfig1
-rw-r--r--arch/arm/configs/qcom_defconfig1
-rw-r--r--arch/arm/include/asm/assembler.h2
-rw-r--r--arch/arm/include/asm/bitops.h1
-rw-r--r--arch/arm/include/asm/processor.h1
-rw-r--r--arch/arm/include/asm/uaccess.h10
-rw-r--r--arch/arm/kernel/atags_proc.c2
-rw-r--r--arch/arm/mm/alignment.c2
-rw-r--r--arch/arm/probes/kprobes/Makefile3
-rw-r--r--arch/arm64/Kconfig82
-rw-r--r--arch/arm64/boot/dts/xilinx/zynqmp.dtsi8
-rw-r--r--arch/arm64/include/asm/atomic_lse.h2
-rw-r--r--arch/arm64/include/asm/bitops.h1
-rw-r--r--arch/arm64/include/asm/cmpxchg.h2
-rw-r--r--arch/arm64/include/asm/cputype.h4
-rw-r--r--arch/arm64/kernel/cpu_errata.c29
-rw-r--r--arch/arm64/kernel/cpufeature.c3
-rw-r--r--arch/arm64/kernel/stacktrace.c5
-rw-r--r--arch/arm64/kernel/vdso/Makefile5
-rw-r--r--arch/arm64/kvm/hyp/exception.c5
-rw-r--r--arch/arm64/kvm/hyp/pgtable.c18
-rw-r--r--arch/arm64/kvm/hyp/vgic-v3-sr.c3
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c17
-rw-r--r--arch/arm64/mm/extable.c4
-rw-r--r--arch/arm64/mm/init.c2
-rw-r--r--arch/arm64/tools/cpucaps3
-rw-r--r--arch/csky/include/asm/bitops.h1
-rw-r--r--arch/h8300/include/asm/bitops.h1
-rw-r--r--arch/hexagon/include/asm/bitops.h1
-rw-r--r--arch/ia64/Kconfig2
-rw-r--r--arch/ia64/include/asm/bitops.h2
-rw-r--r--arch/ia64/kernel/salinfo.c10
-rw-r--r--arch/ia64/pci/fixup.c4
-rw-r--r--arch/m68k/configs/amiga_defconfig1
-rw-r--r--arch/m68k/configs/apollo_defconfig1
-rw-r--r--arch/m68k/configs/atari_defconfig1
-rw-r--r--arch/m68k/configs/bvme6000_defconfig1
-rw-r--r--arch/m68k/configs/hp300_defconfig1
-rw-r--r--arch/m68k/configs/mac_defconfig1
-rw-r--r--arch/m68k/configs/multi_defconfig1
-rw-r--r--arch/m68k/configs/mvme147_defconfig1
-rw-r--r--arch/m68k/configs/mvme16x_defconfig1
-rw-r--r--arch/m68k/configs/q40_defconfig1
-rw-r--r--arch/m68k/configs/sun3_defconfig1
-rw-r--r--arch/m68k/configs/sun3x_defconfig1
-rw-r--r--arch/m68k/include/asm/bitops.h2
-rw-r--r--arch/mips/Kconfig1
-rw-r--r--arch/mips/cavium-octeon/octeon-memcpy.S2
-rw-r--r--arch/mips/include/asm/asm.h4
-rw-r--r--arch/mips/include/asm/bitops.h1
-rw-r--r--arch/mips/include/asm/ftrace.h4
-rw-r--r--arch/mips/include/asm/r4kcache.h4
-rw-r--r--arch/mips/include/asm/unaligned-emul.h176
-rw-r--r--arch/mips/include/uapi/asm/socket.h2
-rw-r--r--arch/mips/kernel/mips-r2-to-r6-emul.c104
-rw-r--r--arch/mips/kernel/r2300_fpu.S6
-rw-r--r--arch/mips/kernel/r4k_fpu.S2
-rw-r--r--arch/mips/kernel/relocate_kernel.S22
-rw-r--r--arch/mips/kernel/scall32-o32.S10
-rw-r--r--arch/mips/kernel/scall64-n32.S2
-rw-r--r--arch/mips/kernel/scall64-n64.S2
-rw-r--r--arch/mips/kernel/scall64-o32.S10
-rw-r--r--arch/mips/kernel/syscall.c8
-rw-r--r--arch/mips/kvm/vz.c12
-rw-r--r--arch/mips/lib/csum_partial.S4
-rw-r--r--arch/mips/lib/memcpy.S4
-rw-r--r--arch/mips/lib/memset.S2
-rw-r--r--arch/mips/lib/strncpy_user.S4
-rw-r--r--arch/mips/lib/strnlen_user.S2
-rw-r--r--arch/mips/loongson64/vbios_quirk.c9
-rw-r--r--arch/openrisc/include/asm/bitops.h1
-rw-r--r--arch/parisc/include/asm/bitops.h1
-rw-r--r--arch/parisc/include/asm/processor.h1
-rw-r--r--arch/parisc/include/uapi/asm/socket.h2
-rw-r--r--arch/parisc/kernel/setup.c15
-rw-r--r--arch/parisc/kernel/toc.c3
-rw-r--r--arch/powerpc/boot/dts/wii.dts5
-rw-r--r--arch/powerpc/configs/gamecube_defconfig2
-rw-r--r--arch/powerpc/configs/wii_defconfig2
-rw-r--r--arch/powerpc/include/asm/bitops.h2
-rw-r--r--arch/powerpc/include/asm/book3s/32/mmu-hash.h2
-rw-r--r--arch/powerpc/include/asm/book3s/32/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/book3s/64/pgtable.h2
-rw-r--r--arch/powerpc/include/asm/fixmap.h6
-rw-r--r--arch/powerpc/include/asm/hw_irq.h2
-rw-r--r--arch/powerpc/include/asm/kvm_book3s_64.h1
-rw-r--r--arch/powerpc/include/asm/kvm_host.h1
-rw-r--r--arch/powerpc/include/asm/nohash/32/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/nohash/64/pgtable.h1
-rw-r--r--arch/powerpc/include/asm/ppc-opcode.h1
-rw-r--r--arch/powerpc/include/asm/syscall.h4
-rw-r--r--arch/powerpc/include/asm/thread_info.h2
-rw-r--r--arch/powerpc/kernel/interrupt_64.S2
-rw-r--r--arch/powerpc/kernel/proc_powerpc.c4
-rw-r--r--arch/powerpc/kernel/time.c5
-rw-r--r--arch/powerpc/kvm/book3s_hv.c3
-rw-r--r--arch/powerpc/kvm/book3s_hv_nested.c2
-rw-r--r--arch/powerpc/mm/book3s32/mmu.c10
-rw-r--r--arch/powerpc/mm/kasan/book3s_32.c59
-rw-r--r--arch/powerpc/mm/pgtable.c9
-rw-r--r--arch/powerpc/net/bpf_jit_comp.c29
-rw-r--r--arch/powerpc/net/bpf_jit_comp32.c9
-rw-r--r--arch/powerpc/net/bpf_jit_comp64.c29
-rw-r--r--arch/powerpc/perf/core-book3s.c75
-rw-r--r--arch/powerpc/platforms/pasemi/dma_lib.c4
-rw-r--r--arch/riscv/Kconfig52
-rw-r--r--arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts5
-rw-r--r--arch/riscv/configs/nommu_k210_defconfig1
-rw-r--r--arch/riscv/configs/nommu_k210_sdcard_defconfig1
-rw-r--r--arch/riscv/configs/nommu_virt_defconfig2
-rw-r--r--arch/riscv/include/asm/bitops.h1
-rw-r--r--arch/riscv/include/asm/cpu_ops.h2
-rw-r--r--arch/riscv/include/asm/cpu_ops_sbi.h25
-rw-r--r--arch/riscv/include/asm/csr.h3
-rw-r--r--arch/riscv/include/asm/fixmap.h1
-rw-r--r--arch/riscv/include/asm/kasan.h11
-rw-r--r--arch/riscv/include/asm/page.h16
-rw-r--r--arch/riscv/include/asm/pgalloc.h40
-rw-r--r--arch/riscv/include/asm/pgtable-64.h108
-rw-r--r--arch/riscv/include/asm/pgtable.h65
-rw-r--r--arch/riscv/include/asm/sbi.h19
-rw-r--r--arch/riscv/include/asm/smp.h2
-rw-r--r--arch/riscv/include/asm/sparsemem.h6
-rw-r--r--arch/riscv/kernel/Makefile3
-rw-r--r--arch/riscv/kernel/asm-offsets.c3
-rw-r--r--arch/riscv/kernel/cpu.c23
-rw-r--r--arch/riscv/kernel/cpu_ops.c26
-rw-r--r--arch/riscv/kernel/cpu_ops_sbi.c26
-rw-r--r--arch/riscv/kernel/cpu_ops_spinwait.c27
-rw-r--r--arch/riscv/kernel/head.S38
-rw-r--r--arch/riscv/kernel/head.h6
-rw-r--r--arch/riscv/kernel/ptrace.c4
-rw-r--r--arch/riscv/kernel/sbi.c189
-rw-r--r--arch/riscv/kernel/setup.c10
-rw-r--r--arch/riscv/kernel/smpboot.c2
-rw-r--r--arch/riscv/kvm/mmu.c4
-rw-r--r--arch/riscv/kvm/vcpu_sbi_replace.c11
-rw-r--r--arch/riscv/kvm/vcpu_sbi_v01.c11
-rw-r--r--arch/riscv/kvm/vmid.c4
-rw-r--r--arch/riscv/mm/cacheflush.c5
-rw-r--r--arch/riscv/mm/context.c4
-rw-r--r--arch/riscv/mm/init.c378
-rw-r--r--arch/riscv/mm/kasan_init.c248
-rw-r--r--arch/riscv/mm/tlbflush.c9
-rw-r--r--arch/riscv/net/bpf_jit_comp64.c2
-rw-r--r--arch/s390/Kconfig16
-rw-r--r--arch/s390/configs/debug_defconfig21
-rw-r--r--arch/s390/configs/defconfig17
-rw-r--r--arch/s390/configs/zfcpdump_defconfig3
-rw-r--r--arch/s390/hypfs/hypfs_vm.c6
-rw-r--r--arch/s390/include/asm/bitops.h1
-rw-r--r--arch/s390/include/asm/cpu_mf.h4
-rw-r--r--arch/s390/include/asm/uaccess.h120
-rw-r--r--arch/s390/kernel/module.c37
-rw-r--r--arch/s390/kernel/nmi.c27
-rw-r--r--arch/s390/kernel/perf_cpum_cf_common.c4
-rw-r--r--arch/s390/kernel/perf_cpum_cf_events.c6
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c2
-rw-r--r--arch/s390/kvm/kvm-s390.c2
-rw-r--r--arch/s390/lib/Makefile3
-rw-r--r--arch/s390/lib/test_modules.c35
-rw-r--r--arch/s390/lib/test_modules.h50
-rw-r--r--arch/s390/lib/test_modules_helpers.c13
-rw-r--r--arch/s390/lib/uaccess.c24
-rw-r--r--arch/sh/include/asm/bitops.h1
-rw-r--r--arch/sh/mm/alignment.c4
-rw-r--r--arch/sparc/include/asm/bitops_32.h1
-rw-r--r--arch/sparc/include/asm/bitops_64.h2
-rw-r--r--arch/sparc/include/uapi/asm/socket.h2
-rw-r--r--arch/x86/Kconfig2
-rw-r--r--arch/x86/events/intel/core.c15
-rw-r--r--arch/x86/events/intel/lbr.c168
-rw-r--r--arch/x86/events/intel/uncore.c2
-rw-r--r--arch/x86/events/intel/uncore.h3
-rw-r--r--arch/x86/events/intel/uncore_discovery.c4
-rw-r--r--arch/x86/events/intel/uncore_discovery.h2
-rw-r--r--arch/x86/events/intel/uncore_snb.c214
-rw-r--r--arch/x86/events/intel/uncore_snbep.c2
-rw-r--r--arch/x86/events/perf_event.h10
-rw-r--r--arch/x86/events/rapl.c9
-rw-r--r--arch/x86/include/asm/bitops.h2
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h18
-rw-r--r--arch/x86/include/uapi/asm/kvm.h3
-rw-r--r--arch/x86/kernel/apic/vector.c4
-rw-r--r--arch/x86/kernel/cpu/mce/amd.c2
-rw-r--r--arch/x86/kernel/cpu/mce/intel.c1
-rw-r--r--arch/x86/kernel/early-quirks.c10
-rw-r--r--arch/x86/kernel/hpet.c8
-rw-r--r--arch/x86/kvm/cpuid.c165
-rw-r--r--arch/x86/kvm/lapic.c4
-rw-r--r--arch/x86/kvm/mmu/mmu.c31
-rw-r--r--arch/x86/kvm/mmu/spte.c1
-rw-r--r--arch/x86/kvm/mmu/spte.h42
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c6
-rw-r--r--arch/x86/kvm/pmu.c33
-rw-r--r--arch/x86/kvm/svm/avic.c123
-rw-r--r--arch/x86/kvm/svm/nested.c9
-rw-r--r--arch/x86/kvm/svm/pmu.c2
-rw-r--r--arch/x86/kvm/svm/sev.c9
-rw-r--r--arch/x86/kvm/svm/svm.c667
-rw-r--r--arch/x86/kvm/svm/svm.h24
-rw-r--r--arch/x86/kvm/svm/svm_onhyperv.h12
-rw-r--r--arch/x86/kvm/vmx/capabilities.h5
-rw-r--r--arch/x86/kvm/vmx/evmcs.c4
-rw-r--r--arch/x86/kvm/vmx/evmcs.h48
-rw-r--r--arch/x86/kvm/vmx/nested.c82
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c20
-rw-r--r--arch/x86/kvm/vmx/posted_intr.c183
-rw-r--r--arch/x86/kvm/vmx/posted_intr.h8
-rw-r--r--arch/x86/kvm/vmx/vmcs12.c4
-rw-r--r--arch/x86/kvm/vmx/vmcs12.h6
-rw-r--r--arch/x86/kvm/vmx/vmx.c115
-rw-r--r--arch/x86/kvm/vmx/vmx.h3
-rw-r--r--arch/x86/kvm/x86.c164
-rw-r--r--arch/x86/kvm/x86.h1
-rw-r--r--arch/x86/kvm/xen.c10
-rw-r--r--arch/x86/pci/fixup.c4
-rw-r--r--arch/x86/um/Kconfig1
-rw-r--r--arch/xtensa/include/asm/bitops.h1
-rw-r--r--arch/xtensa/platforms/iss/simdisk.c4
228 files changed, 3321 insertions, 1710 deletions
diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index 5adca78830b5..e1d8483a45f2 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -430,8 +430,6 @@ static inline unsigned int __arch_hweight8(unsigned int w)
#endif /* __KERNEL__ */
-#include <asm-generic/bitops/find.h>
-
#ifdef __KERNEL__
/*
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index 284d28755b8d..7d81535893af 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -133,6 +133,8 @@
#define SO_RESERVE_MEM 73
+#define SO_TXREHASH 74
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/alpha/kernel/rtc.c b/arch/alpha/kernel/rtc.c
index ce3077946e1d..fb3025396ac9 100644
--- a/arch/alpha/kernel/rtc.c
+++ b/arch/alpha/kernel/rtc.c
@@ -80,7 +80,12 @@ init_rtc_epoch(void)
static int
alpha_rtc_read_time(struct device *dev, struct rtc_time *tm)
{
- mc146818_get_time(tm);
+ int ret = mc146818_get_time(tm);
+
+ if (ret < 0) {
+ dev_err_ratelimited(dev, "unable to read current time\n");
+ return ret;
+ }
/* Adjust for non-default epochs. It's easier to depend on the
generic __get_rtc_time and adjust the epoch here than create
diff --git a/arch/alpha/kernel/srm_env.c b/arch/alpha/kernel/srm_env.c
index 528d2be58182..217b4dca51dd 100644
--- a/arch/alpha/kernel/srm_env.c
+++ b/arch/alpha/kernel/srm_env.c
@@ -83,14 +83,14 @@ static int srm_env_proc_show(struct seq_file *m, void *v)
static int srm_env_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, srm_env_proc_show, PDE_DATA(inode));
+ return single_open(file, srm_env_proc_show, pde_data(inode));
}
static ssize_t srm_env_proc_write(struct file *file, const char __user *buffer,
size_t count, loff_t *pos)
{
int res;
- unsigned long id = (unsigned long)PDE_DATA(file_inode(file));
+ unsigned long id = (unsigned long)pde_data(file_inode(file));
char *buf = (char *) __get_free_page(GFP_USER);
unsigned long ret1, ret2;
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index f74d9860a442..3c2a4753d09b 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -20,7 +20,6 @@ config ARC
select COMMON_CLK
select DMA_DIRECT_REMAP
select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC)
- select GENERIC_FIND_FIRST_BIT
# for now, we don't need GENERIC_IRQ_PROBE, CONFIG_GENERIC_IRQ_CHIP
select GENERIC_IRQ_SHOW
select GENERIC_PCI_IOMAP
diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
index a7daaf64ae34..bdb7e190a294 100644
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@@ -189,7 +189,6 @@ static inline __attribute__ ((const)) unsigned long __ffs(unsigned long x)
#include <asm-generic/bitops/atomic.h>
#include <asm-generic/bitops/non-atomic.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index fabe39169b12..4c97cb40eebb 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -83,6 +83,7 @@ config ARM
select HAVE_EBPF_JIT if !CPU_ENDIAN_BE32
select HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
+ select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK if !XIP_KERNEL
select HAVE_DMA_CONTIGUOUS if MMU
select HAVE_DYNAMIC_FTRACE if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig
index 383c632eba7b..a9ed79b7f871 100644
--- a/arch/arm/configs/bcm2835_defconfig
+++ b/arch/arm/configs/bcm2835_defconfig
@@ -31,7 +31,6 @@ CONFIG_ARCH_BCM2835=y
CONFIG_PREEMPT_VOLUNTARY=y
CONFIG_AEABI=y
CONFIG_KSM=y
-CONFIG_CLEANCACHE=y
CONFIG_CMA=y
CONFIG_SECCOMP=y
CONFIG_KEXEC=y
diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig
index 0daa9c0d298e..9981566f2096 100644
--- a/arch/arm/configs/qcom_defconfig
+++ b/arch/arm/configs/qcom_defconfig
@@ -27,7 +27,6 @@ CONFIG_PCIE_QCOM=y
CONFIG_SMP=y
CONFIG_PREEMPT=y
CONFIG_HIGHMEM=y
-CONFIG_CLEANCACHE=y
CONFIG_ARM_APPENDED_DTB=y
CONFIG_ARM_ATAG_DTB_COMPAT=y
CONFIG_CPU_IDLE=y
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index 7d23d4bb2168..6fe67963ba5a 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -288,6 +288,7 @@
*/
#define ALT_UP(instr...) \
.pushsection ".alt.smp.init", "a" ;\
+ .align 2 ;\
.long 9998b - . ;\
9997: instr ;\
.if . - 9997b == 2 ;\
@@ -299,6 +300,7 @@
.popsection
#define ALT_UP_B(label) \
.pushsection ".alt.smp.init", "a" ;\
+ .align 2 ;\
.long 9998b - . ;\
W(b) . + (label - 9998b) ;\
.popsection
diff --git a/arch/arm/include/asm/bitops.h b/arch/arm/include/asm/bitops.h
index c92e42a5c8f7..8e94fe7ab5eb 100644
--- a/arch/arm/include/asm/bitops.h
+++ b/arch/arm/include/asm/bitops.h
@@ -264,7 +264,6 @@ static inline int find_next_bit_le(const void *p, int size, int offset)
#endif
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/le.h>
/*
diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h
index 6af68edfa53a..bdc35c0e8dfb 100644
--- a/arch/arm/include/asm/processor.h
+++ b/arch/arm/include/asm/processor.h
@@ -96,6 +96,7 @@ unsigned long __get_wchan(struct task_struct *p);
#define __ALT_SMP_ASM(smp, up) \
"9998: " smp "\n" \
" .pushsection \".alt.smp.init\", \"a\"\n" \
+ " .align 2\n" \
" .long 9998b - .\n" \
" " up "\n" \
" .popsection\n"
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 36fbc3329252..32dbfd81f42a 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -11,6 +11,7 @@
#include <linux/string.h>
#include <asm/memory.h>
#include <asm/domain.h>
+#include <asm/unaligned.h>
#include <asm/unified.h>
#include <asm/compiler.h>
@@ -497,7 +498,10 @@ do { \
} \
default: __err = __get_user_bad(); break; \
} \
- *(type *)(dst) = __val; \
+ if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) \
+ put_unaligned(__val, (type *)(dst)); \
+ else \
+ *(type *)(dst) = __val; /* aligned by caller */ \
if (__err) \
goto err_label; \
} while (0)
@@ -507,7 +511,9 @@ do { \
const type *__pk_ptr = (dst); \
unsigned long __dst = (unsigned long)__pk_ptr; \
int __err = 0; \
- type __val = *(type *)src; \
+ type __val = IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) \
+ ? get_unaligned((type *)(src)) \
+ : *(type *)(src); /* aligned by caller */ \
switch (sizeof(type)) { \
case 1: __put_user_asm_byte(__val, __dst, __err, ""); break; \
case 2: __put_user_asm_half(__val, __dst, __err, ""); break; \
diff --git a/arch/arm/kernel/atags_proc.c b/arch/arm/kernel/atags_proc.c
index 3c2faf2bd124..3ec2afe78423 100644
--- a/arch/arm/kernel/atags_proc.c
+++ b/arch/arm/kernel/atags_proc.c
@@ -13,7 +13,7 @@ struct buffer {
static ssize_t atags_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct buffer *b = PDE_DATA(file_inode(file));
+ struct buffer *b = pde_data(file_inode(file));
return simple_read_from_buffer(buf, count, ppos, b->data, b->size);
}
diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c
index adbb3817d0be..6f499559d193 100644
--- a/arch/arm/mm/alignment.c
+++ b/arch/arm/mm/alignment.c
@@ -1005,7 +1005,7 @@ static int __init noalign_setup(char *__unused)
__setup("noalign", noalign_setup);
/*
- * This needs to be done after sysctl_init, otherwise sys/ will be
+ * This needs to be done after sysctl_init_bases(), otherwise sys/ will be
* overwritten. Actually, this shouldn't be in sys/ at all since
* it isn't a sysctl, and it doesn't contain sysctl information.
* We now locate it in /proc/cpu/alignment instead.
diff --git a/arch/arm/probes/kprobes/Makefile b/arch/arm/probes/kprobes/Makefile
index 14db56f49f0a..6159010dac4a 100644
--- a/arch/arm/probes/kprobes/Makefile
+++ b/arch/arm/probes/kprobes/Makefile
@@ -1,4 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
+KASAN_SANITIZE_actions-common.o := n
+KASAN_SANITIZE_actions-arm.o := n
+KASAN_SANITIZE_actions-thumb.o := n
obj-$(CONFIG_KPROBES) += core.o actions-common.o checkers-common.o
obj-$(CONFIG_ARM_KPROBES_TEST) += test-kprobes.o
test-kprobes-objs := test-core.o
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index dc10d26cb432..f2b5a4abef21 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -120,7 +120,6 @@ config ARM64
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
- select GENERIC_FIND_FIRST_BIT
select GENERIC_IDLE_POLL_SETUP
select GENERIC_IRQ_IPI
select GENERIC_IRQ_PROBE
@@ -671,15 +670,25 @@ config ARM64_ERRATUM_1508412
config ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE
bool
+config ARM64_ERRATUM_2051678
+ bool "Cortex-A510: 2051678: disable Hardware Update of the page table dirty bit"
+ help
+ This options adds the workaround for ARM Cortex-A510 erratum ARM64_ERRATUM_2051678.
+ Affected Coretex-A510 might not respect the ordering rules for
+ hardware update of the page table's dirty bit. The workaround
+ is to not enable the feature on affected CPUs.
+
+ If unsure, say Y.
+
config ARM64_ERRATUM_2119858
- bool "Cortex-A710: 2119858: workaround TRBE overwriting trace data in FILL mode"
+ bool "Cortex-A710/X2: 2119858: workaround TRBE overwriting trace data in FILL mode"
default y
depends on CORESIGHT_TRBE
select ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE
help
- This option adds the workaround for ARM Cortex-A710 erratum 2119858.
+ This option adds the workaround for ARM Cortex-A710/X2 erratum 2119858.
- Affected Cortex-A710 cores could overwrite up to 3 cache lines of trace
+ Affected Cortex-A710/X2 cores could overwrite up to 3 cache lines of trace
data at the base of the buffer (pointed to by TRBASER_EL1) in FILL mode in
the event of a WRAP event.
@@ -762,14 +771,14 @@ config ARM64_ERRATUM_2253138
If unsure, say Y.
config ARM64_ERRATUM_2224489
- bool "Cortex-A710: 2224489: workaround TRBE writing to address out-of-range"
+ bool "Cortex-A710/X2: 2224489: workaround TRBE writing to address out-of-range"
depends on CORESIGHT_TRBE
default y
select ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
help
- This option adds the workaround for ARM Cortex-A710 erratum 2224489.
+ This option adds the workaround for ARM Cortex-A710/X2 erratum 2224489.
- Affected Cortex-A710 cores might write to an out-of-range address, not reserved
+ Affected Cortex-A710/X2 cores might write to an out-of-range address, not reserved
for TRBE. Under some conditions, the TRBE might generate a write to the next
virtually addressed page following the last page of the TRBE address space
(i.e., the TRBLIMITR_EL1.LIMIT), instead of wrapping around to the base.
@@ -779,6 +788,65 @@ config ARM64_ERRATUM_2224489
If unsure, say Y.
+config ARM64_ERRATUM_2064142
+ bool "Cortex-A510: 2064142: workaround TRBE register writes while disabled"
+ depends on COMPILE_TEST # Until the CoreSight TRBE driver changes are in
+ default y
+ help
+ This option adds the workaround for ARM Cortex-A510 erratum 2064142.
+
+ Affected Cortex-A510 core might fail to write into system registers after the
+ TRBE has been disabled. Under some conditions after the TRBE has been disabled
+ writes into TRBE registers TRBLIMITR_EL1, TRBPTR_EL1, TRBBASER_EL1, TRBSR_EL1,
+ and TRBTRG_EL1 will be ignored and will not be effected.
+
+ Work around this in the driver by executing TSB CSYNC and DSB after collection
+ is stopped and before performing a system register write to one of the affected
+ registers.
+
+ If unsure, say Y.
+
+config ARM64_ERRATUM_2038923
+ bool "Cortex-A510: 2038923: workaround TRBE corruption with enable"
+ depends on COMPILE_TEST # Until the CoreSight TRBE driver changes are in
+ default y
+ help
+ This option adds the workaround for ARM Cortex-A510 erratum 2038923.
+
+ Affected Cortex-A510 core might cause an inconsistent view on whether trace is
+ prohibited within the CPU. As a result, the trace buffer or trace buffer state
+ might be corrupted. This happens after TRBE buffer has been enabled by setting
+ TRBLIMITR_EL1.E, followed by just a single context synchronization event before
+ execution changes from a context, in which trace is prohibited to one where it
+ isn't, or vice versa. In these mentioned conditions, the view of whether trace
+ is prohibited is inconsistent between parts of the CPU, and the trace buffer or
+ the trace buffer state might be corrupted.
+
+ Work around this in the driver by preventing an inconsistent view of whether the
+ trace is prohibited or not based on TRBLIMITR_EL1.E by immediately following a
+ change to TRBLIMITR_EL1.E with at least one ISB instruction before an ERET, or
+ two ISB instructions if no ERET is to take place.
+
+ If unsure, say Y.
+
+config ARM64_ERRATUM_1902691
+ bool "Cortex-A510: 1902691: workaround TRBE trace corruption"
+ depends on COMPILE_TEST # Until the CoreSight TRBE driver changes are in
+ default y
+ help
+ This option adds the workaround for ARM Cortex-A510 erratum 1902691.
+
+ Affected Cortex-A510 core might cause trace data corruption, when being written
+ into the memory. Effectively TRBE is broken and hence cannot be used to capture
+ trace data.
+
+ Work around this problem in the driver by just preventing TRBE initialization on
+ affected cpus. The firmware must have disabled the access to TRBE for the kernel
+ on such implementations. This will cover the kernel for any firmware that doesn't
+ do this already.
+
+ If unsure, say Y.
+
config CAVIUM_ERRATUM_22375
bool "Cavium erratum 22375, 24313"
default y
diff --git a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
index 74e66443e4ce..9bec3ba20c69 100644
--- a/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
+++ b/arch/arm64/boot/dts/xilinx/zynqmp.dtsi
@@ -512,6 +512,8 @@
#stream-id-cells = <1>;
iommus = <&smmu 0x874>;
power-domains = <&zynqmp_firmware PD_ETH_0>;
+ resets = <&zynqmp_reset ZYNQMP_RESET_GEM0>;
+ reset-names = "gem0_rst";
};
gem1: ethernet@ff0c0000 {
@@ -526,6 +528,8 @@
#stream-id-cells = <1>;
iommus = <&smmu 0x875>;
power-domains = <&zynqmp_firmware PD_ETH_1>;
+ resets = <&zynqmp_reset ZYNQMP_RESET_GEM1>;
+ reset-names = "gem1_rst";
};
gem2: ethernet@ff0d0000 {
@@ -540,6 +544,8 @@
#stream-id-cells = <1>;
iommus = <&smmu 0x876>;
power-domains = <&zynqmp_firmware PD_ETH_2>;
+ resets = <&zynqmp_reset ZYNQMP_RESET_GEM2>;
+ reset-names = "gem2_rst";
};
gem3: ethernet@ff0e0000 {
@@ -554,6 +560,8 @@
#stream-id-cells = <1>;
iommus = <&smmu 0x877>;
power-domains = <&zynqmp_firmware PD_ETH_3>;
+ resets = <&zynqmp_reset ZYNQMP_RESET_GEM3>;
+ reset-names = "gem3_rst";
};
gpio: gpio@ff0a0000 {
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index d955ade5df7c..5d460f6b7675 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -249,7 +249,7 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \
" mov %" #w "[tmp], %" #w "[old]\n" \
" cas" #mb #sfx "\t%" #w "[tmp], %" #w "[new], %[v]\n" \
" mov %" #w "[ret], %" #w "[tmp]" \
- : [ret] "+r" (x0), [v] "+Q" (*(unsigned long *)ptr), \
+ : [ret] "+r" (x0), [v] "+Q" (*(u##sz *)ptr), \
[tmp] "=&r" (tmp) \
: [old] "r" (x1), [new] "r" (x2) \
: cl); \
diff --git a/arch/arm64/include/asm/bitops.h b/arch/arm64/include/asm/bitops.h
index 81a3e519b07d..9b3c787132d2 100644
--- a/arch/arm64/include/asm/bitops.h
+++ b/arch/arm64/include/asm/bitops.h
@@ -18,7 +18,6 @@
#include <asm-generic/bitops/ffz.h>
#include <asm-generic/bitops/fls64.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index f9bef42c1411..497acf134d99 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -243,7 +243,7 @@ static inline void __cmpwait_case_##sz(volatile void *ptr, \
" cbnz %" #w "[tmp], 1f\n" \
" wfe\n" \
"1:" \
- : [tmp] "=&r" (tmp), [v] "+Q" (*(unsigned long *)ptr) \
+ : [tmp] "=&r" (tmp), [v] "+Q" (*(u##sz *)ptr) \
: [val] "r" (val)); \
}
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 19b8441aa8f2..999b9149f856 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -73,7 +73,9 @@
#define ARM_CPU_PART_CORTEX_A76 0xD0B
#define ARM_CPU_PART_NEOVERSE_N1 0xD0C
#define ARM_CPU_PART_CORTEX_A77 0xD0D
+#define ARM_CPU_PART_CORTEX_A510 0xD46
#define ARM_CPU_PART_CORTEX_A710 0xD47
+#define ARM_CPU_PART_CORTEX_X2 0xD48
#define ARM_CPU_PART_NEOVERSE_N2 0xD49
#define APM_CPU_PART_POTENZA 0x000
@@ -115,7 +117,9 @@
#define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76)
#define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1)
#define MIDR_CORTEX_A77 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A77)
+#define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510)
#define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710)
+#define MIDR_CORTEX_X2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X2)
#define MIDR_NEOVERSE_N2 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N2)
#define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX)
#define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 9e1c1aef9ebd..066098198c24 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -347,6 +347,7 @@ static const struct midr_range trbe_overwrite_fill_mode_cpus[] = {
#endif
#ifdef CONFIG_ARM64_ERRATUM_2119858
MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
+ MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0),
#endif
{},
};
@@ -371,6 +372,7 @@ static struct midr_range trbe_write_out_of_range_cpus[] = {
#endif
#ifdef CONFIG_ARM64_ERRATUM_2224489
MIDR_ALL_VERSIONS(MIDR_CORTEX_A710),
+ MIDR_RANGE(MIDR_CORTEX_X2, 0, 0, 2, 0),
#endif
{},
};
@@ -598,6 +600,33 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
CAP_MIDR_RANGE_LIST(trbe_write_out_of_range_cpus),
},
#endif
+#ifdef CONFIG_ARM64_ERRATUM_2064142
+ {
+ .desc = "ARM erratum 2064142",
+ .capability = ARM64_WORKAROUND_2064142,
+
+ /* Cortex-A510 r0p0 - r0p2 */
+ ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2)
+ },
+#endif
+#ifdef CONFIG_ARM64_ERRATUM_2038923
+ {
+ .desc = "ARM erratum 2038923",
+ .capability = ARM64_WORKAROUND_2038923,
+
+ /* Cortex-A510 r0p0 - r0p2 */
+ ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2)
+ },
+#endif
+#ifdef CONFIG_ARM64_ERRATUM_1902691
+ {
+ .desc = "ARM erratum 1902691",
+ .capability = ARM64_WORKAROUND_1902691,
+
+ /* Cortex-A510 r0p0 - r0p1 */
+ ERRATA_MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 1)
+ },
+#endif
{
}
};
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index a46ab3b1c4d5..e5f23dab1c8d 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1646,6 +1646,9 @@ static bool cpu_has_broken_dbm(void)
/* Kryo4xx Silver (rdpe => r1p0) */
MIDR_REV(MIDR_QCOM_KRYO_4XX_SILVER, 0xd, 0xe),
#endif
+#ifdef CONFIG_ARM64_ERRATUM_2051678
+ MIDR_REV_RANGE(MIDR_CORTEX_A510, 0, 0, 2),
+#endif
{},
};
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 0fb58fed54cb..e4103e085681 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -33,8 +33,8 @@
*/
-static void start_backtrace(struct stackframe *frame, unsigned long fp,
- unsigned long pc)
+static notrace void start_backtrace(struct stackframe *frame, unsigned long fp,
+ unsigned long pc)
{
frame->fp = fp;
frame->pc = pc;
@@ -55,6 +55,7 @@ static void start_backtrace(struct stackframe *frame, unsigned long fp,
frame->prev_fp = 0;
frame->prev_type = STACK_TYPE_UNKNOWN;
}
+NOKPROBE_SYMBOL(start_backtrace);
/*
* Unwind from one frame record (A) to the next frame record (B).
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 60813497a381..172452f79e46 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -29,8 +29,11 @@ ldflags-y := -shared -soname=linux-vdso.so.1 --hash-style=sysv \
ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18
ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
+# -Wmissing-prototypes and -Wmissing-declarations are removed from
+# the CFLAGS of vgettimeofday.c to make possible to build the
+# kernel with CONFIG_WERROR enabled.
CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) \
- $(CC_FLAGS_LTO)
+ $(CC_FLAGS_LTO) -Wmissing-prototypes -Wmissing-declarations
KASAN_SANITIZE := n
KCSAN_SANITIZE := n
UBSAN_SANITIZE := n
diff --git a/arch/arm64/kvm/hyp/exception.c b/arch/arm64/kvm/hyp/exception.c
index 0418399e0a20..c5d009715402 100644
--- a/arch/arm64/kvm/hyp/exception.c
+++ b/arch/arm64/kvm/hyp/exception.c
@@ -38,7 +38,10 @@ static inline void __vcpu_write_sys_reg(struct kvm_vcpu *vcpu, u64 val, int reg)
static void __vcpu_write_spsr(struct kvm_vcpu *vcpu, u64 val)
{
- write_sysreg_el1(val, SYS_SPSR);
+ if (has_vhe())
+ write_sysreg_el1(val, SYS_SPSR);
+ else
+ __vcpu_sys_reg(vcpu, SPSR_EL1) = val;
}
static void __vcpu_write_spsr_abt(struct kvm_vcpu *vcpu, u64 val)
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index 844a6f003fd5..2cb3867eb7c2 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -983,13 +983,9 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
*/
stage2_put_pte(ptep, mmu, addr, level, mm_ops);
- if (need_flush) {
- kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops);
-
- dcache_clean_inval_poc((unsigned long)pte_follow,
- (unsigned long)pte_follow +
- kvm_granule_size(level));
- }
+ if (need_flush && mm_ops->dcache_clean_inval_poc)
+ mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
+ kvm_granule_size(level));
if (childp)
mm_ops->put_page(childp);
@@ -1151,15 +1147,13 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
struct kvm_pgtable *pgt = arg;
struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops;
kvm_pte_t pte = *ptep;
- kvm_pte_t *pte_follow;
if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte))
return 0;
- pte_follow = kvm_pte_follow(pte, mm_ops);
- dcache_clean_inval_poc((unsigned long)pte_follow,
- (unsigned long)pte_follow +
- kvm_granule_size(level));
+ if (mm_ops->dcache_clean_inval_poc)
+ mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),
+ kvm_granule_size(level));
return 0;
}
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index 20db2f281cf2..4fb419f7b8b6 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -983,6 +983,9 @@ static void __vgic_v3_read_ctlr(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
val = ((vtr >> 29) & 7) << ICC_CTLR_EL1_PRI_BITS_SHIFT;
/* IDbits */
val |= ((vtr >> 23) & 7) << ICC_CTLR_EL1_ID_BITS_SHIFT;
+ /* SEIS */
+ if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK)
+ val |= BIT(ICC_CTLR_EL1_SEIS_SHIFT);
/* A3V */
val |= ((vtr >> 21) & 1) << ICC_CTLR_EL1_A3V_SHIFT;
/* EOImode */
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index a33d4366b326..b549af8b1dc2 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -609,6 +609,18 @@ static int __init early_gicv4_enable(char *buf)
}
early_param("kvm-arm.vgic_v4_enable", early_gicv4_enable);
+static const struct midr_range broken_seis[] = {
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_ICESTORM),
+ MIDR_ALL_VERSIONS(MIDR_APPLE_M1_FIRESTORM),
+ {},
+};
+
+static bool vgic_v3_broken_seis(void)
+{
+ return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) &&
+ is_midr_in_range_list(read_cpuid_id(), broken_seis));
+}
+
/**
* vgic_v3_probe - probe for a VGICv3 compatible interrupt controller
* @info: pointer to the GIC description
@@ -676,9 +688,10 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
group1_trap = true;
}
- if (kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_SEIS_MASK) {
- kvm_info("GICv3 with locally generated SEI\n");
+ if (vgic_v3_broken_seis()) {
+ kvm_info("GICv3 with broken locally generated SEI\n");
+ kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_SEIS_MASK;
group0_trap = true;
group1_trap = true;
if (ich_vtr_el2 & ICH_VTR_TDS_MASK)
diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index c0181e60cc98..489455309695 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -40,8 +40,8 @@ static bool
ex_handler_load_unaligned_zeropad(const struct exception_table_entry *ex,
struct pt_regs *regs)
{
- int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->type);
- int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->type);
+ int reg_data = FIELD_GET(EX_DATA_REG_DATA, ex->data);
+ int reg_addr = FIELD_GET(EX_DATA_REG_ADDR, ex->data);
unsigned long data, addr, offset;
addr = pt_regs_read_reg(regs, reg_addr);
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index a8834434af99..db63cc885771 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -172,7 +172,7 @@ int pfn_is_map_memory(unsigned long pfn)
}
EXPORT_SYMBOL(pfn_is_map_memory);
-static phys_addr_t memory_limit = PHYS_ADDR_MAX;
+static phys_addr_t memory_limit __ro_after_init = PHYS_ADDR_MAX;
/*
* Limit the memory size that was specified via FDT.
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 870c39537dd0..e7719e8f18de 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -55,6 +55,9 @@ WORKAROUND_1418040
WORKAROUND_1463225
WORKAROUND_1508412
WORKAROUND_1542419
+WORKAROUND_2064142
+WORKAROUND_2038923
+WORKAROUND_1902691
WORKAROUND_TRBE_OVERWRITE_FILL_MODE
WORKAROUND_TSB_FLUSH_FAILURE
WORKAROUND_TRBE_WRITE_OUT_OF_RANGE
diff --git a/arch/csky/include/asm/bitops.h b/arch/csky/include/asm/bitops.h
index 02b72a000767..72e1b2aa29a0 100644
--- a/arch/csky/include/asm/bitops.h
+++ b/arch/csky/include/asm/bitops.h
@@ -59,7 +59,6 @@ static __always_inline unsigned long __fls(unsigned long x)
#include <asm-generic/bitops/ffz.h>
#include <asm-generic/bitops/fls64.h>
-#include <asm-generic/bitops/find.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
diff --git a/arch/h8300/include/asm/bitops.h b/arch/h8300/include/asm/bitops.h
index c867a80cab5b..4489e3d6edd3 100644
--- a/arch/h8300/include/asm/bitops.h
+++ b/arch/h8300/include/asm/bitops.h
@@ -168,7 +168,6 @@ static inline unsigned long __ffs(unsigned long word)
return result;
}
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/lock.h>
diff --git a/arch/hexagon/include/asm/bitops.h b/arch/hexagon/include/asm/bitops.h
index 71429f756af0..75d6ba3643b8 100644
--- a/arch/hexagon/include/asm/bitops.h
+++ b/arch/hexagon/include/asm/bitops.h
@@ -271,7 +271,6 @@ static inline unsigned long __fls(unsigned long word)
}
#include <asm-generic/bitops/lock.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/sched.h>
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 703952819e10..a7e01573abd8 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -318,7 +318,7 @@ config ARCH_PROC_KCORE_TEXT
depends on PROC_KCORE
config IA64_MCA_RECOVERY
- tristate "MCA recovery from errors other than TLB."
+ bool "MCA recovery from errors other than TLB."
config IA64_PALINFO
tristate "/proc/pal support"
diff --git a/arch/ia64/include/asm/bitops.h b/arch/ia64/include/asm/bitops.h
index 2f24ee6459d2..577be93c0818 100644
--- a/arch/ia64/include/asm/bitops.h
+++ b/arch/ia64/include/asm/bitops.h
@@ -441,8 +441,6 @@ static __inline__ unsigned long __arch_hweight64(unsigned long x)
#endif /* __KERNEL__ */
-#include <asm-generic/bitops/find.h>
-
#ifdef __KERNEL__
#include <asm-generic/bitops/le.h>
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index a25ab9b37953..bd3ba276e69c 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -282,7 +282,7 @@ salinfo_event_open(struct inode *inode, struct file *file)
static ssize_t
salinfo_event_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
{
- struct salinfo_data *data = PDE_DATA(file_inode(file));
+ struct salinfo_data *data = pde_data(file_inode(file));
char cmd[32];
size_t size;
int i, n, cpu = -1;
@@ -340,7 +340,7 @@ static const struct proc_ops salinfo_event_proc_ops = {
static int
salinfo_log_open(struct inode *inode, struct file *file)
{
- struct salinfo_data *data = PDE_DATA(inode);
+ struct salinfo_data *data = pde_data(inode);
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -365,7 +365,7 @@ salinfo_log_open(struct inode *inode, struct file *file)
static int
salinfo_log_release(struct inode *inode, struct file *file)
{
- struct salinfo_data *data = PDE_DATA(inode);
+ struct salinfo_data *data = pde_data(inode);
if (data->state == STATE_NO_DATA) {
vfree(data->log_buffer);
@@ -433,7 +433,7 @@ retry:
static ssize_t
salinfo_log_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos)
{
- struct salinfo_data *data = PDE_DATA(file_inode(file));
+ struct salinfo_data *data = pde_data(file_inode(file));
u8 *buf;
u64 bufsize;
@@ -494,7 +494,7 @@ salinfo_log_clear(struct salinfo_data *data, int cpu)
static ssize_t
salinfo_log_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos)
{
- struct salinfo_data *data = PDE_DATA(file_inode(file));
+ struct salinfo_data *data = pde_data(file_inode(file));
char cmd[32];
size_t size;
u32 offset;
diff --git a/arch/ia64/pci/fixup.c b/arch/ia64/pci/fixup.c
index acb55a41260d..2bcdd7d3a1ad 100644
--- a/arch/ia64/pci/fixup.c
+++ b/arch/ia64/pci/fixup.c
@@ -76,5 +76,5 @@ static void pci_fixup_video(struct pci_dev *pdev)
}
}
}
-DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
- PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig
index a4b6c7108465..bc9952f8be66 100644
--- a/arch/m68k/configs/amiga_defconfig
+++ b/arch/m68k/configs/amiga_defconfig
@@ -45,7 +45,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig
index 2db721965520..a77269c6e5ba 100644
--- a/arch/m68k/configs/apollo_defconfig
+++ b/arch/m68k/configs/apollo_defconfig
@@ -41,7 +41,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig
index c266a704eecd..7a74efa6b9a1 100644
--- a/arch/m68k/configs/atari_defconfig
+++ b/arch/m68k/configs/atari_defconfig
@@ -48,7 +48,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig
index f644f08dd6ed..a5323bf2eb33 100644
--- a/arch/m68k/configs/bvme6000_defconfig
+++ b/arch/m68k/configs/bvme6000_defconfig
@@ -38,7 +38,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig
index e4924650b687..5e80aa0869d5 100644
--- a/arch/m68k/configs/hp300_defconfig
+++ b/arch/m68k/configs/hp300_defconfig
@@ -40,7 +40,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig
index 24113871ea76..e84326a3f62d 100644
--- a/arch/m68k/configs/mac_defconfig
+++ b/arch/m68k/configs/mac_defconfig
@@ -39,7 +39,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig
index 6a7e4be70eea..337552f43339 100644
--- a/arch/m68k/configs/multi_defconfig
+++ b/arch/m68k/configs/multi_defconfig
@@ -59,7 +59,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig
index 1d223247aff0..7b688f7d272a 100644
--- a/arch/m68k/configs/mvme147_defconfig
+++ b/arch/m68k/configs/mvme147_defconfig
@@ -37,7 +37,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig
index 961f789f96c9..7c2cb31d63dd 100644
--- a/arch/m68k/configs/mvme16x_defconfig
+++ b/arch/m68k/configs/mvme16x_defconfig
@@ -38,7 +38,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig
index ff4b5e469390..ca43897af26d 100644
--- a/arch/m68k/configs/q40_defconfig
+++ b/arch/m68k/configs/q40_defconfig
@@ -39,7 +39,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig
index 5f228621d0cc..e3d515f37144 100644
--- a/arch/m68k/configs/sun3_defconfig
+++ b/arch/m68k/configs/sun3_defconfig
@@ -35,7 +35,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig
index a600cb9e68c2..d601606c969b 100644
--- a/arch/m68k/configs/sun3x_defconfig
+++ b/arch/m68k/configs/sun3x_defconfig
@@ -35,7 +35,6 @@ CONFIG_IOSCHED_BFQ=m
CONFIG_BINFMT_AOUT=m
CONFIG_BINFMT_MISC=m
# CONFIG_COMPACTION is not set
-CONFIG_CLEANCACHE=y
CONFIG_ZPOOL=m
CONFIG_NET=y
CONFIG_PACKET=y
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index 7b93e1fd8ffa..51283db53667 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -529,6 +529,4 @@ static inline int __fls(int x)
#include <asm-generic/bitops/le.h>
#endif /* __KERNEL__ */
-#include <asm-generic/bitops/find.h>
-
#endif /* _M68K_BITOPS_H */
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index edf6c1577449..058446f01487 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -32,7 +32,6 @@ config MIPS
select GENERIC_ATOMIC64 if !64BIT
select GENERIC_CMOS_UPDATE
select GENERIC_CPU_AUTOPROBE
- select GENERIC_FIND_FIRST_BIT
select GENERIC_GETTIMEOFDAY
select GENERIC_IOMAP
select GENERIC_IRQ_PROBE
diff --git a/arch/mips/cavium-octeon/octeon-memcpy.S b/arch/mips/cavium-octeon/octeon-memcpy.S
index 0a515cde1c18..25860fba6218 100644
--- a/arch/mips/cavium-octeon/octeon-memcpy.S
+++ b/arch/mips/cavium-octeon/octeon-memcpy.S
@@ -74,7 +74,7 @@
#define EXC(inst_reg,addr,handler) \
9: inst_reg, addr; \
.section __ex_table,"a"; \
- PTR 9b, handler; \
+ PTR_WD 9b, handler; \
.previous
/*
diff --git a/arch/mips/include/asm/asm.h b/arch/mips/include/asm/asm.h
index 6ffdd4b5e1d0..336ac9b65235 100644
--- a/arch/mips/include/asm/asm.h
+++ b/arch/mips/include/asm/asm.h
@@ -285,7 +285,7 @@ symbol = value
#define PTR_SCALESHIFT 2
-#define PTR .word
+#define PTR_WD .word
#define PTRSIZE 4
#define PTRLOG 2
#endif
@@ -310,7 +310,7 @@ symbol = value
#define PTR_SCALESHIFT 3
-#define PTR .dword
+#define PTR_WD .dword
#define PTRSIZE 8
#define PTRLOG 3
#endif
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index 3812082b8295..b4bf754f7db3 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -444,7 +444,6 @@ static inline int ffs(int word)
}
#include <asm-generic/bitops/ffz.h>
-#include <asm-generic/bitops/find.h>
#ifdef __KERNEL__
diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h
index b463f2aa5a61..db497a8167da 100644
--- a/arch/mips/include/asm/ftrace.h
+++ b/arch/mips/include/asm/ftrace.h
@@ -32,7 +32,7 @@ do { \
".previous\n" \
\
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR) "\t1b, 3b\n\t" \
+ STR(PTR_WD) "\t1b, 3b\n\t" \
".previous\n" \
\
: [tmp_dst] "=&r" (dst), [tmp_err] "=r" (error)\
@@ -54,7 +54,7 @@ do { \
".previous\n" \
\
".section\t__ex_table,\"a\"\n\t"\
- STR(PTR) "\t1b, 3b\n\t" \
+ STR(PTR_WD) "\t1b, 3b\n\t" \
".previous\n" \
\
: [tmp_err] "=r" (error) \
diff --git a/arch/mips/include/asm/r4kcache.h b/arch/mips/include/asm/r4kcache.h
index af3788589ee6..431a1c9d53fc 100644
--- a/arch/mips/include/asm/r4kcache.h
+++ b/arch/mips/include/asm/r4kcache.h
@@ -119,7 +119,7 @@ static inline void flush_scache_line(unsigned long addr)
" j 2b \n" \
" .previous \n" \
" .section __ex_table,\"a\" \n" \
- " "STR(PTR)" 1b, 3b \n" \
+ " "STR(PTR_WD)" 1b, 3b \n" \
" .previous" \
: "+r" (__err) \
: "i" (op), "r" (addr), "i" (-EFAULT)); \
@@ -142,7 +142,7 @@ static inline void flush_scache_line(unsigned long addr)
" j 2b \n" \
" .previous \n" \
" .section __ex_table,\"a\" \n" \
- " "STR(PTR)" 1b, 3b \n" \
+ " "STR(PTR_WD)" 1b, 3b \n" \
" .previous" \
: "+r" (__err) \
: "i" (op), "r" (addr), "i" (-EFAULT)); \
diff --git a/arch/mips/include/asm/unaligned-emul.h b/arch/mips/include/asm/unaligned-emul.h
index 2022b18944b9..9af0f4d3d288 100644
--- a/arch/mips/include/asm/unaligned-emul.h
+++ b/arch/mips/include/asm/unaligned-emul.h
@@ -20,8 +20,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -41,8 +41,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -74,10 +74,10 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -102,8 +102,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -125,8 +125,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -145,8 +145,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -178,10 +178,10 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -223,14 +223,14 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
- STR(PTR)"\t5b, 11b\n\t" \
- STR(PTR)"\t6b, 11b\n\t" \
- STR(PTR)"\t7b, 11b\n\t" \
- STR(PTR)"\t8b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t5b, 11b\n\t" \
+ STR(PTR_WD)"\t6b, 11b\n\t" \
+ STR(PTR_WD)"\t7b, 11b\n\t" \
+ STR(PTR_WD)"\t8b, 11b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -255,8 +255,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT));\
@@ -276,8 +276,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT)); \
@@ -296,8 +296,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT)); \
@@ -325,10 +325,10 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
".previous" \
: "=&r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT) \
@@ -365,14 +365,14 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
- STR(PTR)"\t5b, 11b\n\t" \
- STR(PTR)"\t6b, 11b\n\t" \
- STR(PTR)"\t7b, 11b\n\t" \
- STR(PTR)"\t8b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t5b, 11b\n\t" \
+ STR(PTR_WD)"\t6b, 11b\n\t" \
+ STR(PTR_WD)"\t7b, 11b\n\t" \
+ STR(PTR_WD)"\t8b, 11b\n\t" \
".previous" \
: "=&r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT) \
@@ -398,8 +398,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -419,8 +419,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -452,10 +452,10 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -481,8 +481,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -504,8 +504,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -524,8 +524,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -557,10 +557,10 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -602,14 +602,14 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
- STR(PTR)"\t5b, 11b\n\t" \
- STR(PTR)"\t6b, 11b\n\t" \
- STR(PTR)"\t7b, 11b\n\t" \
- STR(PTR)"\t8b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t5b, 11b\n\t" \
+ STR(PTR_WD)"\t6b, 11b\n\t" \
+ STR(PTR_WD)"\t7b, 11b\n\t" \
+ STR(PTR_WD)"\t8b, 11b\n\t" \
".previous" \
: "=&r" (value), "=r" (res) \
: "r" (addr), "i" (-EFAULT)); \
@@ -632,8 +632,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT));\
@@ -653,8 +653,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT)); \
@@ -673,8 +673,8 @@ do { \
"j\t3b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 4b\n\t" \
- STR(PTR)"\t2b, 4b\n\t" \
+ STR(PTR_WD)"\t1b, 4b\n\t" \
+ STR(PTR_WD)"\t2b, 4b\n\t" \
".previous" \
: "=r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT)); \
@@ -703,10 +703,10 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
".previous" \
: "=&r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT) \
@@ -743,14 +743,14 @@ do { \
"j\t10b\n\t" \
".previous\n\t" \
".section\t__ex_table,\"a\"\n\t" \
- STR(PTR)"\t1b, 11b\n\t" \
- STR(PTR)"\t2b, 11b\n\t" \
- STR(PTR)"\t3b, 11b\n\t" \
- STR(PTR)"\t4b, 11b\n\t" \
- STR(PTR)"\t5b, 11b\n\t" \
- STR(PTR)"\t6b, 11b\n\t" \
- STR(PTR)"\t7b, 11b\n\t" \
- STR(PTR)"\t8b, 11b\n\t" \
+ STR(PTR_WD)"\t1b, 11b\n\t" \
+ STR(PTR_WD)"\t2b, 11b\n\t" \
+ STR(PTR_WD)"\t3b, 11b\n\t" \
+ STR(PTR_WD)"\t4b, 11b\n\t" \
+ STR(PTR_WD)"\t5b, 11b\n\t" \
+ STR(PTR_WD)"\t6b, 11b\n\t" \
+ STR(PTR_WD)"\t7b, 11b\n\t" \
+ STR(PTR_WD)"\t8b, 11b\n\t" \
".previous" \
: "=&r" (res) \
: "r" (value), "r" (addr), "i" (-EFAULT) \
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 24e0efb360f6..1d55e57b8466 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -144,6 +144,8 @@
#define SO_RESERVE_MEM 73
+#define SO_TXREHASH 74
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/mips/kernel/mips-r2-to-r6-emul.c b/arch/mips/kernel/mips-r2-to-r6-emul.c
index a39ec755e4c2..750fe569862b 100644
--- a/arch/mips/kernel/mips-r2-to-r6-emul.c
+++ b/arch/mips/kernel/mips-r2-to-r6-emul.c
@@ -1258,10 +1258,10 @@ fpu_emul:
" j 10b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -1333,10 +1333,10 @@ fpu_emul:
" j 10b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -1404,10 +1404,10 @@ fpu_emul:
" j 9b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -1474,10 +1474,10 @@ fpu_emul:
" j 9b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -1589,14 +1589,14 @@ fpu_emul:
" j 9b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
- STR(PTR) " 5b,8b\n"
- STR(PTR) " 6b,8b\n"
- STR(PTR) " 7b,8b\n"
- STR(PTR) " 0b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
+ STR(PTR_WD) " 5b,8b\n"
+ STR(PTR_WD) " 6b,8b\n"
+ STR(PTR_WD) " 7b,8b\n"
+ STR(PTR_WD) " 0b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -1708,14 +1708,14 @@ fpu_emul:
" j 9b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
- STR(PTR) " 5b,8b\n"
- STR(PTR) " 6b,8b\n"
- STR(PTR) " 7b,8b\n"
- STR(PTR) " 0b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
+ STR(PTR_WD) " 5b,8b\n"
+ STR(PTR_WD) " 6b,8b\n"
+ STR(PTR_WD) " 7b,8b\n"
+ STR(PTR_WD) " 0b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -1827,14 +1827,14 @@ fpu_emul:
" j 9b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
- STR(PTR) " 5b,8b\n"
- STR(PTR) " 6b,8b\n"
- STR(PTR) " 7b,8b\n"
- STR(PTR) " 0b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
+ STR(PTR_WD) " 5b,8b\n"
+ STR(PTR_WD) " 6b,8b\n"
+ STR(PTR_WD) " 7b,8b\n"
+ STR(PTR_WD) " 0b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -1945,14 +1945,14 @@ fpu_emul:
" j 9b\n"
" .previous\n"
" .section __ex_table,\"a\"\n"
- STR(PTR) " 1b,8b\n"
- STR(PTR) " 2b,8b\n"
- STR(PTR) " 3b,8b\n"
- STR(PTR) " 4b,8b\n"
- STR(PTR) " 5b,8b\n"
- STR(PTR) " 6b,8b\n"
- STR(PTR) " 7b,8b\n"
- STR(PTR) " 0b,8b\n"
+ STR(PTR_WD) " 1b,8b\n"
+ STR(PTR_WD) " 2b,8b\n"
+ STR(PTR_WD) " 3b,8b\n"
+ STR(PTR_WD) " 4b,8b\n"
+ STR(PTR_WD) " 5b,8b\n"
+ STR(PTR_WD) " 6b,8b\n"
+ STR(PTR_WD) " 7b,8b\n"
+ STR(PTR_WD) " 0b,8b\n"
" .previous\n"
" .set pop\n"
: "+&r"(rt), "=&r"(rs),
@@ -2007,7 +2007,7 @@ fpu_emul:
"j 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
- STR(PTR) " 1b,3b\n"
+ STR(PTR_WD) " 1b,3b\n"
".previous\n"
: "=&r"(res), "+&r"(err)
: "r"(vaddr), "i"(SIGSEGV)
@@ -2065,7 +2065,7 @@ fpu_emul:
"j 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
- STR(PTR) " 1b,3b\n"
+ STR(PTR_WD) " 1b,3b\n"
".previous\n"
: "+&r"(res), "+&r"(err)
: "r"(vaddr), "i"(SIGSEGV));
@@ -2126,7 +2126,7 @@ fpu_emul:
"j 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
- STR(PTR) " 1b,3b\n"
+ STR(PTR_WD) " 1b,3b\n"
".previous\n"
: "=&r"(res), "+&r"(err)
: "r"(vaddr), "i"(SIGSEGV)
@@ -2189,7 +2189,7 @@ fpu_emul:
"j 2b\n"
".previous\n"
".section __ex_table,\"a\"\n"
- STR(PTR) " 1b,3b\n"
+ STR(PTR_WD) " 1b,3b\n"
".previous\n"
: "+&r"(res), "+&r"(err)
: "r"(vaddr), "i"(SIGSEGV));
diff --git a/arch/mips/kernel/r2300_fpu.S b/arch/mips/kernel/r2300_fpu.S
index cbf6db98cfb3..2748c55820c2 100644
--- a/arch/mips/kernel/r2300_fpu.S
+++ b/arch/mips/kernel/r2300_fpu.S
@@ -23,14 +23,14 @@
#define EX(a,b) \
9: a,##b; \
.section __ex_table,"a"; \
- PTR 9b,fault; \
+ PTR_WD 9b,fault; \
.previous
#define EX2(a,b) \
9: a,##b; \
.section __ex_table,"a"; \
- PTR 9b,fault; \
- PTR 9b+4,fault; \
+ PTR_WD 9b,fault; \
+ PTR_WD 9b+4,fault; \
.previous
.set mips1
diff --git a/arch/mips/kernel/r4k_fpu.S b/arch/mips/kernel/r4k_fpu.S
index b91e91106475..2e687c60bc4f 100644
--- a/arch/mips/kernel/r4k_fpu.S
+++ b/arch/mips/kernel/r4k_fpu.S
@@ -31,7 +31,7 @@
.ex\@: \insn \reg, \src
.set pop
.section __ex_table,"a"
- PTR .ex\@, fault
+ PTR_WD .ex\@, fault
.previous
.endm
diff --git a/arch/mips/kernel/relocate_kernel.S b/arch/mips/kernel/relocate_kernel.S
index f3c908abdbb8..cfde14b48fd8 100644
--- a/arch/mips/kernel/relocate_kernel.S
+++ b/arch/mips/kernel/relocate_kernel.S
@@ -147,10 +147,10 @@ LEAF(kexec_smp_wait)
kexec_args:
EXPORT(kexec_args)
-arg0: PTR 0x0
-arg1: PTR 0x0
-arg2: PTR 0x0
-arg3: PTR 0x0
+arg0: PTR_WD 0x0
+arg1: PTR_WD 0x0
+arg2: PTR_WD 0x0
+arg3: PTR_WD 0x0
.size kexec_args,PTRSIZE*4
#ifdef CONFIG_SMP
@@ -161,10 +161,10 @@ arg3: PTR 0x0
*/
secondary_kexec_args:
EXPORT(secondary_kexec_args)
-s_arg0: PTR 0x0
-s_arg1: PTR 0x0
-s_arg2: PTR 0x0
-s_arg3: PTR 0x0
+s_arg0: PTR_WD 0x0
+s_arg1: PTR_WD 0x0
+s_arg2: PTR_WD 0x0
+s_arg3: PTR_WD 0x0
.size secondary_kexec_args,PTRSIZE*4
kexec_flag:
LONG 0x1
@@ -173,17 +173,17 @@ kexec_flag:
kexec_start_address:
EXPORT(kexec_start_address)
- PTR 0x0
+ PTR_WD 0x0
.size kexec_start_address, PTRSIZE
kexec_indirection_page:
EXPORT(kexec_indirection_page)
- PTR 0
+ PTR_WD 0
.size kexec_indirection_page, PTRSIZE
relocate_new_kernel_end:
relocate_new_kernel_size:
EXPORT(relocate_new_kernel_size)
- PTR relocate_new_kernel_end - relocate_new_kernel
+ PTR_WD relocate_new_kernel_end - relocate_new_kernel
.size relocate_new_kernel_size, PTRSIZE
diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S
index b1b2e106f711..9bfce5f75f60 100644
--- a/arch/mips/kernel/scall32-o32.S
+++ b/arch/mips/kernel/scall32-o32.S
@@ -72,10 +72,10 @@ loads_done:
.set pop
.section __ex_table,"a"
- PTR load_a4, bad_stack_a4
- PTR load_a5, bad_stack_a5
- PTR load_a6, bad_stack_a6
- PTR load_a7, bad_stack_a7
+ PTR_WD load_a4, bad_stack_a4
+ PTR_WD load_a5, bad_stack_a5
+ PTR_WD load_a6, bad_stack_a6
+ PTR_WD load_a7, bad_stack_a7
.previous
lw t0, TI_FLAGS($28) # syscall tracing enabled?
@@ -216,7 +216,7 @@ einval: li v0, -ENOSYS
#endif /* CONFIG_MIPS_MT_FPAFF */
#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, native)
-#define __SYSCALL(nr, entry) PTR entry
+#define __SYSCALL(nr, entry) PTR_WD entry
.align 2
.type sys_call_table, @object
EXPORT(sys_call_table)
diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S
index f650c55a17dc..97456b2ca7dc 100644
--- a/arch/mips/kernel/scall64-n32.S
+++ b/arch/mips/kernel/scall64-n32.S
@@ -101,7 +101,7 @@ not_n32_scall:
END(handle_sysn32)
-#define __SYSCALL(nr, entry) PTR entry
+#define __SYSCALL(nr, entry) PTR_WD entry
.type sysn32_call_table, @object
EXPORT(sysn32_call_table)
#include <asm/syscall_table_n32.h>
diff --git a/arch/mips/kernel/scall64-n64.S b/arch/mips/kernel/scall64-n64.S
index 5d7bfc65e4d0..5f6ed4b4c399 100644
--- a/arch/mips/kernel/scall64-n64.S
+++ b/arch/mips/kernel/scall64-n64.S
@@ -109,7 +109,7 @@ illegal_syscall:
j n64_syscall_exit
END(handle_sys64)
-#define __SYSCALL(nr, entry) PTR entry
+#define __SYSCALL(nr, entry) PTR_WD entry
.align 3
.type sys_call_table, @object
EXPORT(sys_call_table)
diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S
index cedc8bd88804..d3c2616cba22 100644
--- a/arch/mips/kernel/scall64-o32.S
+++ b/arch/mips/kernel/scall64-o32.S
@@ -73,10 +73,10 @@ load_a7: lw a7, 28(t0) # argument #8 from usp
loads_done:
.section __ex_table,"a"
- PTR load_a4, bad_stack_a4
- PTR load_a5, bad_stack_a5
- PTR load_a6, bad_stack_a6
- PTR load_a7, bad_stack_a7
+ PTR_WD load_a4, bad_stack_a4
+ PTR_WD load_a5, bad_stack_a5
+ PTR_WD load_a6, bad_stack_a6
+ PTR_WD load_a7, bad_stack_a7
.previous
li t1, _TIF_WORK_SYSCALL_ENTRY
@@ -214,7 +214,7 @@ einval: li v0, -ENOSYS
END(sys32_syscall)
#define __SYSCALL_WITH_COMPAT(nr, native, compat) __SYSCALL(nr, compat)
-#define __SYSCALL(nr, entry) PTR entry
+#define __SYSCALL(nr, entry) PTR_WD entry
.align 3
.type sys32_call_table,@object
EXPORT(sys32_call_table)
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index 5512cd586e6e..ae93a607ddf7 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -122,8 +122,8 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new)
" j 3b \n"
" .previous \n"
" .section __ex_table,\"a\" \n"
- " "STR(PTR)" 1b, 4b \n"
- " "STR(PTR)" 2b, 4b \n"
+ " "STR(PTR_WD)" 1b, 4b \n"
+ " "STR(PTR_WD)" 2b, 4b \n"
" .previous \n"
" .set pop \n"
: [old] "=&r" (old),
@@ -152,8 +152,8 @@ static inline int mips_atomic_set(unsigned long addr, unsigned long new)
" j 3b \n"
" .previous \n"
" .section __ex_table,\"a\" \n"
- " "STR(PTR)" 1b, 5b \n"
- " "STR(PTR)" 2b, 5b \n"
+ " "STR(PTR_WD)" 1b, 5b \n"
+ " "STR(PTR_WD)" 2b, 5b \n"
" .previous \n"
" .set pop \n"
: [old] "=&r" (old),
diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c
index 4adca5abbc72..c706f5890a05 100644
--- a/arch/mips/kvm/vz.c
+++ b/arch/mips/kvm/vz.c
@@ -458,8 +458,8 @@ void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu)
/**
* _kvm_vz_save_htimer() - Switch to software emulation of guest timer.
* @vcpu: Virtual CPU.
- * @compare: Pointer to write compare value to.
- * @cause: Pointer to write cause value to.
+ * @out_compare: Pointer to write compare value to.
+ * @out_cause: Pointer to write cause value to.
*
* Save VZ guest timer state and switch to software emulation of guest CP0
* timer. The hard timer must already be in use, so preemption should be
@@ -1541,11 +1541,14 @@ static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu)
}
/**
- * kvm_trap_vz_handle_cop_unusuable() - Guest used unusable coprocessor.
+ * kvm_trap_vz_handle_cop_unusable() - Guest used unusable coprocessor.
* @vcpu: Virtual CPU context.
*
* Handle when the guest attempts to use a coprocessor which hasn't been allowed
* by the root context.
+ *
+ * Return: value indicating whether to resume the host or the guest
+ * (RESUME_HOST or RESUME_GUEST)
*/
static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu)
{
@@ -1592,6 +1595,9 @@ static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu)
*
* Handle when the guest attempts to use MSA when it is disabled in the root
* context.
+ *
+ * Return: value indicating whether to resume the host or the guest
+ * (RESUME_HOST or RESUME_GUEST)
*/
static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu)
{
diff --git a/arch/mips/lib/csum_partial.S b/arch/mips/lib/csum_partial.S
index a46db0807195..7767137c3e49 100644
--- a/arch/mips/lib/csum_partial.S
+++ b/arch/mips/lib/csum_partial.S
@@ -347,7 +347,7 @@ EXPORT_SYMBOL(csum_partial)
.if \mode == LEGACY_MODE; \
9: insn reg, addr; \
.section __ex_table,"a"; \
- PTR 9b, .L_exc; \
+ PTR_WD 9b, .L_exc; \
.previous; \
/* This is enabled in EVA mode */ \
.else; \
@@ -356,7 +356,7 @@ EXPORT_SYMBOL(csum_partial)
((\to == USEROP) && (type == ST_INSN)); \
9: __BUILD_EVA_INSN(insn##e, reg, addr); \
.section __ex_table,"a"; \
- PTR 9b, .L_exc; \
+ PTR_WD 9b, .L_exc; \
.previous; \
.else; \
/* EVA without exception */ \
diff --git a/arch/mips/lib/memcpy.S b/arch/mips/lib/memcpy.S
index 277c32296636..18a43f2e29c8 100644
--- a/arch/mips/lib/memcpy.S
+++ b/arch/mips/lib/memcpy.S
@@ -116,7 +116,7 @@
.if \mode == LEGACY_MODE; \
9: insn reg, addr; \
.section __ex_table,"a"; \
- PTR 9b, handler; \
+ PTR_WD 9b, handler; \
.previous; \
/* This is assembled in EVA mode */ \
.else; \
@@ -125,7 +125,7 @@
((\to == USEROP) && (type == ST_INSN)); \
9: __BUILD_EVA_INSN(insn##e, reg, addr); \
.section __ex_table,"a"; \
- PTR 9b, handler; \
+ PTR_WD 9b, handler; \
.previous; \
.else; \
/* \
diff --git a/arch/mips/lib/memset.S b/arch/mips/lib/memset.S
index b0baa3c79fad..0b342bae9a98 100644
--- a/arch/mips/lib/memset.S
+++ b/arch/mips/lib/memset.S
@@ -52,7 +52,7 @@
9: ___BUILD_EVA_INSN(insn, reg, addr); \
.endif; \
.section __ex_table,"a"; \
- PTR 9b, handler; \
+ PTR_WD 9b, handler; \
.previous
.macro f_fill64 dst, offset, val, fixup, mode
diff --git a/arch/mips/lib/strncpy_user.S b/arch/mips/lib/strncpy_user.S
index 556acf684d7b..13aaa9927ad1 100644
--- a/arch/mips/lib/strncpy_user.S
+++ b/arch/mips/lib/strncpy_user.S
@@ -15,7 +15,7 @@
#define EX(insn,reg,addr,handler) \
9: insn reg, addr; \
.section __ex_table,"a"; \
- PTR 9b, handler; \
+ PTR_WD 9b, handler; \
.previous
/*
@@ -59,7 +59,7 @@ LEAF(__strncpy_from_user_asm)
jr ra
.section __ex_table,"a"
- PTR 1b, .Lfault
+ PTR_WD 1b, .Lfault
.previous
EXPORT_SYMBOL(__strncpy_from_user_asm)
diff --git a/arch/mips/lib/strnlen_user.S b/arch/mips/lib/strnlen_user.S
index 92b63f20ec05..6de31b616f9c 100644
--- a/arch/mips/lib/strnlen_user.S
+++ b/arch/mips/lib/strnlen_user.S
@@ -14,7 +14,7 @@
#define EX(insn,reg,addr,handler) \
9: insn reg, addr; \
.section __ex_table,"a"; \
- PTR 9b, handler; \
+ PTR_WD 9b, handler; \
.previous
/*
diff --git a/arch/mips/loongson64/vbios_quirk.c b/arch/mips/loongson64/vbios_quirk.c
index 9a29e94d3db1..3115d4de982c 100644
--- a/arch/mips/loongson64/vbios_quirk.c
+++ b/arch/mips/loongson64/vbios_quirk.c
@@ -3,7 +3,7 @@
#include <linux/pci.h>
#include <loongson.h>
-static void pci_fixup_radeon(struct pci_dev *pdev)
+static void pci_fixup_video(struct pci_dev *pdev)
{
struct resource *res = &pdev->resource[PCI_ROM_RESOURCE];
@@ -22,8 +22,7 @@ static void pci_fixup_radeon(struct pci_dev *pdev)
res->flags = IORESOURCE_MEM | IORESOURCE_ROM_SHADOW |
IORESOURCE_PCI_FIXED;
- dev_info(&pdev->dev, "BAR %d: assigned %pR for Radeon ROM\n",
- PCI_ROM_RESOURCE, res);
+ dev_info(&pdev->dev, "Video device with shadowed ROM at %pR\n", res);
}
-DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_ATI, 0x9615,
- PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_radeon);
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_ATI, 0x9615,
+ PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
diff --git a/arch/openrisc/include/asm/bitops.h b/arch/openrisc/include/asm/bitops.h
index 7f1ca35213d8..d773ed938acb 100644
--- a/arch/openrisc/include/asm/bitops.h
+++ b/arch/openrisc/include/asm/bitops.h
@@ -30,7 +30,6 @@
#include <asm/bitops/fls.h>
#include <asm/bitops/__fls.h>
#include <asm-generic/bitops/fls64.h>
-#include <asm-generic/bitops/find.h>
#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h
index daa2afd974fb..0ec9cfc5131f 100644
--- a/arch/parisc/include/asm/bitops.h
+++ b/arch/parisc/include/asm/bitops.h
@@ -203,7 +203,6 @@ static __inline__ int fls(unsigned int x)
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/sched.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h
index b669f4b9040b..3a3d05438408 100644
--- a/arch/parisc/include/asm/processor.h
+++ b/arch/parisc/include/asm/processor.h
@@ -289,6 +289,7 @@ extern int _parisc_requires_coherency;
extern int running_on_qemu;
+extern void __noreturn toc_intr(struct pt_regs *regs);
extern void toc_handler(void);
extern unsigned int toc_handler_size;
extern unsigned int toc_handler_csum;
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 845ddc63c882..654061e0964e 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -125,6 +125,8 @@
#define SO_RESERVE_MEM 0x4047
+#define SO_TXREHASH 0x4048
+
#if !defined(__KERNEL__)
#if __BITS_PER_LONG == 64
diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c
index cceb09855e03..b91cb45ffd4e 100644
--- a/arch/parisc/kernel/setup.c
+++ b/arch/parisc/kernel/setup.c
@@ -48,6 +48,7 @@ struct proc_dir_entry * proc_mckinley_root __read_mostly = NULL;
void __init setup_cmdline(char **cmdline_p)
{
extern unsigned int boot_args[];
+ char *p;
/* Collect stuff passed in from the boot loader */
@@ -56,9 +57,19 @@ void __init setup_cmdline(char **cmdline_p)
/* called from hpux boot loader */
boot_command_line[0] = '\0';
} else {
- strlcpy(boot_command_line, (char *)__va(boot_args[1]),
+ strscpy(boot_command_line, (char *)__va(boot_args[1]),
COMMAND_LINE_SIZE);
+ /* autodetect console type (if not done by palo yet) */
+ p = boot_command_line;
+ if (!str_has_prefix(p, "console=") && !strstr(p, " console=")) {
+ strlcat(p, " console=", COMMAND_LINE_SIZE);
+ if (PAGE0->mem_cons.cl_class == CL_DUPLEX)
+ strlcat(p, "ttyS0", COMMAND_LINE_SIZE);
+ else
+ strlcat(p, "tty0", COMMAND_LINE_SIZE);
+ }
+
#ifdef CONFIG_BLK_DEV_INITRD
if (boot_args[2] != 0) /* did palo pass us a ramdisk? */
{
@@ -68,7 +79,7 @@ void __init setup_cmdline(char **cmdline_p)
#endif
}
- strcpy(command_line, boot_command_line);
+ strscpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
*cmdline_p = command_line;
}
diff --git a/arch/parisc/kernel/toc.c b/arch/parisc/kernel/toc.c
index be9a0bebe61e..e4b48d07afbd 100644
--- a/arch/parisc/kernel/toc.c
+++ b/arch/parisc/kernel/toc.c
@@ -10,9 +10,10 @@
#include <asm/pdc.h>
#include <asm/pdc_chassis.h>
#include <asm/ldcw.h>
+#include <asm/processor.h>
static unsigned int __aligned(16) toc_lock = 1;
-DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack);
+DEFINE_PER_CPU_PAGE_ALIGNED(char [16384], toc_stack) __visible;
static void toc20_to_pt_regs(struct pt_regs *regs, struct pdc_toc_pim_20 *toc)
{
diff --git a/arch/powerpc/boot/dts/wii.dts b/arch/powerpc/boot/dts/wii.dts
index e9c945b123c6..e46143c32308 100644
--- a/arch/powerpc/boot/dts/wii.dts
+++ b/arch/powerpc/boot/dts/wii.dts
@@ -168,6 +168,11 @@
interrupts = <14>;
};
+ srnprot@d800060 {
+ compatible = "nintendo,hollywood-srnprot";
+ reg = <0x0d800060 0x4>;
+ };
+
GPIO: gpio@d8000c0 {
#gpio-cells = <2>;
compatible = "nintendo,hollywood-gpio";
diff --git a/arch/powerpc/configs/gamecube_defconfig b/arch/powerpc/configs/gamecube_defconfig
index 24c0e0ea5aeb..91a1b99f4e8f 100644
--- a/arch/powerpc/configs/gamecube_defconfig
+++ b/arch/powerpc/configs/gamecube_defconfig
@@ -68,7 +68,7 @@ CONFIG_SND_SEQUENCER=y
CONFIG_SND_SEQUENCER_OSS=y
# CONFIG_USB_SUPPORT is not set
CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_GENERIC=y
+CONFIG_RTC_DRV_GAMECUBE=y
CONFIG_EXT2_FS=y
CONFIG_EXT4_FS=y
CONFIG_ISO9660_FS=y
diff --git a/arch/powerpc/configs/wii_defconfig b/arch/powerpc/configs/wii_defconfig
index a0c45bf2bfb1..0ab78c51455d 100644
--- a/arch/powerpc/configs/wii_defconfig
+++ b/arch/powerpc/configs/wii_defconfig
@@ -98,7 +98,7 @@ CONFIG_LEDS_TRIGGERS=y
CONFIG_LEDS_TRIGGER_HEARTBEAT=y
CONFIG_LEDS_TRIGGER_PANIC=y
CONFIG_RTC_CLASS=y
-CONFIG_RTC_DRV_GENERIC=y
+CONFIG_RTC_DRV_GAMECUBE=y
CONFIG_NVMEM_NINTENDO_OTP=y
CONFIG_EXT2_FS=y
CONFIG_EXT4_FS=y
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index a05d8c62cbea..ea5d27dda8cf 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -328,8 +328,6 @@ unsigned long __arch_hweight64(__u64 w);
#include <asm-generic/bitops/hweight.h>
#endif
-#include <asm-generic/bitops/find.h>
-
/* wrappers that deal with KASAN instrumentation */
#include <asm-generic/bitops/instrumented-atomic.h>
#include <asm-generic/bitops/instrumented-lock.h>
diff --git a/arch/powerpc/include/asm/book3s/32/mmu-hash.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index 7be27862329f..78c6a5fde1d6 100644
--- a/arch/powerpc/include/asm/book3s/32/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
@@ -223,6 +223,8 @@ static __always_inline void update_user_segments(u32 val)
update_user_segment(15, val);
}
+int __init find_free_bat(void);
+unsigned int bat_block_size(unsigned long base, unsigned long top);
#endif /* !__ASSEMBLY__ */
/* We happily ignore the smaller BATs on 601, we don't actually use
diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h
index 609c80f67194..f8b94f78403f 100644
--- a/arch/powerpc/include/asm/book3s/32/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/32/pgtable.h
@@ -178,6 +178,7 @@ static inline bool pte_user(pte_t pte)
#ifndef __ASSEMBLY__
int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
+void unmap_kernel_page(unsigned long va);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 33e073d6b0c4..875730d5af40 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -1082,6 +1082,8 @@ static inline int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t p
return hash__map_kernel_page(ea, pa, prot);
}
+void unmap_kernel_page(unsigned long va);
+
static inline int __meminit vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys)
diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h
index 947b5b9c4424..a832aeafe560 100644
--- a/arch/powerpc/include/asm/fixmap.h
+++ b/arch/powerpc/include/asm/fixmap.h
@@ -111,8 +111,10 @@ static inline void __set_fixmap(enum fixed_addresses idx,
BUILD_BUG_ON(idx >= __end_of_fixed_addresses);
else if (WARN_ON(idx >= __end_of_fixed_addresses))
return;
-
- map_kernel_page(__fix_to_virt(idx), phys, flags);
+ if (pgprot_val(flags))
+ map_kernel_page(__fix_to_virt(idx), phys, flags);
+ else
+ unmap_kernel_page(__fix_to_virt(idx));
}
#define __early_set_fixmap __set_fixmap
diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index a58fb4aa6c81..674e5aaafcbd 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -473,7 +473,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs)
return !(regs->msr & MSR_EE);
}
-static inline bool should_hard_irq_enable(void)
+static __always_inline bool should_hard_irq_enable(void)
{
return false;
}
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index fe07558173ef..827038a33064 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -39,7 +39,6 @@ struct kvm_nested_guest {
pgd_t *shadow_pgtable; /* our page table for this guest */
u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */
u64 process_table; /* process table entry for this guest */
- u64 hfscr; /* HFSCR that the L1 requested for this nested guest */
long refcnt; /* number of pointers to this struct */
struct mutex tlb_lock; /* serialize page faults and tlbies */
struct kvm_nested_guest *next;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index a770443cd6e0..d9bf60bf0816 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -818,6 +818,7 @@ struct kvm_vcpu_arch {
/* For support of nested guests */
struct kvm_nested_guest *nested;
+ u64 nested_hfscr; /* HFSCR that the L1 requested for the nested guest */
u32 nested_vcpu_id;
gpa_t nested_io_gpr;
#endif
diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h
index b67742e2a9b2..d959c2a73fbf 100644
--- a/arch/powerpc/include/asm/nohash/32/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/32/pgtable.h
@@ -64,6 +64,7 @@ extern int icache_44x_need_flush;
#ifndef __ASSEMBLY__
int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot);
+void unmap_kernel_page(unsigned long va);
#endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index a3313e853e5e..2816d158280a 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -308,6 +308,7 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma,
#define __swp_entry_to_pte(x) __pte((x).val)
int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot);
+void unmap_kernel_page(unsigned long va);
extern int __meminit vmemmap_create_mapping(unsigned long start,
unsigned long page_size,
unsigned long phys);
diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
index efad07081cc0..9675303b724e 100644
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -500,6 +500,7 @@
#define PPC_RAW_LDX(r, base, b) (0x7c00002a | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b))
#define PPC_RAW_LHZ(r, base, i) (0xa0000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i))
#define PPC_RAW_LHBRX(r, base, b) (0x7c00062c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b))
+#define PPC_RAW_LWBRX(r, base, b) (0x7c00042c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b))
#define PPC_RAW_LDBRX(r, base, b) (0x7c000428 | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b))
#define PPC_RAW_STWCX(s, a, b) (0x7c00012d | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b))
#define PPC_RAW_CMPWI(a, i) (0x2c000000 | ___PPC_RA(a) | IMM_L(i))
diff --git a/arch/powerpc/include/asm/syscall.h b/arch/powerpc/include/asm/syscall.h
index 52d05b465e3e..25fc8ad9a27a 100644
--- a/arch/powerpc/include/asm/syscall.h
+++ b/arch/powerpc/include/asm/syscall.h
@@ -90,7 +90,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
unsigned long val, mask = -1UL;
unsigned int n = 6;
- if (is_32bit_task())
+ if (is_tsk_32bit_task(task))
mask = 0xffffffff;
while (n--) {
@@ -105,7 +105,7 @@ static inline void syscall_get_arguments(struct task_struct *task,
static inline int syscall_get_arch(struct task_struct *task)
{
- if (is_32bit_task())
+ if (is_tsk_32bit_task(task))
return AUDIT_ARCH_PPC;
else if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
return AUDIT_ARCH_PPC64LE;
diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h
index 5725029aaa29..d6e649b3c70b 100644
--- a/arch/powerpc/include/asm/thread_info.h
+++ b/arch/powerpc/include/asm/thread_info.h
@@ -168,8 +168,10 @@ static inline bool test_thread_local_flags(unsigned int flags)
#ifdef CONFIG_COMPAT
#define is_32bit_task() (test_thread_flag(TIF_32BIT))
+#define is_tsk_32bit_task(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT))
#else
#define is_32bit_task() (IS_ENABLED(CONFIG_PPC32))
+#define is_tsk_32bit_task(tsk) (IS_ENABLED(CONFIG_PPC32))
#endif
#if defined(CONFIG_PPC64)
diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S
index 92088f848266..7bab2d7de372 100644
--- a/arch/powerpc/kernel/interrupt_64.S
+++ b/arch/powerpc/kernel/interrupt_64.S
@@ -30,6 +30,7 @@ COMPAT_SYS_CALL_TABLE:
.ifc \srr,srr
mfspr r11,SPRN_SRR0
ld r12,_NIP(r1)
+ clrrdi r11,r11,2
clrrdi r12,r12,2
100: tdne r11,r12
EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE)
@@ -40,6 +41,7 @@ COMPAT_SYS_CALL_TABLE:
.else
mfspr r11,SPRN_HSRR0
ld r12,_NIP(r1)
+ clrrdi r11,r11,2
clrrdi r12,r12,2
100: tdne r11,r12
EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE)
diff --git a/arch/powerpc/kernel/proc_powerpc.c b/arch/powerpc/kernel/proc_powerpc.c
index 877817471e3c..6a029f2378e1 100644
--- a/arch/powerpc/kernel/proc_powerpc.c
+++ b/arch/powerpc/kernel/proc_powerpc.c
@@ -25,7 +25,7 @@ static ssize_t page_map_read( struct file *file, char __user *buf, size_t nbytes
loff_t *ppos)
{
return simple_read_from_buffer(buf, nbytes, ppos,
- PDE_DATA(file_inode(file)), PAGE_SIZE);
+ pde_data(file_inode(file)), PAGE_SIZE);
}
static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
@@ -34,7 +34,7 @@ static int page_map_mmap( struct file *file, struct vm_area_struct *vma )
return -EINVAL;
remap_pfn_range(vma, vma->vm_start,
- __pa(PDE_DATA(file_inode(file))) >> PAGE_SHIFT,
+ __pa(pde_data(file_inode(file))) >> PAGE_SHIFT,
PAGE_SIZE, vma->vm_page_prot);
return 0;
}
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 62361cc7281c..cd0b8b71ecdd 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -649,8 +649,9 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
__this_cpu_inc(irq_stat.timer_irqs_event);
} else {
now = *next_tb - now;
- if (now <= decrementer_max)
- set_dec_or_work(now);
+ if (now > decrementer_max)
+ now = decrementer_max;
+ set_dec_or_work(now);
__this_cpu_inc(irq_stat.timer_irqs_others);
}
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d1817cd9a691..84c89f08ae9a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1816,7 +1816,6 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{
- struct kvm_nested_guest *nested = vcpu->arch.nested;
int r;
int srcu_idx;
@@ -1922,7 +1921,7 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
* it into a HEAI.
*/
if (!(vcpu->arch.hfscr_permitted & (1UL << cause)) ||
- (nested->hfscr & (1UL << cause))) {
+ (vcpu->arch.nested_hfscr & (1UL << cause))) {
vcpu->arch.trap = BOOK3S_INTERRUPT_H_EMUL_ASSIST;
/*
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
index 8f8daaeeb3b7..9d373f8963ee 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -363,7 +363,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
/* set L1 state to L2 state */
vcpu->arch.nested = l2;
vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
- l2->hfscr = l2_hv.hfscr;
+ vcpu->arch.nested_hfscr = l2_hv.hfscr;
vcpu->arch.regs = l2_regs;
/* Guest must always run with ME enabled, HV disabled. */
diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c
index 94045b265b6b..203735caf691 100644
--- a/arch/powerpc/mm/book3s32/mmu.c
+++ b/arch/powerpc/mm/book3s32/mmu.c
@@ -76,7 +76,7 @@ unsigned long p_block_mapped(phys_addr_t pa)
return 0;
}
-static int __init find_free_bat(void)
+int __init find_free_bat(void)
{
int b;
int n = mmu_has_feature(MMU_FTR_USE_HIGH_BATS) ? 8 : 4;
@@ -100,7 +100,7 @@ static int __init find_free_bat(void)
* - block size has to be a power of two. This is calculated by finding the
* highest bit set to 1.
*/
-static unsigned int block_size(unsigned long base, unsigned long top)
+unsigned int bat_block_size(unsigned long base, unsigned long top)
{
unsigned int max_size = SZ_256M;
unsigned int base_shift = (ffs(base) - 1) & 31;
@@ -145,7 +145,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to
int idx;
while ((idx = find_free_bat()) != -1 && base != top) {
- unsigned int size = block_size(base, top);
+ unsigned int size = bat_block_size(base, top);
if (size < 128 << 10)
break;
@@ -201,12 +201,12 @@ void mmu_mark_initmem_nx(void)
unsigned long size;
for (i = 0; i < nb - 1 && base < top;) {
- size = block_size(base, top);
+ size = bat_block_size(base, top);
setibat(i++, PAGE_OFFSET + base, base, size, PAGE_KERNEL_TEXT);
base += size;
}
if (base < top) {
- size = block_size(base, top);
+ size = bat_block_size(base, top);
if ((top - base) > size) {
size <<= 1;
if (strict_kernel_rwx_enabled() && base + size > border)
diff --git a/arch/powerpc/mm/kasan/book3s_32.c b/arch/powerpc/mm/kasan/book3s_32.c
index 35b287b0a8da..450a67ef0bbe 100644
--- a/arch/powerpc/mm/kasan/book3s_32.c
+++ b/arch/powerpc/mm/kasan/book3s_32.c
@@ -10,48 +10,51 @@ int __init kasan_init_region(void *start, size_t size)
{
unsigned long k_start = (unsigned long)kasan_mem_to_shadow(start);
unsigned long k_end = (unsigned long)kasan_mem_to_shadow(start + size);
- unsigned long k_cur = k_start;
- int k_size = k_end - k_start;
- int k_size_base = 1 << (ffs(k_size) - 1);
+ unsigned long k_nobat = k_start;
+ unsigned long k_cur;
+ phys_addr_t phys;
int ret;
- void *block;
- block = memblock_alloc(k_size, k_size_base);
-
- if (block && k_size_base >= SZ_128K && k_start == ALIGN(k_start, k_size_base)) {
- int shift = ffs(k_size - k_size_base);
- int k_size_more = shift ? 1 << (shift - 1) : 0;
-
- setbat(-1, k_start, __pa(block), k_size_base, PAGE_KERNEL);
- if (k_size_more >= SZ_128K)
- setbat(-1, k_start + k_size_base, __pa(block) + k_size_base,
- k_size_more, PAGE_KERNEL);
- if (v_block_mapped(k_start))
- k_cur = k_start + k_size_base;
- if (v_block_mapped(k_start + k_size_base))
- k_cur = k_start + k_size_base + k_size_more;
-
- update_bats();
+ while (k_nobat < k_end) {
+ unsigned int k_size = bat_block_size(k_nobat, k_end);
+ int idx = find_free_bat();
+
+ if (idx == -1)
+ break;
+ if (k_size < SZ_128K)
+ break;
+ phys = memblock_phys_alloc_range(k_size, k_size, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ break;
+
+ setbat(idx, k_nobat, phys, k_size, PAGE_KERNEL);
+ k_nobat += k_size;
}
+ if (k_nobat != k_start)
+ update_bats();
- if (!block)
- block = memblock_alloc(k_size, PAGE_SIZE);
- if (!block)
- return -ENOMEM;
+ if (k_nobat < k_end) {
+ phys = memblock_phys_alloc_range(k_end - k_nobat, PAGE_SIZE, 0,
+ MEMBLOCK_ALLOC_ANYWHERE);
+ if (!phys)
+ return -ENOMEM;
+ }
ret = kasan_init_shadow_page_tables(k_start, k_end);
if (ret)
return ret;
- kasan_update_early_region(k_start, k_cur, __pte(0));
+ kasan_update_early_region(k_start, k_nobat, __pte(0));
- for (; k_cur < k_end; k_cur += PAGE_SIZE) {
+ for (k_cur = k_nobat; k_cur < k_end; k_cur += PAGE_SIZE) {
pmd_t *pmd = pmd_off_k(k_cur);
- void *va = block + k_cur - k_start;
- pte_t pte = pfn_pte(PHYS_PFN(__pa(va)), PAGE_KERNEL);
+ pte_t pte = pfn_pte(PHYS_PFN(phys + k_cur - k_nobat), PAGE_KERNEL);
__set_pte_at(&init_mm, k_cur, pte_offset_kernel(pmd, k_cur), pte, 0);
}
flush_tlb_kernel_range(k_start, k_end);
+ memset(kasan_mem_to_shadow(start), 0, k_end - k_start);
+
return 0;
}
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index abb3198bd277..6ec5a7dd7913 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -206,6 +206,15 @@ void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep,
__set_pte_at(mm, addr, ptep, pte, 0);
}
+void unmap_kernel_page(unsigned long va)
+{
+ pmd_t *pmdp = pmd_off_k(va);
+ pte_t *ptep = pte_offset_kernel(pmdp, va);
+
+ pte_clear(&init_mm, va, ptep);
+ flush_tlb_kernel_range(va, va + PAGE_SIZE);
+}
+
/*
* This is called when relaxing access to a PTE. It's also called in the page
* fault path when we don't hit any of the major fault cases, ie, a minor
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c
index 9003c313475d..a4f4d347e6bd 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -23,15 +23,15 @@ static void bpf_jit_fill_ill_insns(void *area, unsigned int size)
memset32(area, BREAKPOINT_INSTRUCTION, size / 4);
}
-/* Fix the branch target addresses for subprog calls */
-static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image,
- struct codegen_context *ctx, u32 *addrs)
+/* Fix updated addresses (for subprog calls, ldimm64, et al) during extra pass */
+static int bpf_jit_fixup_addresses(struct bpf_prog *fp, u32 *image,
+ struct codegen_context *ctx, u32 *addrs)
{
const struct bpf_insn *insn = fp->insnsi;
bool func_addr_fixed;
u64 func_addr;
u32 tmp_idx;
- int i, ret;
+ int i, j, ret;
for (i = 0; i < fp->len; i++) {
/*
@@ -66,6 +66,23 @@ static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, u32 *image,
* of the JITed sequence remains unchanged.
*/
ctx->idx = tmp_idx;
+ } else if (insn[i].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ tmp_idx = ctx->idx;
+ ctx->idx = addrs[i] / 4;
+#ifdef CONFIG_PPC32
+ PPC_LI32(ctx->b2p[insn[i].dst_reg] - 1, (u32)insn[i + 1].imm);
+ PPC_LI32(ctx->b2p[insn[i].dst_reg], (u32)insn[i].imm);
+ for (j = ctx->idx - addrs[i] / 4; j < 4; j++)
+ EMIT(PPC_RAW_NOP());
+#else
+ func_addr = ((u64)(u32)insn[i].imm) | (((u64)(u32)insn[i + 1].imm) << 32);
+ PPC_LI64(b2p[insn[i].dst_reg], func_addr);
+ /* overwrite rest with nops */
+ for (j = ctx->idx - addrs[i] / 4; j < 5; j++)
+ EMIT(PPC_RAW_NOP());
+#endif
+ ctx->idx = tmp_idx;
+ i++;
}
}
@@ -200,13 +217,13 @@ skip_init_ctx:
/*
* Do not touch the prologue and epilogue as they will remain
* unchanged. Only fix the branch target address for subprog
- * calls in the body.
+ * calls in the body, and ldimm64 instructions.
*
* This does not change the offsets and lengths of the subprog
* call instruction sequences and hence, the size of the JITed
* image as well.
*/
- bpf_jit_fixup_subprog_calls(fp, code_base, &cgctx, addrs);
+ bpf_jit_fixup_addresses(fp, code_base, &cgctx, addrs);
/* There is no need to perform the usual passes. */
goto skip_codegen_passes;
diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c
index faaebd446cad..cf8dd8aea386 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -191,6 +191,9 @@ void bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 fun
if (image && rel < 0x2000000 && rel >= -0x2000000) {
PPC_BL_ABS(func);
+ EMIT(PPC_RAW_NOP());
+ EMIT(PPC_RAW_NOP());
+ EMIT(PPC_RAW_NOP());
} else {
/* Load function address into r0 */
EMIT(PPC_RAW_LIS(_R0, IMM_H(func)));
@@ -290,6 +293,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
bool func_addr_fixed;
u64 func_addr;
u32 true_cond;
+ u32 tmp_idx;
+ int j;
/*
* addrs[] maps a BPF bytecode address into a real offset from
@@ -905,8 +910,12 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
* 16 byte instruction that uses two 'struct bpf_insn'
*/
case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
+ tmp_idx = ctx->idx;
PPC_LI32(dst_reg_h, (u32)insn[i + 1].imm);
PPC_LI32(dst_reg, (u32)insn[i].imm);
+ /* padding to allow full 4 instructions for later patching */
+ for (j = ctx->idx - tmp_idx; j < 4; j++)
+ EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
break;
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c
index 9eae8d8ed340..e1e8c934308a 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -319,6 +319,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *
u64 imm64;
u32 true_cond;
u32 tmp_idx;
+ int j;
/*
* addrs[] maps a BPF bytecode address into a real offset from
@@ -633,17 +634,21 @@ bpf_alu32_trunc:
EMIT(PPC_RAW_MR(dst_reg, b2p[TMP_REG_1]));
break;
case 64:
- /*
- * Way easier and faster(?) to store the value
- * into stack and then use ldbrx
- *
- * ctx->seen will be reliable in pass2, but
- * the instructions generated will remain the
- * same across all passes
- */
+ /* Store the value to stack and then use byte-reverse loads */
PPC_BPF_STL(dst_reg, 1, bpf_jit_stack_local(ctx));
EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], 1, bpf_jit_stack_local(ctx)));
- EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1]));
+ if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+ EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1]));
+ } else {
+ EMIT(PPC_RAW_LWBRX(dst_reg, 0, b2p[TMP_REG_1]));
+ if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN))
+ EMIT(PPC_RAW_SLDI(dst_reg, dst_reg, 32));
+ EMIT(PPC_RAW_LI(b2p[TMP_REG_2], 4));
+ EMIT(PPC_RAW_LWBRX(b2p[TMP_REG_2], b2p[TMP_REG_2], b2p[TMP_REG_1]));
+ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
+ EMIT(PPC_RAW_SLDI(b2p[TMP_REG_2], b2p[TMP_REG_2], 32));
+ EMIT(PPC_RAW_OR(dst_reg, dst_reg, b2p[TMP_REG_2]));
+ }
break;
}
break;
@@ -848,9 +853,13 @@ emit_clear:
case BPF_LD | BPF_IMM | BPF_DW: /* dst = (u64) imm */
imm64 = ((u64)(u32) insn[i].imm) |
(((u64)(u32) insn[i+1].imm) << 32);
+ tmp_idx = ctx->idx;
+ PPC_LI64(dst_reg, imm64);
+ /* padding to allow full 5 instructions for later patching */
+ for (j = ctx->idx - tmp_idx; j < 5; j++)
+ EMIT(PPC_RAW_NOP());
/* Adjust for two bpf instructions */
addrs[++i] = ctx->idx * 4;
- PPC_LI64(dst_reg, imm64);
break;
/*
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index a684901b6965..b5b42cf0a703 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -776,6 +776,34 @@ static void pmao_restore_workaround(bool ebb)
mtspr(SPRN_PMC6, pmcs[5]);
}
+/*
+ * If the perf subsystem wants performance monitor interrupts as soon as
+ * possible (e.g., to sample the instruction address and stack chain),
+ * this should return true. The IRQ masking code can then enable MSR[EE]
+ * in some places (e.g., interrupt handlers) that allows PMI interrupts
+ * through to improve accuracy of profiles, at the cost of some performance.
+ *
+ * The PMU counters can be enabled by other means (e.g., sysfs raw SPR
+ * access), but in that case there is no need for prompt PMI handling.
+ *
+ * This currently returns true if any perf counter is being used. It
+ * could possibly return false if only events are being counted rather than
+ * samples being taken, but for now this is good enough.
+ */
+bool power_pmu_wants_prompt_pmi(void)
+{
+ struct cpu_hw_events *cpuhw;
+
+ /*
+ * This could simply test local_paca->pmcregs_in_use if that were not
+ * under ifdef KVM.
+ */
+ if (!ppmu)
+ return false;
+
+ cpuhw = this_cpu_ptr(&cpu_hw_events);
+ return cpuhw->n_events;
+}
#endif /* CONFIG_PPC64 */
static void perf_event_interrupt(struct pt_regs *regs);
@@ -1327,9 +1355,20 @@ static void power_pmu_disable(struct pmu *pmu)
* Otherwise provide a warning if there is PMI pending, but
* no counter is found overflown.
*/
- if (any_pmc_overflown(cpuhw))
- clear_pmi_irq_pending();
- else
+ if (any_pmc_overflown(cpuhw)) {
+ /*
+ * Since power_pmu_disable runs under local_irq_save, it
+ * could happen that code hits a PMC overflow without PMI
+ * pending in paca. Hence only clear PMI pending if it was
+ * set.
+ *
+ * If a PMI is pending, then MSR[EE] must be disabled (because
+ * the masked PMI handler disabling EE). So it is safe to
+ * call clear_pmi_irq_pending().
+ */
+ if (pmi_irq_pending())
+ clear_pmi_irq_pending();
+ } else
WARN_ON(pmi_irq_pending());
val = mmcra = cpuhw->mmcr.mmcra;
@@ -2438,36 +2477,6 @@ static void perf_event_interrupt(struct pt_regs *regs)
perf_sample_event_took(sched_clock() - start_clock);
}
-/*
- * If the perf subsystem wants performance monitor interrupts as soon as
- * possible (e.g., to sample the instruction address and stack chain),
- * this should return true. The IRQ masking code can then enable MSR[EE]
- * in some places (e.g., interrupt handlers) that allows PMI interrupts
- * though to improve accuracy of profiles, at the cost of some performance.
- *
- * The PMU counters can be enabled by other means (e.g., sysfs raw SPR
- * access), but in that case there is no need for prompt PMI handling.
- *
- * This currently returns true if any perf counter is being used. It
- * could possibly return false if only events are being counted rather than
- * samples being taken, but for now this is good enough.
- */
-bool power_pmu_wants_prompt_pmi(void)
-{
- struct cpu_hw_events *cpuhw;
-
- /*
- * This could simply test local_paca->pmcregs_in_use if that were not
- * under ifdef KVM.
- */
-
- if (!ppmu)
- return false;
-
- cpuhw = this_cpu_ptr(&cpu_hw_events);
- return cpuhw->n_events;
-}
-
static int power_pmu_prepare_cpu(unsigned int cpu)
{
struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu);
diff --git a/arch/powerpc/platforms/pasemi/dma_lib.c b/arch/powerpc/platforms/pasemi/dma_lib.c
index 270fa3c0d372..26427311fc72 100644
--- a/arch/powerpc/platforms/pasemi/dma_lib.c
+++ b/arch/powerpc/platforms/pasemi/dma_lib.c
@@ -375,7 +375,7 @@ int pasemi_dma_alloc_flag(void)
int bit;
retry:
- bit = find_next_bit(flags_free, MAX_FLAGS, 0);
+ bit = find_first_bit(flags_free, MAX_FLAGS);
if (bit >= MAX_FLAGS)
return -ENOSPC;
if (!test_and_clear_bit(bit, flags_free))
@@ -440,7 +440,7 @@ int pasemi_dma_alloc_fun(void)
int bit;
retry:
- bit = find_next_bit(fun_free, MAX_FLAGS, 0);
+ bit = find_first_bit(fun_free, MAX_FLAGS);
if (bit >= MAX_FLAGS)
return -ENOSPC;
if (!test_and_clear_bit(bit, fun_free))
diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 171ecc6d1792..5adcbd9b5e88 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -147,27 +147,16 @@ config MMU
Select if you want MMU-based virtualised addressing space
support by paged memory management. If unsure, say 'Y'.
-config VA_BITS
- int
- default 32 if 32BIT
- default 39 if 64BIT
-
-config PA_BITS
- int
- default 34 if 32BIT
- default 56 if 64BIT
-
config PAGE_OFFSET
hex
- default 0xC0000000 if 32BIT && MAXPHYSMEM_1GB
+ default 0xC0000000 if 32BIT
default 0x80000000 if 64BIT && !MMU
- default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB
- default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB
+ default 0xffffaf8000000000 if 64BIT
config KASAN_SHADOW_OFFSET
hex
depends on KASAN_GENERIC
- default 0xdfffffc800000000 if 64BIT
+ default 0xdfffffff00000000 if 64BIT
default 0xffffffff if 32BIT
config ARCH_FLATMEM_ENABLE
@@ -213,7 +202,7 @@ config FIX_EARLYCON_MEM
config PGTABLE_LEVELS
int
- default 3 if 64BIT
+ default 4 if 64BIT
default 2
config LOCKDEP_SUPPORT
@@ -271,24 +260,6 @@ config MODULE_SECTIONS
bool
select HAVE_MOD_ARCH_SPECIFIC
-choice
- prompt "Maximum Physical Memory"
- default MAXPHYSMEM_1GB if 32BIT
- default MAXPHYSMEM_2GB if 64BIT && CMODEL_MEDLOW
- default MAXPHYSMEM_128GB if 64BIT && CMODEL_MEDANY
-
- config MAXPHYSMEM_1GB
- depends on 32BIT
- bool "1GiB"
- config MAXPHYSMEM_2GB
- depends on 64BIT
- bool "2GiB"
- config MAXPHYSMEM_128GB
- depends on 64BIT && CMODEL_MEDANY
- bool "128GiB"
-endchoice
-
-
config SMP
bool "Symmetric Multi-Processing"
help
@@ -392,12 +363,25 @@ source "kernel/Kconfig.hz"
config RISCV_SBI_V01
bool "SBI v0.1 support"
- default y
depends on RISCV_SBI
help
This config allows kernel to use SBI v0.1 APIs. This will be
deprecated in future once legacy M-mode software are no longer in use.
+config RISCV_BOOT_SPINWAIT
+ bool "Spinwait booting method"
+ depends on SMP
+ default y
+ help
+ This enables support for booting Linux via spinwait method. In the
+ spinwait method, all cores randomly jump to Linux. One of the cores
+ gets chosen via lottery and all other keep spinning on a percpu
+ variable. This method cannot support CPU hotplug and sparse hartid
+ scheme. It should be only enabled for M-mode Linux or platforms relying
+ on older firmware without SBI HSM extension. All other platforms should
+ rely on ordered booting via SBI HSM extension which gets chosen
+ dynamically at runtime if the firmware supports it.
+
config KEXEC
bool "Kexec system call"
select KEXEC_CORE
diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts
index 6bfa1f24d3de..c4ed9efdff03 100644
--- a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts
+++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts
@@ -39,6 +39,11 @@
clock-frequency = <RTCCLK_FREQ>;
clock-output-names = "rtcclk";
};
+
+ gpio-poweroff {
+ compatible = "gpio-poweroff";
+ gpios = <&gpio 2 GPIO_ACTIVE_LOW>;
+ };
};
&uart0 {
diff --git a/arch/riscv/configs/nommu_k210_defconfig b/arch/riscv/configs/nommu_k210_defconfig
index e8ceab678e8b..3f42ed87dde8 100644
--- a/arch/riscv/configs/nommu_k210_defconfig
+++ b/arch/riscv/configs/nommu_k210_defconfig
@@ -29,7 +29,6 @@ CONFIG_EMBEDDED=y
CONFIG_SLOB=y
# CONFIG_MMU is not set
CONFIG_SOC_CANAAN=y
-CONFIG_MAXPHYSMEM_2GB=y
CONFIG_SMP=y
CONFIG_NR_CPUS=2
CONFIG_CMDLINE="earlycon console=ttySIF0"
diff --git a/arch/riscv/configs/nommu_k210_sdcard_defconfig b/arch/riscv/configs/nommu_k210_sdcard_defconfig
index 46aa3879f19c..2a82a3b2992b 100644
--- a/arch/riscv/configs/nommu_k210_sdcard_defconfig
+++ b/arch/riscv/configs/nommu_k210_sdcard_defconfig
@@ -21,7 +21,6 @@ CONFIG_EMBEDDED=y
CONFIG_SLOB=y
# CONFIG_MMU is not set
CONFIG_SOC_CANAAN=y
-CONFIG_MAXPHYSMEM_2GB=y
CONFIG_SMP=y
CONFIG_NR_CPUS=2
CONFIG_CMDLINE="earlycon console=ttySIF0 rootdelay=2 root=/dev/mmcblk0p1 ro"
diff --git a/arch/riscv/configs/nommu_virt_defconfig b/arch/riscv/configs/nommu_virt_defconfig
index 385cca741b01..e1c9864b6237 100644
--- a/arch/riscv/configs/nommu_virt_defconfig
+++ b/arch/riscv/configs/nommu_virt_defconfig
@@ -24,10 +24,8 @@ CONFIG_EXPERT=y
# CONFIG_VM_EVENT_COUNTERS is not set
# CONFIG_COMPAT_BRK is not set
CONFIG_SLOB=y
-# CONFIG_SLAB_MERGE_DEFAULT is not set
# CONFIG_MMU is not set
CONFIG_SOC_VIRT=y
-CONFIG_MAXPHYSMEM_2GB=y
CONFIG_SMP=y
CONFIG_CMDLINE="root=/dev/vda rw earlycon=uart8250,mmio,0x10000000,115200n8 console=ttyS0"
CONFIG_CMDLINE_FORCE=y
diff --git a/arch/riscv/include/asm/bitops.h b/arch/riscv/include/asm/bitops.h
index 396a3303c537..3540b690944b 100644
--- a/arch/riscv/include/asm/bitops.h
+++ b/arch/riscv/include/asm/bitops.h
@@ -20,7 +20,6 @@
#include <asm-generic/bitops/fls.h>
#include <asm-generic/bitops/__fls.h>
#include <asm-generic/bitops/fls64.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ffs.h>
diff --git a/arch/riscv/include/asm/cpu_ops.h b/arch/riscv/include/asm/cpu_ops.h
index a8ec3c5c1bd2..134590f1b843 100644
--- a/arch/riscv/include/asm/cpu_ops.h
+++ b/arch/riscv/include/asm/cpu_ops.h
@@ -40,7 +40,5 @@ struct cpu_operations {
extern const struct cpu_operations *cpu_ops[NR_CPUS];
void __init cpu_set_ops(int cpu);
-void cpu_update_secondary_bootdata(unsigned int cpuid,
- struct task_struct *tidle);
#endif /* ifndef __ASM_CPU_OPS_H */
diff --git a/arch/riscv/include/asm/cpu_ops_sbi.h b/arch/riscv/include/asm/cpu_ops_sbi.h
new file mode 100644
index 000000000000..56e4b76d09ff
--- /dev/null
+++ b/arch/riscv/include/asm/cpu_ops_sbi.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2021 by Rivos Inc.
+ */
+#ifndef __ASM_CPU_OPS_SBI_H
+#define __ASM_CPU_OPS_SBI_H
+
+#ifndef __ASSEMBLY__
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/threads.h>
+
+/**
+ * struct sbi_hart_boot_data - Hart specific boot used during booting and
+ * cpu hotplug.
+ * @task_ptr: A pointer to the hart specific tp
+ * @stack_ptr: A pointer to the hart specific sp
+ */
+struct sbi_hart_boot_data {
+ void *task_ptr;
+ void *stack_ptr;
+};
+#endif
+
+#endif /* ifndef __ASM_CPU_OPS_SBI_H */
diff --git a/arch/riscv/include/asm/csr.h b/arch/riscv/include/asm/csr.h
index 5046f431645c..ae711692eec9 100644
--- a/arch/riscv/include/asm/csr.h
+++ b/arch/riscv/include/asm/csr.h
@@ -40,14 +40,13 @@
#ifndef CONFIG_64BIT
#define SATP_PPN _AC(0x003FFFFF, UL)
#define SATP_MODE_32 _AC(0x80000000, UL)
-#define SATP_MODE SATP_MODE_32
#define SATP_ASID_BITS 9
#define SATP_ASID_SHIFT 22
#define SATP_ASID_MASK _AC(0x1FF, UL)
#else
#define SATP_PPN _AC(0x00000FFFFFFFFFFF, UL)
#define SATP_MODE_39 _AC(0x8000000000000000, UL)
-#define SATP_MODE SATP_MODE_39
+#define SATP_MODE_48 _AC(0x9000000000000000, UL)
#define SATP_ASID_BITS 16
#define SATP_ASID_SHIFT 44
#define SATP_ASID_MASK _AC(0xFFFF, UL)
diff --git a/arch/riscv/include/asm/fixmap.h b/arch/riscv/include/asm/fixmap.h
index 54cbf07fb4e9..58a718573ad6 100644
--- a/arch/riscv/include/asm/fixmap.h
+++ b/arch/riscv/include/asm/fixmap.h
@@ -24,6 +24,7 @@ enum fixed_addresses {
FIX_HOLE,
FIX_PTE,
FIX_PMD,
+ FIX_PUD,
FIX_TEXT_POKE1,
FIX_TEXT_POKE0,
FIX_EARLYCON_MEM_BASE,
diff --git a/arch/riscv/include/asm/kasan.h b/arch/riscv/include/asm/kasan.h
index b00f503ec124..0b85e363e778 100644
--- a/arch/riscv/include/asm/kasan.h
+++ b/arch/riscv/include/asm/kasan.h
@@ -27,13 +27,18 @@
*/
#define KASAN_SHADOW_SCALE_SHIFT 3
-#define KASAN_SHADOW_SIZE (UL(1) << ((CONFIG_VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
-#define KASAN_SHADOW_START KERN_VIRT_START
-#define KASAN_SHADOW_END (KASAN_SHADOW_START + KASAN_SHADOW_SIZE)
+#define KASAN_SHADOW_SIZE (UL(1) << ((VA_BITS - 1) - KASAN_SHADOW_SCALE_SHIFT))
+/*
+ * Depending on the size of the virtual address space, the region may not be
+ * aligned on PGDIR_SIZE, so force its alignment to ease its population.
+ */
+#define KASAN_SHADOW_START ((KASAN_SHADOW_END - KASAN_SHADOW_SIZE) & PGDIR_MASK)
+#define KASAN_SHADOW_END MODULES_LOWEST_VADDR
#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
void kasan_init(void);
asmlinkage void kasan_early_init(void);
+void kasan_swapper_init(void);
#endif
#endif
diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
index b3e5ff0125fe..160e3a1e8f8b 100644
--- a/arch/riscv/include/asm/page.h
+++ b/arch/riscv/include/asm/page.h
@@ -31,9 +31,20 @@
* When not using MMU this corresponds to the first free page in
* physical memory (aligned on a page boundary).
*/
+#ifdef CONFIG_64BIT
+#ifdef CONFIG_MMU
+#define PAGE_OFFSET kernel_map.page_offset
+#else
#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
-
-#define KERN_VIRT_SIZE (-PAGE_OFFSET)
+#endif
+/*
+ * By default, CONFIG_PAGE_OFFSET value corresponds to SV48 address space so
+ * define the PAGE_OFFSET value for SV39.
+ */
+#define PAGE_OFFSET_L3 _AC(0xffffffd800000000, UL)
+#else
+#define PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
+#endif /* CONFIG_64BIT */
#ifndef __ASSEMBLY__
@@ -86,6 +97,7 @@ extern unsigned long riscv_pfn_base;
#endif /* CONFIG_MMU */
struct kernel_mapping {
+ unsigned long page_offset;
unsigned long virt_addr;
uintptr_t phys_addr;
uintptr_t size;
diff --git a/arch/riscv/include/asm/pgalloc.h b/arch/riscv/include/asm/pgalloc.h
index 0af6933a7100..11823004b87a 100644
--- a/arch/riscv/include/asm/pgalloc.h
+++ b/arch/riscv/include/asm/pgalloc.h
@@ -11,6 +11,8 @@
#include <asm/tlb.h>
#ifdef CONFIG_MMU
+#define __HAVE_ARCH_PUD_ALLOC_ONE
+#define __HAVE_ARCH_PUD_FREE
#include <asm-generic/pgalloc.h>
static inline void pmd_populate_kernel(struct mm_struct *mm,
@@ -36,6 +38,44 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
set_pud(pud, __pud((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
}
+
+static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud)
+{
+ if (pgtable_l4_enabled) {
+ unsigned long pfn = virt_to_pfn(pud);
+
+ set_p4d(p4d, __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+ }
+}
+
+static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d,
+ pud_t *pud)
+{
+ if (pgtable_l4_enabled) {
+ unsigned long pfn = virt_to_pfn(pud);
+
+ set_p4d_safe(p4d,
+ __p4d((pfn << _PAGE_PFN_SHIFT) | _PAGE_TABLE));
+ }
+}
+
+#define pud_alloc_one pud_alloc_one
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+ if (pgtable_l4_enabled)
+ return __pud_alloc_one(mm, addr);
+
+ return NULL;
+}
+
+#define pud_free pud_free
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+ if (pgtable_l4_enabled)
+ __pud_free(mm, pud);
+}
+
+#define __pud_free_tlb(tlb, pud, addr) pud_free((tlb)->mm, pud)
#endif /* __PAGETABLE_PMD_FOLDED */
static inline pgd_t *pgd_alloc(struct mm_struct *mm)
diff --git a/arch/riscv/include/asm/pgtable-64.h b/arch/riscv/include/asm/pgtable-64.h
index 228261aa9628..bbbdd66e5e2f 100644
--- a/arch/riscv/include/asm/pgtable-64.h
+++ b/arch/riscv/include/asm/pgtable-64.h
@@ -8,16 +8,36 @@
#include <linux/const.h>
-#define PGDIR_SHIFT 30
+extern bool pgtable_l4_enabled;
+
+#define PGDIR_SHIFT_L3 30
+#define PGDIR_SHIFT_L4 39
+#define PGDIR_SIZE_L3 (_AC(1, UL) << PGDIR_SHIFT_L3)
+
+#define PGDIR_SHIFT (pgtable_l4_enabled ? PGDIR_SHIFT_L4 : PGDIR_SHIFT_L3)
/* Size of region mapped by a page global directory */
#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))
+/* pud is folded into pgd in case of 3-level page table */
+#define PUD_SHIFT 30
+#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
+#define PUD_MASK (~(PUD_SIZE - 1))
+
#define PMD_SHIFT 21
/* Size of region mapped by a page middle directory */
#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
#define PMD_MASK (~(PMD_SIZE - 1))
+/* Page Upper Directory entry */
+typedef struct {
+ unsigned long pud;
+} pud_t;
+
+#define pud_val(x) ((x).pud)
+#define __pud(x) ((pud_t) { (x) })
+#define PTRS_PER_PUD (PAGE_SIZE / sizeof(pud_t))
+
/* Page Middle Directory entry */
typedef struct {
unsigned long pmd;
@@ -59,6 +79,16 @@ static inline void pud_clear(pud_t *pudp)
set_pud(pudp, __pud(0));
}
+static inline pud_t pfn_pud(unsigned long pfn, pgprot_t prot)
+{
+ return __pud((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
+}
+
+static inline unsigned long _pud_pfn(pud_t pud)
+{
+ return pud_val(pud) >> _PAGE_PFN_SHIFT;
+}
+
static inline pmd_t *pud_pgtable(pud_t pud)
{
return (pmd_t *)pfn_to_virt(pud_val(pud) >> _PAGE_PFN_SHIFT);
@@ -69,6 +99,17 @@ static inline struct page *pud_page(pud_t pud)
return pfn_to_page(pud_val(pud) >> _PAGE_PFN_SHIFT);
}
+#define mm_pud_folded mm_pud_folded
+static inline bool mm_pud_folded(struct mm_struct *mm)
+{
+ if (pgtable_l4_enabled)
+ return false;
+
+ return true;
+}
+
+#define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
+
static inline pmd_t pfn_pmd(unsigned long pfn, pgprot_t prot)
{
return __pmd((pfn << _PAGE_PFN_SHIFT) | pgprot_val(prot));
@@ -84,4 +125,69 @@ static inline unsigned long _pmd_pfn(pmd_t pmd)
#define pmd_ERROR(e) \
pr_err("%s:%d: bad pmd %016lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pud_ERROR(e) \
+ pr_err("%s:%d: bad pud %016lx.\n", __FILE__, __LINE__, pud_val(e))
+
+static inline void set_p4d(p4d_t *p4dp, p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ *p4dp = p4d;
+ else
+ set_pud((pud_t *)p4dp, (pud_t){ p4d_val(p4d) });
+}
+
+static inline int p4d_none(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return (p4d_val(p4d) == 0);
+
+ return 0;
+}
+
+static inline int p4d_present(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return (p4d_val(p4d) & _PAGE_PRESENT);
+
+ return 1;
+}
+
+static inline int p4d_bad(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return !p4d_present(p4d);
+
+ return 0;
+}
+
+static inline void p4d_clear(p4d_t *p4d)
+{
+ if (pgtable_l4_enabled)
+ set_p4d(p4d, __p4d(0));
+}
+
+static inline pud_t *p4d_pgtable(p4d_t p4d)
+{
+ if (pgtable_l4_enabled)
+ return (pud_t *)pfn_to_virt(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
+
+ return (pud_t *)pud_pgtable((pud_t) { p4d_val(p4d) });
+}
+
+static inline struct page *p4d_page(p4d_t p4d)
+{
+ return pfn_to_page(p4d_val(p4d) >> _PAGE_PFN_SHIFT);
+}
+
+#define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
+
+#define pud_offset pud_offset
+static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
+{
+ if (pgtable_l4_enabled)
+ return p4d_pgtable(*p4d) + pud_index(address);
+
+ return (pud_t *)p4d;
+}
+
#endif /* _ASM_RISCV_PGTABLE_64_H */
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 67f687aee673..7e949f25c933 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -24,6 +24,17 @@
#define KERNEL_LINK_ADDR PAGE_OFFSET
#endif
+/* Number of entries in the page global directory */
+#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t))
+/* Number of entries in the page table */
+#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t))
+
+/*
+ * Half of the kernel address space (half of the entries of the page global
+ * directory) is for the direct mapping.
+ */
+#define KERN_VIRT_SIZE ((PTRS_PER_PGD / 2 * PGDIR_SIZE) / 2)
+
#define VMALLOC_SIZE (KERN_VIRT_SIZE >> 1)
#define VMALLOC_END PAGE_OFFSET
#define VMALLOC_START (PAGE_OFFSET - VMALLOC_SIZE)
@@ -39,8 +50,10 @@
/* Modules always live before the kernel */
#ifdef CONFIG_64BIT
-#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
-#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
+/* This is used to define the end of the KASAN shadow region */
+#define MODULES_LOWEST_VADDR (KERNEL_LINK_ADDR - SZ_2G)
+#define MODULES_VADDR (PFN_ALIGN((unsigned long)&_end) - SZ_2G)
+#define MODULES_END (PFN_ALIGN((unsigned long)&_start))
#endif
/*
@@ -48,8 +61,14 @@
* struct pages to map half the virtual address space. Then
* position vmemmap directly below the VMALLOC region.
*/
+#ifdef CONFIG_64BIT
+#define VA_BITS (pgtable_l4_enabled ? 48 : 39)
+#else
+#define VA_BITS 32
+#endif
+
#define VMEMMAP_SHIFT \
- (CONFIG_VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
+ (VA_BITS - PAGE_SHIFT - 1 + STRUCT_PAGE_MAX_SHIFT)
#define VMEMMAP_SIZE BIT(VMEMMAP_SHIFT)
#define VMEMMAP_END VMALLOC_START
#define VMEMMAP_START (VMALLOC_START - VMEMMAP_SIZE)
@@ -83,8 +102,7 @@
#ifndef __ASSEMBLY__
-/* Page Upper Directory not used in RISC-V */
-#include <asm-generic/pgtable-nopud.h>
+#include <asm-generic/pgtable-nop4d.h>
#include <asm/page.h>
#include <asm/tlbflush.h>
#include <linux/mm_types.h>
@@ -107,12 +125,20 @@
#define XIP_FIXUP(addr) (addr)
#endif /* CONFIG_XIP_KERNEL */
-#ifdef CONFIG_MMU
-/* Number of entries in the page global directory */
-#define PTRS_PER_PGD (PAGE_SIZE / sizeof(pgd_t))
-/* Number of entries in the page table */
-#define PTRS_PER_PTE (PAGE_SIZE / sizeof(pte_t))
+struct pt_alloc_ops {
+ pte_t *(*get_pte_virt)(phys_addr_t pa);
+ phys_addr_t (*alloc_pte)(uintptr_t va);
+#ifndef __PAGETABLE_PMD_FOLDED
+ pmd_t *(*get_pmd_virt)(phys_addr_t pa);
+ phys_addr_t (*alloc_pmd)(uintptr_t va);
+ pud_t *(*get_pud_virt)(phys_addr_t pa);
+ phys_addr_t (*alloc_pud)(uintptr_t va);
+#endif
+};
+
+extern struct pt_alloc_ops pt_ops __initdata;
+#ifdef CONFIG_MMU
/* Number of PGD entries that a user-mode program can use */
#define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE)
@@ -659,7 +685,7 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
* and give the kernel the other (upper) half.
*/
#ifdef CONFIG_64BIT
-#define KERN_VIRT_START (-(BIT(CONFIG_VA_BITS)) + TASK_SIZE)
+#define KERN_VIRT_START (-(BIT(VA_BITS)) + TASK_SIZE)
#else
#define KERN_VIRT_START FIXADDR_START
#endif
@@ -667,11 +693,22 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
/*
* Task size is 0x4000000000 for RV64 or 0x9fc00000 for RV32.
* Note that PGDIR_SIZE must evenly divide TASK_SIZE.
+ * Task size is:
+ * - 0x9fc00000 (~2.5GB) for RV32.
+ * - 0x4000000000 ( 256GB) for RV64 using SV39 mmu
+ * - 0x800000000000 ( 128TB) for RV64 using SV48 mmu
+ *
+ * Note that PGDIR_SIZE must evenly divide TASK_SIZE since "RISC-V
+ * Instruction Set Manual Volume II: Privileged Architecture" states that
+ * "load and store effective addresses, which are 64bits, must have bits
+ * 63–48 all equal to bit 47, or else a page-fault exception will occur."
*/
#ifdef CONFIG_64BIT
-#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
+#define TASK_SIZE (PGDIR_SIZE * PTRS_PER_PGD / 2)
+#define TASK_SIZE_MIN (PGDIR_SIZE_L3 * PTRS_PER_PGD / 2)
#else
-#define TASK_SIZE FIXADDR_START
+#define TASK_SIZE FIXADDR_START
+#define TASK_SIZE_MIN TASK_SIZE
#endif
#else /* CONFIG_MMU */
@@ -697,6 +734,8 @@ extern uintptr_t _dtb_early_pa;
#define dtb_early_va _dtb_early_va
#define dtb_early_pa _dtb_early_pa
#endif /* CONFIG_XIP_KERNEL */
+extern u64 satp_mode;
+extern bool pgtable_l4_enabled;
void paging_init(void);
void misc_mem_init(void);
diff --git a/arch/riscv/include/asm/sbi.h b/arch/riscv/include/asm/sbi.h
index 26ba6f2d7a40..d1c37479d828 100644
--- a/arch/riscv/include/asm/sbi.h
+++ b/arch/riscv/include/asm/sbi.h
@@ -8,6 +8,7 @@
#define _ASM_RISCV_SBI_H
#include <linux/types.h>
+#include <linux/cpumask.h>
#ifdef CONFIG_RISCV_SBI
enum sbi_ext_id {
@@ -128,27 +129,27 @@ long sbi_get_mimpid(void);
void sbi_set_timer(uint64_t stime_value);
void sbi_shutdown(void);
void sbi_clear_ipi(void);
-int sbi_send_ipi(const unsigned long *hart_mask);
-int sbi_remote_fence_i(const unsigned long *hart_mask);
-int sbi_remote_sfence_vma(const unsigned long *hart_mask,
+int sbi_send_ipi(const struct cpumask *cpu_mask);
+int sbi_remote_fence_i(const struct cpumask *cpu_mask);
+int sbi_remote_sfence_vma(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size);
-int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
+int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size,
unsigned long asid);
-int sbi_remote_hfence_gvma(const unsigned long *hart_mask,
+int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size);
-int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask,
+int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size,
unsigned long vmid);
-int sbi_remote_hfence_vvma(const unsigned long *hart_mask,
+int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size);
-int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask,
+int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size,
unsigned long asid);
@@ -183,7 +184,7 @@ static inline unsigned long sbi_mk_version(unsigned long major,
int sbi_err_map_linux_errno(int err);
#else /* CONFIG_RISCV_SBI */
-static inline int sbi_remote_fence_i(const unsigned long *hart_mask) { return -1; }
+static inline int sbi_remote_fence_i(const struct cpumask *cpu_mask) { return -1; }
static inline void sbi_init(void) {}
#endif /* CONFIG_RISCV_SBI */
#endif /* _ASM_RISCV_SBI_H */
diff --git a/arch/riscv/include/asm/smp.h b/arch/riscv/include/asm/smp.h
index 6ad749f42807..23170c933d73 100644
--- a/arch/riscv/include/asm/smp.h
+++ b/arch/riscv/include/asm/smp.h
@@ -92,8 +92,6 @@ static inline void riscv_clear_ipi(void)
#endif /* CONFIG_SMP */
-void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out);
-
#if defined(CONFIG_HOTPLUG_CPU) && (CONFIG_SMP)
bool cpu_has_hotplug(unsigned int cpu);
#else
diff --git a/arch/riscv/include/asm/sparsemem.h b/arch/riscv/include/asm/sparsemem.h
index 45a7018a8118..63acaecc3374 100644
--- a/arch/riscv/include/asm/sparsemem.h
+++ b/arch/riscv/include/asm/sparsemem.h
@@ -4,7 +4,11 @@
#define _ASM_RISCV_SPARSEMEM_H
#ifdef CONFIG_SPARSEMEM
-#define MAX_PHYSMEM_BITS CONFIG_PA_BITS
+#ifdef CONFIG_64BIT
+#define MAX_PHYSMEM_BITS 56
+#else
+#define MAX_PHYSMEM_BITS 34
+#endif /* CONFIG_64BIT */
#define SECTION_SIZE_BITS 27
#endif /* CONFIG_SPARSEMEM */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index 3397ddac1a30..612556faa527 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -43,7 +43,8 @@ obj-$(CONFIG_FPU) += fpu.o
obj-$(CONFIG_SMP) += smpboot.o
obj-$(CONFIG_SMP) += smp.o
obj-$(CONFIG_SMP) += cpu_ops.o
-obj-$(CONFIG_SMP) += cpu_ops_spinwait.o
+
+obj-$(CONFIG_RISCV_BOOT_SPINWAIT) += cpu_ops_spinwait.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SECTIONS) += module-sections.o
diff --git a/arch/riscv/kernel/asm-offsets.c b/arch/riscv/kernel/asm-offsets.c
index 253126e4beef..df0519a64eaf 100644
--- a/arch/riscv/kernel/asm-offsets.c
+++ b/arch/riscv/kernel/asm-offsets.c
@@ -12,6 +12,7 @@
#include <asm/kvm_host.h>
#include <asm/thread_info.h>
#include <asm/ptrace.h>
+#include <asm/cpu_ops_sbi.h>
void asm_offsets(void);
@@ -468,4 +469,6 @@ void asm_offsets(void)
DEFINE(PT_SIZE_ON_STACK, ALIGN(sizeof(struct pt_regs), STACK_ALIGN));
OFFSET(KERNEL_MAP_VIRT_ADDR, kernel_mapping, virt_addr);
+ OFFSET(SBI_HART_BOOT_TASK_PTR_OFFSET, sbi_hart_boot_data, task_ptr);
+ OFFSET(SBI_HART_BOOT_STACK_PTR_OFFSET, sbi_hart_boot_data, stack_ptr);
}
diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
index f13b2c9ea912..ad0a7e9f828b 100644
--- a/arch/riscv/kernel/cpu.c
+++ b/arch/riscv/kernel/cpu.c
@@ -7,6 +7,7 @@
#include <linux/seq_file.h>
#include <linux/of.h>
#include <asm/smp.h>
+#include <asm/pgtable.h>
/*
* Returns the hart ID of the given device tree node, or -ENODEV if the node
@@ -71,18 +72,19 @@ static void print_isa(struct seq_file *f, const char *isa)
seq_puts(f, "\n");
}
-static void print_mmu(struct seq_file *f, const char *mmu_type)
+static void print_mmu(struct seq_file *f)
{
+ char sv_type[16];
+
#if defined(CONFIG_32BIT)
- if (strcmp(mmu_type, "riscv,sv32") != 0)
- return;
+ strncpy(sv_type, "sv32", 5);
#elif defined(CONFIG_64BIT)
- if (strcmp(mmu_type, "riscv,sv39") != 0 &&
- strcmp(mmu_type, "riscv,sv48") != 0)
- return;
+ if (pgtable_l4_enabled)
+ strncpy(sv_type, "sv48", 5);
+ else
+ strncpy(sv_type, "sv39", 5);
#endif
-
- seq_printf(f, "mmu\t\t: %s\n", mmu_type+6);
+ seq_printf(f, "mmu\t\t: %s\n", sv_type);
}
static void *c_start(struct seq_file *m, loff_t *pos)
@@ -107,14 +109,13 @@ static int c_show(struct seq_file *m, void *v)
{
unsigned long cpu_id = (unsigned long)v - 1;
struct device_node *node = of_get_cpu_node(cpu_id, NULL);
- const char *compat, *isa, *mmu;
+ const char *compat, *isa;
seq_printf(m, "processor\t: %lu\n", cpu_id);
seq_printf(m, "hart\t\t: %lu\n", cpuid_to_hartid_map(cpu_id));
if (!of_property_read_string(node, "riscv,isa", &isa))
print_isa(m, isa);
- if (!of_property_read_string(node, "mmu-type", &mmu))
- print_mmu(m, mmu);
+ print_mmu(m);
if (!of_property_read_string(node, "compatible", &compat)
&& strcmp(compat, "riscv"))
seq_printf(m, "uarch\t\t: %s\n", compat);
diff --git a/arch/riscv/kernel/cpu_ops.c b/arch/riscv/kernel/cpu_ops.c
index 1985884fe829..170d07e57721 100644
--- a/arch/riscv/kernel/cpu_ops.c
+++ b/arch/riscv/kernel/cpu_ops.c
@@ -8,37 +8,29 @@
#include <linux/of.h>
#include <linux/string.h>
#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
#include <asm/cpu_ops.h>
#include <asm/sbi.h>
#include <asm/smp.h>
const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;
-void *__cpu_up_stack_pointer[NR_CPUS] __section(".data");
-void *__cpu_up_task_pointer[NR_CPUS] __section(".data");
-
extern const struct cpu_operations cpu_ops_sbi;
+#ifdef CONFIG_RISCV_BOOT_SPINWAIT
extern const struct cpu_operations cpu_ops_spinwait;
-
-void cpu_update_secondary_bootdata(unsigned int cpuid,
- struct task_struct *tidle)
-{
- int hartid = cpuid_to_hartid_map(cpuid);
-
- /* Make sure tidle is updated */
- smp_mb();
- WRITE_ONCE(__cpu_up_stack_pointer[hartid],
- task_stack_page(tidle) + THREAD_SIZE);
- WRITE_ONCE(__cpu_up_task_pointer[hartid], tidle);
-}
+#else
+const struct cpu_operations cpu_ops_spinwait = {
+ .name = "",
+ .cpu_prepare = NULL,
+ .cpu_start = NULL,
+};
+#endif
void __init cpu_set_ops(int cpuid)
{
#if IS_ENABLED(CONFIG_RISCV_SBI)
if (sbi_probe_extension(SBI_EXT_HSM) > 0) {
if (!cpuid)
- pr_info("SBI v0.2 HSM extension detected\n");
+ pr_info("SBI HSM extension detected\n");
cpu_ops[cpuid] = &cpu_ops_sbi;
} else
#endif
diff --git a/arch/riscv/kernel/cpu_ops_sbi.c b/arch/riscv/kernel/cpu_ops_sbi.c
index 685fae72b7f5..dae29cbfe550 100644
--- a/arch/riscv/kernel/cpu_ops_sbi.c
+++ b/arch/riscv/kernel/cpu_ops_sbi.c
@@ -7,13 +7,22 @@
#include <linux/init.h>
#include <linux/mm.h>
+#include <linux/sched/task_stack.h>
#include <asm/cpu_ops.h>
+#include <asm/cpu_ops_sbi.h>
#include <asm/sbi.h>
#include <asm/smp.h>
extern char secondary_start_sbi[];
const struct cpu_operations cpu_ops_sbi;
+/*
+ * Ordered booting via HSM brings one cpu at a time. However, cpu hotplug can
+ * be invoked from multiple threads in parallel. Define a per cpu data
+ * to handle that.
+ */
+DEFINE_PER_CPU(struct sbi_hart_boot_data, boot_data);
+
static int sbi_hsm_hart_start(unsigned long hartid, unsigned long saddr,
unsigned long priv)
{
@@ -55,14 +64,19 @@ static int sbi_hsm_hart_get_status(unsigned long hartid)
static int sbi_cpu_start(unsigned int cpuid, struct task_struct *tidle)
{
- int rc;
unsigned long boot_addr = __pa_symbol(secondary_start_sbi);
int hartid = cpuid_to_hartid_map(cpuid);
-
- cpu_update_secondary_bootdata(cpuid, tidle);
- rc = sbi_hsm_hart_start(hartid, boot_addr, 0);
-
- return rc;
+ unsigned long hsm_data;
+ struct sbi_hart_boot_data *bdata = &per_cpu(boot_data, cpuid);
+
+ /* Make sure tidle is updated */
+ smp_mb();
+ bdata->task_ptr = tidle;
+ bdata->stack_ptr = task_stack_page(tidle) + THREAD_SIZE;
+ /* Make sure boot data is updated */
+ smp_mb();
+ hsm_data = __pa(bdata);
+ return sbi_hsm_hart_start(hartid, boot_addr, hsm_data);
}
static int sbi_cpu_prepare(unsigned int cpuid)
diff --git a/arch/riscv/kernel/cpu_ops_spinwait.c b/arch/riscv/kernel/cpu_ops_spinwait.c
index b2c957bb68c1..346847f6c41c 100644
--- a/arch/riscv/kernel/cpu_ops_spinwait.c
+++ b/arch/riscv/kernel/cpu_ops_spinwait.c
@@ -6,11 +6,36 @@
#include <linux/errno.h>
#include <linux/of.h>
#include <linux/string.h>
+#include <linux/sched/task_stack.h>
#include <asm/cpu_ops.h>
#include <asm/sbi.h>
#include <asm/smp.h>
const struct cpu_operations cpu_ops_spinwait;
+void *__cpu_spinwait_stack_pointer[NR_CPUS] __section(".data");
+void *__cpu_spinwait_task_pointer[NR_CPUS] __section(".data");
+
+static void cpu_update_secondary_bootdata(unsigned int cpuid,
+ struct task_struct *tidle)
+{
+ int hartid = cpuid_to_hartid_map(cpuid);
+
+ /*
+ * The hartid must be less than NR_CPUS to avoid out-of-bound access
+ * errors for __cpu_spinwait_stack/task_pointer. That is not always possible
+ * for platforms with discontiguous hartid numbering scheme. That's why
+ * spinwait booting is not the recommended approach for any platforms
+ * booting Linux in S-mode and can be disabled in the future.
+ */
+ if (hartid == INVALID_HARTID || hartid >= NR_CPUS)
+ return;
+
+ /* Make sure tidle is updated */
+ smp_mb();
+ WRITE_ONCE(__cpu_spinwait_stack_pointer[hartid],
+ task_stack_page(tidle) + THREAD_SIZE);
+ WRITE_ONCE(__cpu_spinwait_task_pointer[hartid], tidle);
+}
static int spinwait_cpu_prepare(unsigned int cpuid)
{
@@ -28,7 +53,7 @@ static int spinwait_cpu_start(unsigned int cpuid, struct task_struct *tidle)
* selects the first cpu to boot the kernel and causes the remainder
* of the cpus to spin in a loop waiting for their stack pointer to be
* setup by that main cpu. Writing to bootdata
- * (i.e __cpu_up_stack_pointer) signals to the spinning cpus that they
+ * (i.e __cpu_spinwait_stack_pointer) signals to the spinning cpus that they
* can continue the boot process.
*/
cpu_update_secondary_bootdata(cpuid, tidle);
diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S
index 604d60292dd8..2363b43312fc 100644
--- a/arch/riscv/kernel/head.S
+++ b/arch/riscv/kernel/head.S
@@ -11,6 +11,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/csr.h>
+#include <asm/cpu_ops_sbi.h>
#include <asm/hwcap.h>
#include <asm/image.h>
#include "efi-header.S"
@@ -105,7 +106,8 @@ relocate:
/* Compute satp for kernel page tables, but don't load it yet */
srl a2, a0, PAGE_SHIFT
- li a1, SATP_MODE
+ la a1, satp_mode
+ REG_L a1, 0(a1)
or a2, a2, a1
/*
@@ -167,15 +169,15 @@ secondary_start_sbi:
la a3, .Lsecondary_park
csrw CSR_TVEC, a3
- slli a3, a0, LGREG
- la a4, __cpu_up_stack_pointer
- XIP_FIXUP_OFFSET a4
- la a5, __cpu_up_task_pointer
- XIP_FIXUP_OFFSET a5
- add a4, a3, a4
- add a5, a3, a5
- REG_L sp, (a4)
- REG_L tp, (a5)
+ /* a0 contains the hartid & a1 contains boot data */
+ li a2, SBI_HART_BOOT_TASK_PTR_OFFSET
+ XIP_FIXUP_OFFSET a2
+ add a2, a2, a1
+ REG_L tp, (a2)
+ li a3, SBI_HART_BOOT_STACK_PTR_OFFSET
+ XIP_FIXUP_OFFSET a3
+ add a3, a3, a1
+ REG_L sp, (a3)
.Lsecondary_start_common:
@@ -257,13 +259,13 @@ pmp_done:
li t0, SR_FS
csrc CSR_STATUS, t0
-#ifdef CONFIG_SMP
+#ifdef CONFIG_RISCV_BOOT_SPINWAIT
li t0, CONFIG_NR_CPUS
blt a0, t0, .Lgood_cores
tail .Lsecondary_park
.Lgood_cores:
-#endif
+ /* The lottery system is only required for spinwait booting method */
#ifndef CONFIG_XIP_KERNEL
/* Pick one hart to run the main boot sequence */
la a3, hart_lottery
@@ -282,6 +284,10 @@ pmp_done:
/* first time here if hart_lottery in RAM is not set */
beq t0, t1, .Lsecondary_start
+#endif /* CONFIG_XIP */
+#endif /* CONFIG_RISCV_BOOT_SPINWAIT */
+
+#ifdef CONFIG_XIP_KERNEL
la sp, _end + THREAD_SIZE
XIP_FIXUP_OFFSET sp
mv s0, a0
@@ -338,16 +344,16 @@ clear_bss_done:
call soc_early_init
tail start_kernel
+#if CONFIG_RISCV_BOOT_SPINWAIT
.Lsecondary_start:
-#ifdef CONFIG_SMP
/* Set trap vector to spin forever to help debug */
la a3, .Lsecondary_park
csrw CSR_TVEC, a3
slli a3, a0, LGREG
- la a1, __cpu_up_stack_pointer
+ la a1, __cpu_spinwait_stack_pointer
XIP_FIXUP_OFFSET a1
- la a2, __cpu_up_task_pointer
+ la a2, __cpu_spinwait_task_pointer
XIP_FIXUP_OFFSET a2
add a1, a3, a1
add a2, a3, a2
@@ -365,7 +371,7 @@ clear_bss_done:
fence
tail .Lsecondary_start_common
-#endif
+#endif /* CONFIG_RISCV_BOOT_SPINWAIT */
END(_start_kernel)
diff --git a/arch/riscv/kernel/head.h b/arch/riscv/kernel/head.h
index aabbc3ac3e48..726731ada534 100644
--- a/arch/riscv/kernel/head.h
+++ b/arch/riscv/kernel/head.h
@@ -16,7 +16,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa);
asmlinkage void __init __copy_data(void);
#endif
-extern void *__cpu_up_stack_pointer[];
-extern void *__cpu_up_task_pointer[];
+#ifdef CONFIG_RISCV_BOOT_SPINWAIT
+extern void *__cpu_spinwait_stack_pointer[];
+extern void *__cpu_spinwait_task_pointer[];
+#endif
#endif /* __ASM_HEAD_H */
diff --git a/arch/riscv/kernel/ptrace.c b/arch/riscv/kernel/ptrace.c
index 9c0511119bad..a89243730153 100644
--- a/arch/riscv/kernel/ptrace.c
+++ b/arch/riscv/kernel/ptrace.c
@@ -42,12 +42,10 @@ static int riscv_gpr_set(struct task_struct *target,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
- int ret;
struct pt_regs *regs;
regs = task_pt_regs(target);
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1);
- return ret;
+ return user_regset_copyin(&pos, &count, &kbuf, &ubuf, regs, 0, -1);
}
#ifdef CONFIG_FPU
diff --git a/arch/riscv/kernel/sbi.c b/arch/riscv/kernel/sbi.c
index 9a84f0cb5175..f72527fcb347 100644
--- a/arch/riscv/kernel/sbi.c
+++ b/arch/riscv/kernel/sbi.c
@@ -16,8 +16,8 @@ unsigned long sbi_spec_version __ro_after_init = SBI_SPEC_VERSION_DEFAULT;
EXPORT_SYMBOL(sbi_spec_version);
static void (*__sbi_set_timer)(uint64_t stime) __ro_after_init;
-static int (*__sbi_send_ipi)(const unsigned long *hart_mask) __ro_after_init;
-static int (*__sbi_rfence)(int fid, const unsigned long *hart_mask,
+static int (*__sbi_send_ipi)(const struct cpumask *cpu_mask) __ro_after_init;
+static int (*__sbi_rfence)(int fid, const struct cpumask *cpu_mask,
unsigned long start, unsigned long size,
unsigned long arg4, unsigned long arg5) __ro_after_init;
@@ -67,6 +67,30 @@ int sbi_err_map_linux_errno(int err)
EXPORT_SYMBOL(sbi_err_map_linux_errno);
#ifdef CONFIG_RISCV_SBI_V01
+static unsigned long __sbi_v01_cpumask_to_hartmask(const struct cpumask *cpu_mask)
+{
+ unsigned long cpuid, hartid;
+ unsigned long hmask = 0;
+
+ /*
+ * There is no maximum hartid concept in RISC-V and NR_CPUS must not be
+ * associated with hartid. As SBI v0.1 is only kept for backward compatibility
+ * and will be removed in the future, there is no point in supporting hartid
+ * greater than BITS_PER_LONG (32 for RV32 and 64 for RV64). Ideally, SBI v0.2
+ * should be used for platforms with hartid greater than BITS_PER_LONG.
+ */
+ for_each_cpu(cpuid, cpu_mask) {
+ hartid = cpuid_to_hartid_map(cpuid);
+ if (hartid >= BITS_PER_LONG) {
+ pr_warn("Unable to send any request to hartid > BITS_PER_LONG for SBI v0.1\n");
+ break;
+ }
+ hmask |= 1 << hartid;
+ }
+
+ return hmask;
+}
+
/**
* sbi_console_putchar() - Writes given character to the console device.
* @ch: The data to be written to the console.
@@ -132,33 +156,44 @@ static void __sbi_set_timer_v01(uint64_t stime_value)
#endif
}
-static int __sbi_send_ipi_v01(const unsigned long *hart_mask)
+static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask)
{
- sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)hart_mask,
+ unsigned long hart_mask;
+
+ if (!cpu_mask)
+ cpu_mask = cpu_online_mask;
+ hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask);
+
+ sbi_ecall(SBI_EXT_0_1_SEND_IPI, 0, (unsigned long)(&hart_mask),
0, 0, 0, 0, 0);
return 0;
}
-static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask,
+static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask,
unsigned long start, unsigned long size,
unsigned long arg4, unsigned long arg5)
{
int result = 0;
+ unsigned long hart_mask;
+
+ if (!cpu_mask)
+ cpu_mask = cpu_online_mask;
+ hart_mask = __sbi_v01_cpumask_to_hartmask(cpu_mask);
/* v0.2 function IDs are equivalent to v0.1 extension IDs */
switch (fid) {
case SBI_EXT_RFENCE_REMOTE_FENCE_I:
sbi_ecall(SBI_EXT_0_1_REMOTE_FENCE_I, 0,
- (unsigned long)hart_mask, 0, 0, 0, 0, 0);
+ (unsigned long)&hart_mask, 0, 0, 0, 0, 0);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA, 0,
- (unsigned long)hart_mask, start, size,
+ (unsigned long)&hart_mask, start, size,
0, 0, 0);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
sbi_ecall(SBI_EXT_0_1_REMOTE_SFENCE_VMA_ASID, 0,
- (unsigned long)hart_mask, start, size,
+ (unsigned long)&hart_mask, start, size,
arg4, 0, 0);
break;
default:
@@ -180,7 +215,7 @@ static void __sbi_set_timer_v01(uint64_t stime_value)
sbi_major_version(), sbi_minor_version());
}
-static int __sbi_send_ipi_v01(const unsigned long *hart_mask)
+static int __sbi_send_ipi_v01(const struct cpumask *cpu_mask)
{
pr_warn("IPI extension is not available in SBI v%lu.%lu\n",
sbi_major_version(), sbi_minor_version());
@@ -188,7 +223,7 @@ static int __sbi_send_ipi_v01(const unsigned long *hart_mask)
return 0;
}
-static int __sbi_rfence_v01(int fid, const unsigned long *hart_mask,
+static int __sbi_rfence_v01(int fid, const struct cpumask *cpu_mask,
unsigned long start, unsigned long size,
unsigned long arg4, unsigned long arg5)
{
@@ -212,37 +247,33 @@ static void __sbi_set_timer_v02(uint64_t stime_value)
#endif
}
-static int __sbi_send_ipi_v02(const unsigned long *hart_mask)
+static int __sbi_send_ipi_v02(const struct cpumask *cpu_mask)
{
- unsigned long hartid, hmask_val, hbase;
- struct cpumask tmask;
+ unsigned long hartid, cpuid, hmask = 0, hbase = 0;
struct sbiret ret = {0};
int result;
- if (!hart_mask || !(*hart_mask)) {
- riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask);
- hart_mask = cpumask_bits(&tmask);
- }
+ if (!cpu_mask)
+ cpu_mask = cpu_online_mask;
- hmask_val = 0;
- hbase = 0;
- for_each_set_bit(hartid, hart_mask, NR_CPUS) {
- if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) {
+ for_each_cpu(cpuid, cpu_mask) {
+ hartid = cpuid_to_hartid_map(cpuid);
+ if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) {
ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI,
- hmask_val, hbase, 0, 0, 0, 0);
+ hmask, hbase, 0, 0, 0, 0);
if (ret.error)
goto ecall_failed;
- hmask_val = 0;
+ hmask = 0;
hbase = 0;
}
- if (!hmask_val)
+ if (!hmask)
hbase = hartid;
- hmask_val |= 1UL << (hartid - hbase);
+ hmask |= 1UL << (hartid - hbase);
}
- if (hmask_val) {
+ if (hmask) {
ret = sbi_ecall(SBI_EXT_IPI, SBI_EXT_IPI_SEND_IPI,
- hmask_val, hbase, 0, 0, 0, 0);
+ hmask, hbase, 0, 0, 0, 0);
if (ret.error)
goto ecall_failed;
}
@@ -252,11 +283,11 @@ static int __sbi_send_ipi_v02(const unsigned long *hart_mask)
ecall_failed:
result = sbi_err_map_linux_errno(ret.error);
pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n",
- __func__, hbase, hmask_val, result);
+ __func__, hbase, hmask, result);
return result;
}
-static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val,
+static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask,
unsigned long hbase, unsigned long start,
unsigned long size, unsigned long arg4,
unsigned long arg5)
@@ -267,31 +298,31 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val,
switch (fid) {
case SBI_EXT_RFENCE_REMOTE_FENCE_I:
- ret = sbi_ecall(ext, fid, hmask_val, hbase, 0, 0, 0, 0);
+ ret = sbi_ecall(ext, fid, hmask, hbase, 0, 0, 0, 0);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
- ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ ret = sbi_ecall(ext, fid, hmask, hbase, start,
size, 0, 0);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
- ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ ret = sbi_ecall(ext, fid, hmask, hbase, start,
size, arg4, 0);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
- ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ ret = sbi_ecall(ext, fid, hmask, hbase, start,
size, 0, 0);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID:
- ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ ret = sbi_ecall(ext, fid, hmask, hbase, start,
size, arg4, 0);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA:
- ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ ret = sbi_ecall(ext, fid, hmask, hbase, start,
size, 0, 0);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID:
- ret = sbi_ecall(ext, fid, hmask_val, hbase, start,
+ ret = sbi_ecall(ext, fid, hmask, hbase, start,
size, arg4, 0);
break;
default:
@@ -303,43 +334,39 @@ static int __sbi_rfence_v02_call(unsigned long fid, unsigned long hmask_val,
if (ret.error) {
result = sbi_err_map_linux_errno(ret.error);
pr_err("%s: hbase = [%lu] hmask = [0x%lx] failed (error [%d])\n",
- __func__, hbase, hmask_val, result);
+ __func__, hbase, hmask, result);
}
return result;
}
-static int __sbi_rfence_v02(int fid, const unsigned long *hart_mask,
+static int __sbi_rfence_v02(int fid, const struct cpumask *cpu_mask,
unsigned long start, unsigned long size,
unsigned long arg4, unsigned long arg5)
{
- unsigned long hmask_val, hartid, hbase;
- struct cpumask tmask;
+ unsigned long hartid, cpuid, hmask = 0, hbase = 0;
int result;
- if (!hart_mask || !(*hart_mask)) {
- riscv_cpuid_to_hartid_mask(cpu_online_mask, &tmask);
- hart_mask = cpumask_bits(&tmask);
- }
+ if (!cpu_mask)
+ cpu_mask = cpu_online_mask;
- hmask_val = 0;
- hbase = 0;
- for_each_set_bit(hartid, hart_mask, NR_CPUS) {
- if (hmask_val && ((hbase + BITS_PER_LONG) <= hartid)) {
- result = __sbi_rfence_v02_call(fid, hmask_val, hbase,
+ for_each_cpu(cpuid, cpu_mask) {
+ hartid = cpuid_to_hartid_map(cpuid);
+ if (hmask && ((hbase + BITS_PER_LONG) <= hartid)) {
+ result = __sbi_rfence_v02_call(fid, hmask, hbase,
start, size, arg4, arg5);
if (result)
return result;
- hmask_val = 0;
+ hmask = 0;
hbase = 0;
}
- if (!hmask_val)
+ if (!hmask)
hbase = hartid;
- hmask_val |= 1UL << (hartid - hbase);
+ hmask |= 1UL << (hartid - hbase);
}
- if (hmask_val) {
- result = __sbi_rfence_v02_call(fid, hmask_val, hbase,
+ if (hmask) {
+ result = __sbi_rfence_v02_call(fid, hmask, hbase,
start, size, arg4, arg5);
if (result)
return result;
@@ -361,44 +388,44 @@ void sbi_set_timer(uint64_t stime_value)
/**
* sbi_send_ipi() - Send an IPI to any hart.
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
*
* Return: 0 on success, appropriate linux error code otherwise.
*/
-int sbi_send_ipi(const unsigned long *hart_mask)
+int sbi_send_ipi(const struct cpumask *cpu_mask)
{
- return __sbi_send_ipi(hart_mask);
+ return __sbi_send_ipi(cpu_mask);
}
EXPORT_SYMBOL(sbi_send_ipi);
/**
* sbi_remote_fence_i() - Execute FENCE.I instruction on given remote harts.
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
*
* Return: 0 on success, appropriate linux error code otherwise.
*/
-int sbi_remote_fence_i(const unsigned long *hart_mask)
+int sbi_remote_fence_i(const struct cpumask *cpu_mask)
{
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_FENCE_I,
- hart_mask, 0, 0, 0, 0);
+ cpu_mask, 0, 0, 0, 0);
}
EXPORT_SYMBOL(sbi_remote_fence_i);
/**
* sbi_remote_sfence_vma() - Execute SFENCE.VMA instructions on given remote
* harts for the specified virtual address range.
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
* @start: Start of the virtual address
* @size: Total size of the virtual address range.
*
* Return: 0 on success, appropriate linux error code otherwise.
*/
-int sbi_remote_sfence_vma(const unsigned long *hart_mask,
+int sbi_remote_sfence_vma(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size)
{
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA,
- hart_mask, start, size, 0, 0);
+ cpu_mask, start, size, 0, 0);
}
EXPORT_SYMBOL(sbi_remote_sfence_vma);
@@ -406,38 +433,38 @@ EXPORT_SYMBOL(sbi_remote_sfence_vma);
* sbi_remote_sfence_vma_asid() - Execute SFENCE.VMA instructions on given
* remote harts for a virtual address range belonging to a specific ASID.
*
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
* @start: Start of the virtual address
* @size: Total size of the virtual address range.
* @asid: The value of address space identifier (ASID).
*
* Return: 0 on success, appropriate linux error code otherwise.
*/
-int sbi_remote_sfence_vma_asid(const unsigned long *hart_mask,
+int sbi_remote_sfence_vma_asid(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size,
unsigned long asid)
{
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID,
- hart_mask, start, size, asid, 0);
+ cpu_mask, start, size, asid, 0);
}
EXPORT_SYMBOL(sbi_remote_sfence_vma_asid);
/**
* sbi_remote_hfence_gvma() - Execute HFENCE.GVMA instructions on given remote
* harts for the specified guest physical address range.
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
* @start: Start of the guest physical address
* @size: Total size of the guest physical address range.
*
* Return: None
*/
-int sbi_remote_hfence_gvma(const unsigned long *hart_mask,
+int sbi_remote_hfence_gvma(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size)
{
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA,
- hart_mask, start, size, 0, 0);
+ cpu_mask, start, size, 0, 0);
}
EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma);
@@ -445,38 +472,38 @@ EXPORT_SYMBOL_GPL(sbi_remote_hfence_gvma);
* sbi_remote_hfence_gvma_vmid() - Execute HFENCE.GVMA instructions on given
* remote harts for a guest physical address range belonging to a specific VMID.
*
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
* @start: Start of the guest physical address
* @size: Total size of the guest physical address range.
* @vmid: The value of guest ID (VMID).
*
* Return: 0 if success, Error otherwise.
*/
-int sbi_remote_hfence_gvma_vmid(const unsigned long *hart_mask,
+int sbi_remote_hfence_gvma_vmid(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size,
unsigned long vmid)
{
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA_VMID,
- hart_mask, start, size, vmid, 0);
+ cpu_mask, start, size, vmid, 0);
}
EXPORT_SYMBOL(sbi_remote_hfence_gvma_vmid);
/**
* sbi_remote_hfence_vvma() - Execute HFENCE.VVMA instructions on given remote
* harts for the current guest virtual address range.
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
* @start: Start of the current guest virtual address
* @size: Total size of the current guest virtual address range.
*
* Return: None
*/
-int sbi_remote_hfence_vvma(const unsigned long *hart_mask,
+int sbi_remote_hfence_vvma(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size)
{
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA,
- hart_mask, start, size, 0, 0);
+ cpu_mask, start, size, 0, 0);
}
EXPORT_SYMBOL(sbi_remote_hfence_vvma);
@@ -485,20 +512,20 @@ EXPORT_SYMBOL(sbi_remote_hfence_vvma);
* remote harts for current guest virtual address range belonging to a specific
* ASID.
*
- * @hart_mask: A cpu mask containing all the target harts.
+ * @cpu_mask: A cpu mask containing all the target harts.
* @start: Start of the current guest virtual address
* @size: Total size of the current guest virtual address range.
* @asid: The value of address space identifier (ASID).
*
* Return: None
*/
-int sbi_remote_hfence_vvma_asid(const unsigned long *hart_mask,
+int sbi_remote_hfence_vvma_asid(const struct cpumask *cpu_mask,
unsigned long start,
unsigned long size,
unsigned long asid)
{
return __sbi_rfence(SBI_EXT_RFENCE_REMOTE_HFENCE_VVMA_ASID,
- hart_mask, start, size, asid, 0);
+ cpu_mask, start, size, asid, 0);
}
EXPORT_SYMBOL(sbi_remote_hfence_vvma_asid);
@@ -591,11 +618,7 @@ long sbi_get_mimpid(void)
static void sbi_send_cpumask_ipi(const struct cpumask *target)
{
- struct cpumask hartid_mask;
-
- riscv_cpuid_to_hartid_mask(target, &hartid_mask);
-
- sbi_send_ipi(cpumask_bits(&hartid_mask));
+ sbi_send_ipi(target);
}
static const struct riscv_ipi_ops sbi_ipi_ops = {
diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c
index 63241abe84eb..b42bfdc67482 100644
--- a/arch/riscv/kernel/setup.c
+++ b/arch/riscv/kernel/setup.c
@@ -59,16 +59,6 @@ atomic_t hart_lottery __section(".sdata")
unsigned long boot_cpu_hartid;
static DEFINE_PER_CPU(struct cpu, cpu_devices);
-void riscv_cpuid_to_hartid_mask(const struct cpumask *in, struct cpumask *out)
-{
- int cpu;
-
- cpumask_clear(out);
- for_each_cpu(cpu, in)
- cpumask_set_cpu(cpuid_to_hartid_map(cpu), out);
-}
-EXPORT_SYMBOL_GPL(riscv_cpuid_to_hartid_mask);
-
/*
* Place kernel memory regions on the resource tree so that
* kexec-tools can retrieve them from /proc/iomem. While there
diff --git a/arch/riscv/kernel/smpboot.c b/arch/riscv/kernel/smpboot.c
index bd82375db51a..622f226454d5 100644
--- a/arch/riscv/kernel/smpboot.c
+++ b/arch/riscv/kernel/smpboot.c
@@ -96,7 +96,7 @@ void __init setup_smp(void)
if (cpuid >= NR_CPUS) {
pr_warn("Invalid cpuid [%d] for hartid [%d]\n",
cpuid, hart);
- break;
+ continue;
}
cpuid_to_hartid_map(cpuid) = hart;
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 9af67dbdc66a..f80a34fbf102 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -114,7 +114,6 @@ static bool stage2_get_leaf_entry(struct kvm *kvm, gpa_t addr,
static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
{
- struct cpumask hmask;
unsigned long size = PAGE_SIZE;
struct kvm_vmid *vmid = &kvm->arch.vmid;
@@ -127,8 +126,7 @@ static void stage2_remote_tlb_flush(struct kvm *kvm, u32 level, gpa_t addr)
* where the Guest/VM is running.
*/
preempt_disable();
- riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask);
- sbi_remote_hfence_gvma_vmid(cpumask_bits(&hmask), addr, size,
+ sbi_remote_hfence_gvma_vmid(cpu_online_mask, addr, size,
READ_ONCE(vmid->vmid));
preempt_enable();
}
diff --git a/arch/riscv/kvm/vcpu_sbi_replace.c b/arch/riscv/kvm/vcpu_sbi_replace.c
index 00036b7f83b9..1bc0608a5bfd 100644
--- a/arch/riscv/kvm/vcpu_sbi_replace.c
+++ b/arch/riscv/kvm/vcpu_sbi_replace.c
@@ -82,7 +82,7 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
{
int ret = 0;
unsigned long i;
- struct cpumask cm, hm;
+ struct cpumask cm;
struct kvm_vcpu *tmp;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
unsigned long hmask = cp->a0;
@@ -90,7 +90,6 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
unsigned long funcid = cp->a6;
cpumask_clear(&cm);
- cpumask_clear(&hm);
kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
if (hbase != -1UL) {
if (tmp->vcpu_id < hbase)
@@ -103,17 +102,15 @@ static int kvm_sbi_ext_rfence_handler(struct kvm_vcpu *vcpu, struct kvm_run *run
cpumask_set_cpu(tmp->cpu, &cm);
}
- riscv_cpuid_to_hartid_mask(&cm, &hm);
-
switch (funcid) {
case SBI_EXT_RFENCE_REMOTE_FENCE_I:
- ret = sbi_remote_fence_i(cpumask_bits(&hm));
+ ret = sbi_remote_fence_i(&cm);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA:
- ret = sbi_remote_hfence_vvma(cpumask_bits(&hm), cp->a2, cp->a3);
+ ret = sbi_remote_hfence_vvma(&cm, cp->a2, cp->a3);
break;
case SBI_EXT_RFENCE_REMOTE_SFENCE_VMA_ASID:
- ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm), cp->a2,
+ ret = sbi_remote_hfence_vvma_asid(&cm, cp->a2,
cp->a3, cp->a4);
break;
case SBI_EXT_RFENCE_REMOTE_HFENCE_GVMA:
diff --git a/arch/riscv/kvm/vcpu_sbi_v01.c b/arch/riscv/kvm/vcpu_sbi_v01.c
index 4c7e13ec9ccc..07e2de14433a 100644
--- a/arch/riscv/kvm/vcpu_sbi_v01.c
+++ b/arch/riscv/kvm/vcpu_sbi_v01.c
@@ -38,7 +38,7 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
int i, ret = 0;
u64 next_cycle;
struct kvm_vcpu *rvcpu;
- struct cpumask cm, hm;
+ struct cpumask cm;
struct kvm *kvm = vcpu->kvm;
struct kvm_cpu_context *cp = &vcpu->arch.guest_context;
@@ -101,15 +101,12 @@ static int kvm_sbi_ext_v01_handler(struct kvm_vcpu *vcpu, struct kvm_run *run,
continue;
cpumask_set_cpu(rvcpu->cpu, &cm);
}
- riscv_cpuid_to_hartid_mask(&cm, &hm);
if (cp->a7 == SBI_EXT_0_1_REMOTE_FENCE_I)
- ret = sbi_remote_fence_i(cpumask_bits(&hm));
+ ret = sbi_remote_fence_i(&cm);
else if (cp->a7 == SBI_EXT_0_1_REMOTE_SFENCE_VMA)
- ret = sbi_remote_hfence_vvma(cpumask_bits(&hm),
- cp->a1, cp->a2);
+ ret = sbi_remote_hfence_vvma(&cm, cp->a1, cp->a2);
else
- ret = sbi_remote_hfence_vvma_asid(cpumask_bits(&hm),
- cp->a1, cp->a2, cp->a3);
+ ret = sbi_remote_hfence_vvma_asid(&cm, cp->a1, cp->a2, cp->a3);
break;
default:
ret = -EINVAL;
diff --git a/arch/riscv/kvm/vmid.c b/arch/riscv/kvm/vmid.c
index 807228f8f409..2fa4f7b1813d 100644
--- a/arch/riscv/kvm/vmid.c
+++ b/arch/riscv/kvm/vmid.c
@@ -67,7 +67,6 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu)
{
unsigned long i;
struct kvm_vcpu *v;
- struct cpumask hmask;
struct kvm_vmid *vmid = &vcpu->kvm->arch.vmid;
if (!kvm_riscv_stage2_vmid_ver_changed(vmid))
@@ -102,8 +101,7 @@ void kvm_riscv_stage2_vmid_update(struct kvm_vcpu *vcpu)
* running, we force VM exits on all host CPUs using IPI and
* flush all Guest TLBs.
*/
- riscv_cpuid_to_hartid_mask(cpu_online_mask, &hmask);
- sbi_remote_hfence_gvma(cpumask_bits(&hmask), 0, 0);
+ sbi_remote_hfence_gvma(cpu_online_mask, 0, 0);
}
vmid->vmid = vmid_next;
diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index 89f81067e09e..6cb7d96ad9c7 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -67,10 +67,7 @@ void flush_icache_mm(struct mm_struct *mm, bool local)
*/
smp_mb();
} else if (IS_ENABLED(CONFIG_RISCV_SBI)) {
- cpumask_t hartid_mask;
-
- riscv_cpuid_to_hartid_mask(&others, &hartid_mask);
- sbi_remote_fence_i(cpumask_bits(&hartid_mask));
+ sbi_remote_fence_i(&others);
} else {
on_each_cpu_mask(&others, ipi_remote_fence_i, NULL, 1);
}
diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c
index ea54cc0c9106..7acbfbd14557 100644
--- a/arch/riscv/mm/context.c
+++ b/arch/riscv/mm/context.c
@@ -192,7 +192,7 @@ static void set_mm_asid(struct mm_struct *mm, unsigned int cpu)
switch_mm_fast:
csr_write(CSR_SATP, virt_to_pfn(mm->pgd) |
((cntx & asid_mask) << SATP_ASID_SHIFT) |
- SATP_MODE);
+ satp_mode);
if (need_flush_tlb)
local_flush_tlb_all();
@@ -201,7 +201,7 @@ switch_mm_fast:
static void set_mm_noasid(struct mm_struct *mm)
{
/* Switch the page table and blindly nuke entire local TLB */
- csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | SATP_MODE);
+ csr_write(CSR_SATP, virt_to_pfn(mm->pgd) | satp_mode);
local_flush_tlb_all();
}
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 0624c68331d8..cf4d018b7d66 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -37,13 +37,19 @@ EXPORT_SYMBOL(kernel_map);
#define kernel_map (*(struct kernel_mapping *)XIP_FIXUP(&kernel_map))
#endif
+#ifdef CONFIG_64BIT
+u64 satp_mode = !IS_ENABLED(CONFIG_XIP_KERNEL) ? SATP_MODE_48 : SATP_MODE_39;
+#else
+u64 satp_mode = SATP_MODE_32;
+#endif
+EXPORT_SYMBOL(satp_mode);
+
+bool pgtable_l4_enabled = IS_ENABLED(CONFIG_64BIT) && !IS_ENABLED(CONFIG_XIP_KERNEL);
+EXPORT_SYMBOL(pgtable_l4_enabled);
+
phys_addr_t phys_ram_base __ro_after_init;
EXPORT_SYMBOL(phys_ram_base);
-#ifdef CONFIG_XIP_KERNEL
-extern char _xiprom[], _exiprom[], __data_loc;
-#endif
-
unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]
__page_aligned_bss;
EXPORT_SYMBOL(empty_zero_page);
@@ -53,15 +59,6 @@ extern char _start[];
void *_dtb_early_va __initdata;
uintptr_t _dtb_early_pa __initdata;
-struct pt_alloc_ops {
- pte_t *(*get_pte_virt)(phys_addr_t pa);
- phys_addr_t (*alloc_pte)(uintptr_t va);
-#ifndef __PAGETABLE_PMD_FOLDED
- pmd_t *(*get_pmd_virt)(phys_addr_t pa);
- phys_addr_t (*alloc_pmd)(uintptr_t va);
-#endif
-};
-
static phys_addr_t dma32_phys_limit __initdata;
static void __init zone_sizes_init(void)
@@ -102,10 +99,14 @@ static void __init print_vm_layout(void)
(unsigned long)VMALLOC_END);
print_mlm("lowmem", (unsigned long)PAGE_OFFSET,
(unsigned long)high_memory);
-#ifdef CONFIG_64BIT
- print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR,
- (unsigned long)ADDRESS_SPACE_END);
+ if (IS_ENABLED(CONFIG_64BIT)) {
+#ifdef CONFIG_KASAN
+ print_mlm("kasan", KASAN_SHADOW_START, KASAN_SHADOW_END);
#endif
+
+ print_mlm("kernel", (unsigned long)KERNEL_LINK_ADDR,
+ (unsigned long)ADDRESS_SPACE_END);
+ }
}
#else
static void print_vm_layout(void) { }
@@ -130,18 +131,8 @@ void __init mem_init(void)
print_vm_layout();
}
-/*
- * The default maximal physical memory size is -PAGE_OFFSET for 32-bit kernel,
- * whereas for 64-bit kernel, the end of the virtual address space is occupied
- * by the modules/BPF/kernel mappings which reduces the available size of the
- * linear mapping.
- * Limit the memory size via mem.
- */
-#ifdef CONFIG_64BIT
-static phys_addr_t memory_limit = -PAGE_OFFSET - SZ_4G;
-#else
-static phys_addr_t memory_limit = -PAGE_OFFSET;
-#endif
+/* Limit the memory size via mem. */
+static phys_addr_t memory_limit;
static int __init early_mem(char *p)
{
@@ -162,35 +153,31 @@ early_param("mem", early_mem);
static void __init setup_bootmem(void)
{
phys_addr_t vmlinux_end = __pa_symbol(&_end);
- phys_addr_t vmlinux_start = __pa_symbol(&_start);
- phys_addr_t __maybe_unused max_mapped_addr;
- phys_addr_t phys_ram_end;
+ phys_addr_t max_mapped_addr;
+ phys_addr_t phys_ram_end, vmlinux_start;
-#ifdef CONFIG_XIP_KERNEL
- vmlinux_start = __pa_symbol(&_sdata);
-#endif
+ if (IS_ENABLED(CONFIG_XIP_KERNEL))
+ vmlinux_start = __pa_symbol(&_sdata);
+ else
+ vmlinux_start = __pa_symbol(&_start);
memblock_enforce_memory_limit(memory_limit);
/*
- * Reserve from the start of the kernel to the end of the kernel
- */
-#if defined(CONFIG_64BIT) && defined(CONFIG_STRICT_KERNEL_RWX)
- /*
* Make sure we align the reservation on PMD_SIZE since we will
* map the kernel in the linear mapping as read-only: we do not want
* any allocation to happen between _end and the next pmd aligned page.
*/
- vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK;
-#endif
+ if (IS_ENABLED(CONFIG_64BIT) && IS_ENABLED(CONFIG_STRICT_KERNEL_RWX))
+ vmlinux_end = (vmlinux_end + PMD_SIZE - 1) & PMD_MASK;
+ /*
+ * Reserve from the start of the kernel to the end of the kernel
+ */
memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start);
-
phys_ram_end = memblock_end_of_DRAM();
-#ifndef CONFIG_XIP_KERNEL
- phys_ram_base = memblock_start_of_DRAM();
-#endif
-#ifndef CONFIG_64BIT
+ if (!IS_ENABLED(CONFIG_XIP_KERNEL))
+ phys_ram_base = memblock_start_of_DRAM();
/*
* memblock allocator is not aware of the fact that last 4K bytes of
* the addressable memory can not be mapped because of IS_ERR_VALUE
@@ -200,10 +187,11 @@ static void __init setup_bootmem(void)
* address space is occupied by the kernel mapping then this check must
* be done as soon as the kernel mapping base address is determined.
*/
- max_mapped_addr = __pa(~(ulong)0);
- if (max_mapped_addr == (phys_ram_end - 1))
- memblock_set_current_limit(max_mapped_addr - 4096);
-#endif
+ if (!IS_ENABLED(CONFIG_64BIT)) {
+ max_mapped_addr = __pa(~(ulong)0);
+ if (max_mapped_addr == (phys_ram_end - 1))
+ memblock_set_current_limit(max_mapped_addr - 4096);
+ }
min_low_pfn = PFN_UP(phys_ram_base);
max_low_pfn = max_pfn = PFN_DOWN(phys_ram_end);
@@ -229,13 +217,7 @@ static void __init setup_bootmem(void)
}
#ifdef CONFIG_MMU
-static struct pt_alloc_ops _pt_ops __initdata;
-
-#ifdef CONFIG_XIP_KERNEL
-#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&_pt_ops))
-#else
-#define pt_ops _pt_ops
-#endif
+struct pt_alloc_ops pt_ops __initdata;
unsigned long riscv_pfn_base __ro_after_init;
EXPORT_SYMBOL(riscv_pfn_base);
@@ -245,9 +227,11 @@ pgd_t trampoline_pg_dir[PTRS_PER_PGD] __page_aligned_bss;
static pte_t fixmap_pte[PTRS_PER_PTE] __page_aligned_bss;
pgd_t early_pg_dir[PTRS_PER_PGD] __initdata __aligned(PAGE_SIZE);
+static pud_t __maybe_unused early_dtb_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
static pmd_t __maybe_unused early_dtb_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
#ifdef CONFIG_XIP_KERNEL
+#define pt_ops (*(struct pt_alloc_ops *)XIP_FIXUP(&pt_ops))
#define trampoline_pg_dir ((pgd_t *)XIP_FIXUP(trampoline_pg_dir))
#define fixmap_pte ((pte_t *)XIP_FIXUP(fixmap_pte))
#define early_pg_dir ((pgd_t *)XIP_FIXUP(early_pg_dir))
@@ -333,6 +317,16 @@ static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
#define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd))
#endif /* CONFIG_XIP_KERNEL */
+static pud_t trampoline_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pud_t fixmap_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pud_t early_pud[PTRS_PER_PUD] __initdata __aligned(PAGE_SIZE);
+
+#ifdef CONFIG_XIP_KERNEL
+#define trampoline_pud ((pud_t *)XIP_FIXUP(trampoline_pud))
+#define fixmap_pud ((pud_t *)XIP_FIXUP(fixmap_pud))
+#define early_pud ((pud_t *)XIP_FIXUP(early_pud))
+#endif /* CONFIG_XIP_KERNEL */
+
static pmd_t *__init get_pmd_virt_early(phys_addr_t pa)
{
/* Before MMU is enabled */
@@ -352,7 +346,7 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
static phys_addr_t __init alloc_pmd_early(uintptr_t va)
{
- BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
+ BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT);
return (uintptr_t)early_pmd;
}
@@ -399,21 +393,97 @@ static void __init create_pmd_mapping(pmd_t *pmdp,
create_pte_mapping(ptep, va, pa, sz, prot);
}
-#define pgd_next_t pmd_t
-#define alloc_pgd_next(__va) pt_ops.alloc_pmd(__va)
-#define get_pgd_next_virt(__pa) pt_ops.get_pmd_virt(__pa)
+static pud_t *__init get_pud_virt_early(phys_addr_t pa)
+{
+ return (pud_t *)((uintptr_t)pa);
+}
+
+static pud_t *__init get_pud_virt_fixmap(phys_addr_t pa)
+{
+ clear_fixmap(FIX_PUD);
+ return (pud_t *)set_fixmap_offset(FIX_PUD, pa);
+}
+
+static pud_t *__init get_pud_virt_late(phys_addr_t pa)
+{
+ return (pud_t *)__va(pa);
+}
+
+static phys_addr_t __init alloc_pud_early(uintptr_t va)
+{
+ /* Only one PUD is available for early mapping */
+ BUG_ON((va - kernel_map.virt_addr) >> PGDIR_SHIFT);
+
+ return (uintptr_t)early_pud;
+}
+
+static phys_addr_t __init alloc_pud_fixmap(uintptr_t va)
+{
+ return memblock_phys_alloc(PAGE_SIZE, PAGE_SIZE);
+}
+
+static phys_addr_t alloc_pud_late(uintptr_t va)
+{
+ unsigned long vaddr;
+
+ vaddr = __get_free_page(GFP_KERNEL);
+ BUG_ON(!vaddr);
+ return __pa(vaddr);
+}
+
+static void __init create_pud_mapping(pud_t *pudp,
+ uintptr_t va, phys_addr_t pa,
+ phys_addr_t sz, pgprot_t prot)
+{
+ pmd_t *nextp;
+ phys_addr_t next_phys;
+ uintptr_t pud_index = pud_index(va);
+
+ if (sz == PUD_SIZE) {
+ if (pud_val(pudp[pud_index]) == 0)
+ pudp[pud_index] = pfn_pud(PFN_DOWN(pa), prot);
+ return;
+ }
+
+ if (pud_val(pudp[pud_index]) == 0) {
+ next_phys = pt_ops.alloc_pmd(va);
+ pudp[pud_index] = pfn_pud(PFN_DOWN(next_phys), PAGE_TABLE);
+ nextp = pt_ops.get_pmd_virt(next_phys);
+ memset(nextp, 0, PAGE_SIZE);
+ } else {
+ next_phys = PFN_PHYS(_pud_pfn(pudp[pud_index]));
+ nextp = pt_ops.get_pmd_virt(next_phys);
+ }
+
+ create_pmd_mapping(nextp, va, pa, sz, prot);
+}
+
+#define pgd_next_t pud_t
+#define alloc_pgd_next(__va) (pgtable_l4_enabled ? \
+ pt_ops.alloc_pud(__va) : pt_ops.alloc_pmd(__va))
+#define get_pgd_next_virt(__pa) (pgtable_l4_enabled ? \
+ pt_ops.get_pud_virt(__pa) : (pgd_next_t *)pt_ops.get_pmd_virt(__pa))
#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
- create_pmd_mapping(__nextp, __va, __pa, __sz, __prot)
-#define fixmap_pgd_next fixmap_pmd
+ (pgtable_l4_enabled ? \
+ create_pud_mapping(__nextp, __va, __pa, __sz, __prot) : \
+ create_pmd_mapping((pmd_t *)__nextp, __va, __pa, __sz, __prot))
+#define fixmap_pgd_next (pgtable_l4_enabled ? \
+ (uintptr_t)fixmap_pud : (uintptr_t)fixmap_pmd)
+#define trampoline_pgd_next (pgtable_l4_enabled ? \
+ (uintptr_t)trampoline_pud : (uintptr_t)trampoline_pmd)
+#define early_dtb_pgd_next (pgtable_l4_enabled ? \
+ (uintptr_t)early_dtb_pud : (uintptr_t)early_dtb_pmd)
#else
#define pgd_next_t pte_t
#define alloc_pgd_next(__va) pt_ops.alloc_pte(__va)
#define get_pgd_next_virt(__pa) pt_ops.get_pte_virt(__pa)
#define create_pgd_next_mapping(__nextp, __va, __pa, __sz, __prot) \
create_pte_mapping(__nextp, __va, __pa, __sz, __prot)
-#define fixmap_pgd_next fixmap_pte
+#define fixmap_pgd_next ((uintptr_t)fixmap_pte)
+#define early_dtb_pgd_next ((uintptr_t)early_dtb_pmd)
+#define create_pud_mapping(__pmdp, __va, __pa, __sz, __prot)
#define create_pmd_mapping(__pmdp, __va, __pa, __sz, __prot)
-#endif
+#endif /* __PAGETABLE_PMD_FOLDED */
void __init create_pgd_mapping(pgd_t *pgdp,
uintptr_t va, phys_addr_t pa,
@@ -452,6 +522,8 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size)
}
#ifdef CONFIG_XIP_KERNEL
+extern char _xiprom[], _exiprom[], __data_loc;
+
/* called from head.S with MMU off */
asmlinkage void __init __copy_data(void)
{
@@ -500,6 +572,57 @@ static __init pgprot_t pgprot_from_va(uintptr_t va)
}
#endif /* CONFIG_STRICT_KERNEL_RWX */
+#ifdef CONFIG_64BIT
+static void __init disable_pgtable_l4(void)
+{
+ pgtable_l4_enabled = false;
+ kernel_map.page_offset = PAGE_OFFSET_L3;
+ satp_mode = SATP_MODE_39;
+}
+
+/*
+ * There is a simple way to determine if 4-level is supported by the
+ * underlying hardware: establish 1:1 mapping in 4-level page table mode
+ * then read SATP to see if the configuration was taken into account
+ * meaning sv48 is supported.
+ */
+static __init void set_satp_mode(void)
+{
+ u64 identity_satp, hw_satp;
+ uintptr_t set_satp_mode_pmd;
+
+ set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
+ create_pgd_mapping(early_pg_dir,
+ set_satp_mode_pmd, (uintptr_t)early_pud,
+ PGDIR_SIZE, PAGE_TABLE);
+ create_pud_mapping(early_pud,
+ set_satp_mode_pmd, (uintptr_t)early_pmd,
+ PUD_SIZE, PAGE_TABLE);
+ /* Handle the case where set_satp_mode straddles 2 PMDs */
+ create_pmd_mapping(early_pmd,
+ set_satp_mode_pmd, set_satp_mode_pmd,
+ PMD_SIZE, PAGE_KERNEL_EXEC);
+ create_pmd_mapping(early_pmd,
+ set_satp_mode_pmd + PMD_SIZE,
+ set_satp_mode_pmd + PMD_SIZE,
+ PMD_SIZE, PAGE_KERNEL_EXEC);
+
+ identity_satp = PFN_DOWN((uintptr_t)&early_pg_dir) | satp_mode;
+
+ local_flush_tlb_all();
+ csr_write(CSR_SATP, identity_satp);
+ hw_satp = csr_swap(CSR_SATP, 0ULL);
+ local_flush_tlb_all();
+
+ if (hw_satp != identity_satp)
+ disable_pgtable_l4();
+
+ memset(early_pg_dir, 0, PAGE_SIZE);
+ memset(early_pud, 0, PAGE_SIZE);
+ memset(early_pmd, 0, PAGE_SIZE);
+}
+#endif
+
/*
* setup_vm() is called from head.S with MMU-off.
*
@@ -564,10 +687,15 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa)
uintptr_t pa = dtb_pa & ~(PMD_SIZE - 1);
create_pgd_mapping(early_pg_dir, DTB_EARLY_BASE_VA,
- IS_ENABLED(CONFIG_64BIT) ? (uintptr_t)early_dtb_pmd : pa,
+ IS_ENABLED(CONFIG_64BIT) ? early_dtb_pgd_next : pa,
PGDIR_SIZE,
IS_ENABLED(CONFIG_64BIT) ? PAGE_TABLE : PAGE_KERNEL);
+ if (pgtable_l4_enabled) {
+ create_pud_mapping(early_dtb_pud, DTB_EARLY_BASE_VA,
+ (uintptr_t)early_dtb_pmd, PUD_SIZE, PAGE_TABLE);
+ }
+
if (IS_ENABLED(CONFIG_64BIT)) {
create_pmd_mapping(early_dtb_pmd, DTB_EARLY_BASE_VA,
pa, PMD_SIZE, PAGE_KERNEL);
@@ -589,11 +717,64 @@ static void __init create_fdt_early_page_table(pgd_t *pgdir, uintptr_t dtb_pa)
dtb_early_pa = dtb_pa;
}
+/*
+ * MMU is not enabled, the page tables are allocated directly using
+ * early_pmd/pud/p4d and the address returned is the physical one.
+ */
+void __init pt_ops_set_early(void)
+{
+ pt_ops.alloc_pte = alloc_pte_early;
+ pt_ops.get_pte_virt = get_pte_virt_early;
+#ifndef __PAGETABLE_PMD_FOLDED
+ pt_ops.alloc_pmd = alloc_pmd_early;
+ pt_ops.get_pmd_virt = get_pmd_virt_early;
+ pt_ops.alloc_pud = alloc_pud_early;
+ pt_ops.get_pud_virt = get_pud_virt_early;
+#endif
+}
+
+/*
+ * MMU is enabled but page table setup is not complete yet.
+ * fixmap page table alloc functions must be used as a means to temporarily
+ * map the allocated physical pages since the linear mapping does not exist yet.
+ *
+ * Note that this is called with MMU disabled, hence kernel_mapping_pa_to_va,
+ * but it will be used as described above.
+ */
+void __init pt_ops_set_fixmap(void)
+{
+ pt_ops.alloc_pte = kernel_mapping_pa_to_va((uintptr_t)alloc_pte_fixmap);
+ pt_ops.get_pte_virt = kernel_mapping_pa_to_va((uintptr_t)get_pte_virt_fixmap);
+#ifndef __PAGETABLE_PMD_FOLDED
+ pt_ops.alloc_pmd = kernel_mapping_pa_to_va((uintptr_t)alloc_pmd_fixmap);
+ pt_ops.get_pmd_virt = kernel_mapping_pa_to_va((uintptr_t)get_pmd_virt_fixmap);
+ pt_ops.alloc_pud = kernel_mapping_pa_to_va((uintptr_t)alloc_pud_fixmap);
+ pt_ops.get_pud_virt = kernel_mapping_pa_to_va((uintptr_t)get_pud_virt_fixmap);
+#endif
+}
+
+/*
+ * MMU is enabled and page table setup is complete, so from now, we can use
+ * generic page allocation functions to setup page table.
+ */
+void __init pt_ops_set_late(void)
+{
+ pt_ops.alloc_pte = alloc_pte_late;
+ pt_ops.get_pte_virt = get_pte_virt_late;
+#ifndef __PAGETABLE_PMD_FOLDED
+ pt_ops.alloc_pmd = alloc_pmd_late;
+ pt_ops.get_pmd_virt = get_pmd_virt_late;
+ pt_ops.alloc_pud = alloc_pud_late;
+ pt_ops.get_pud_virt = get_pud_virt_late;
+#endif
+}
+
asmlinkage void __init setup_vm(uintptr_t dtb_pa)
{
pmd_t __maybe_unused fix_bmap_spmd, fix_bmap_epmd;
kernel_map.virt_addr = KERNEL_LINK_ADDR;
+ kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
#ifdef CONFIG_XIP_KERNEL
kernel_map.xiprom = (uintptr_t)CONFIG_XIP_PHYS_ADDR;
@@ -608,11 +789,24 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
kernel_map.phys_addr = (uintptr_t)(&_start);
kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
#endif
+
+#if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
+ set_satp_mode();
+#endif
+
kernel_map.va_pa_offset = PAGE_OFFSET - kernel_map.phys_addr;
kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
riscv_pfn_base = PFN_DOWN(kernel_map.phys_addr);
+ /*
+ * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit
+ * kernel, whereas for 64-bit kernel, the end of the virtual address
+ * space is occupied by the modules/BPF/kernel mappings which reduces
+ * the available size of the linear mapping.
+ */
+ memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0);
+
/* Sanity check alignment and size */
BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
BUG_ON((kernel_map.phys_addr % PMD_SIZE) != 0);
@@ -625,23 +819,25 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
BUG_ON((kernel_map.virt_addr + kernel_map.size) > ADDRESS_SPACE_END - SZ_4K);
#endif
- pt_ops.alloc_pte = alloc_pte_early;
- pt_ops.get_pte_virt = get_pte_virt_early;
-#ifndef __PAGETABLE_PMD_FOLDED
- pt_ops.alloc_pmd = alloc_pmd_early;
- pt_ops.get_pmd_virt = get_pmd_virt_early;
-#endif
+ pt_ops_set_early();
+
/* Setup early PGD for fixmap */
create_pgd_mapping(early_pg_dir, FIXADDR_START,
- (uintptr_t)fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
+ fixmap_pgd_next, PGDIR_SIZE, PAGE_TABLE);
#ifndef __PAGETABLE_PMD_FOLDED
- /* Setup fixmap PMD */
+ /* Setup fixmap PUD and PMD */
+ if (pgtable_l4_enabled)
+ create_pud_mapping(fixmap_pud, FIXADDR_START,
+ (uintptr_t)fixmap_pmd, PUD_SIZE, PAGE_TABLE);
create_pmd_mapping(fixmap_pmd, FIXADDR_START,
(uintptr_t)fixmap_pte, PMD_SIZE, PAGE_TABLE);
/* Setup trampoline PGD and PMD */
create_pgd_mapping(trampoline_pg_dir, kernel_map.virt_addr,
- (uintptr_t)trampoline_pmd, PGDIR_SIZE, PAGE_TABLE);
+ trampoline_pgd_next, PGDIR_SIZE, PAGE_TABLE);
+ if (pgtable_l4_enabled)
+ create_pud_mapping(trampoline_pud, kernel_map.virt_addr,
+ (uintptr_t)trampoline_pmd, PUD_SIZE, PAGE_TABLE);
#ifdef CONFIG_XIP_KERNEL
create_pmd_mapping(trampoline_pmd, kernel_map.virt_addr,
kernel_map.xiprom, PMD_SIZE, PAGE_KERNEL_EXEC);
@@ -669,7 +865,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
* Bootime fixmap only can handle PMD_SIZE mapping. Thus, boot-ioremap
* range can not span multiple pmds.
*/
- BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
+ BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
!= (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
#ifndef __PAGETABLE_PMD_FOLDED
@@ -694,6 +890,8 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
pr_warn("FIX_BTMAP_BEGIN: %d\n", FIX_BTMAP_BEGIN);
}
#endif
+
+ pt_ops_set_fixmap();
}
static void __init setup_vm_final(void)
@@ -702,16 +900,6 @@ static void __init setup_vm_final(void)
phys_addr_t pa, start, end;
u64 i;
- /**
- * MMU is enabled at this point. But page table setup is not complete yet.
- * fixmap page table alloc functions should be used at this point
- */
- pt_ops.alloc_pte = alloc_pte_fixmap;
- pt_ops.get_pte_virt = get_pte_virt_fixmap;
-#ifndef __PAGETABLE_PMD_FOLDED
- pt_ops.alloc_pmd = alloc_pmd_fixmap;
- pt_ops.get_pmd_virt = get_pmd_virt_fixmap;
-#endif
/* Setup swapper PGD for fixmap */
create_pgd_mapping(swapper_pg_dir, FIXADDR_START,
__pa_symbol(fixmap_pgd_next),
@@ -736,26 +924,24 @@ static void __init setup_vm_final(void)
}
}
-#ifdef CONFIG_64BIT
/* Map the kernel */
- create_kernel_page_table(swapper_pg_dir, false);
+ if (IS_ENABLED(CONFIG_64BIT))
+ create_kernel_page_table(swapper_pg_dir, false);
+
+#ifdef CONFIG_KASAN
+ kasan_swapper_init();
#endif
/* Clear fixmap PTE and PMD mappings */
clear_fixmap(FIX_PTE);
clear_fixmap(FIX_PMD);
+ clear_fixmap(FIX_PUD);
/* Move to swapper page table */
- csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | SATP_MODE);
+ csr_write(CSR_SATP, PFN_DOWN(__pa_symbol(swapper_pg_dir)) | satp_mode);
local_flush_tlb_all();
- /* generic page allocation functions must be used to setup page table */
- pt_ops.alloc_pte = alloc_pte_late;
- pt_ops.get_pte_virt = get_pte_virt_late;
-#ifndef __PAGETABLE_PMD_FOLDED
- pt_ops.alloc_pmd = alloc_pmd_late;
- pt_ops.get_pmd_virt = get_pmd_virt_late;
-#endif
+ pt_ops_set_late();
}
#else
asmlinkage void __init setup_vm(uintptr_t dtb_pa)
@@ -791,12 +977,10 @@ static void __init reserve_crashkernel(void)
* since it doesn't make much sense and we have limited memory
* resources.
*/
-#ifdef CONFIG_CRASH_DUMP
if (is_kdump_kernel()) {
pr_info("crashkernel: ignoring reservation request\n");
return;
}
-#endif
ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
&crash_size, &crash_base);
diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c
index 54294f83513d..f61f7ca6fe0f 100644
--- a/arch/riscv/mm/kasan_init.c
+++ b/arch/riscv/mm/kasan_init.c
@@ -11,45 +11,27 @@
#include <asm/fixmap.h>
#include <asm/pgalloc.h>
-extern pgd_t early_pg_dir[PTRS_PER_PGD];
-asmlinkage void __init kasan_early_init(void)
-{
- uintptr_t i;
- pgd_t *pgd = early_pg_dir + pgd_index(KASAN_SHADOW_START);
-
- BUILD_BUG_ON(KASAN_SHADOW_OFFSET !=
- KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT)));
-
- for (i = 0; i < PTRS_PER_PTE; ++i)
- set_pte(kasan_early_shadow_pte + i,
- mk_pte(virt_to_page(kasan_early_shadow_page),
- PAGE_KERNEL));
-
- for (i = 0; i < PTRS_PER_PMD; ++i)
- set_pmd(kasan_early_shadow_pmd + i,
- pfn_pmd(PFN_DOWN
- (__pa((uintptr_t) kasan_early_shadow_pte)),
- __pgprot(_PAGE_TABLE)));
-
- for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END;
- i += PGDIR_SIZE, ++pgd)
- set_pgd(pgd,
- pfn_pgd(PFN_DOWN
- (__pa(((uintptr_t) kasan_early_shadow_pmd))),
- __pgprot(_PAGE_TABLE)));
-
- /* init for swapper_pg_dir */
- pgd = pgd_offset_k(KASAN_SHADOW_START);
-
- for (i = KASAN_SHADOW_START; i < KASAN_SHADOW_END;
- i += PGDIR_SIZE, ++pgd)
- set_pgd(pgd,
- pfn_pgd(PFN_DOWN
- (__pa(((uintptr_t) kasan_early_shadow_pmd))),
- __pgprot(_PAGE_TABLE)));
+/*
+ * Kasan shadow region must lie at a fixed address across sv39, sv48 and sv57
+ * which is right before the kernel.
+ *
+ * For sv39, the region is aligned on PGDIR_SIZE so we only need to populate
+ * the page global directory with kasan_early_shadow_pmd.
+ *
+ * For sv48 and sv57, the region is not aligned on PGDIR_SIZE so the mapping
+ * must be divided as follows:
+ * - the first PGD entry, although incomplete, is populated with
+ * kasan_early_shadow_pud/p4d
+ * - the PGD entries in the middle are populated with kasan_early_shadow_pud/p4d
+ * - the last PGD entry is shared with the kernel mapping so populated at the
+ * lower levels pud/p4d
+ *
+ * In addition, when shallow populating a kasan region (for example vmalloc),
+ * this region may also not be aligned on PGDIR size, so we must go down to the
+ * pud level too.
+ */
- local_flush_tlb_all();
-}
+extern pgd_t early_pg_dir[PTRS_PER_PGD];
static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned long end)
{
@@ -73,15 +55,19 @@ static void __init kasan_populate_pte(pmd_t *pmd, unsigned long vaddr, unsigned
set_pmd(pmd, pfn_pmd(PFN_DOWN(__pa(base_pte)), PAGE_TABLE));
}
-static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned long end)
+static void __init kasan_populate_pmd(pud_t *pud, unsigned long vaddr, unsigned long end)
{
phys_addr_t phys_addr;
pmd_t *pmdp, *base_pmd;
unsigned long next;
- base_pmd = (pmd_t *)pgd_page_vaddr(*pgd);
- if (base_pmd == lm_alias(kasan_early_shadow_pmd))
+ if (pud_none(*pud)) {
base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
+ } else {
+ base_pmd = (pmd_t *)pud_pgtable(*pud);
+ if (base_pmd == lm_alias(kasan_early_shadow_pmd))
+ base_pmd = memblock_alloc(PTRS_PER_PMD * sizeof(pmd_t), PAGE_SIZE);
+ }
pmdp = base_pmd + pmd_index(vaddr);
@@ -105,59 +91,207 @@ static void __init kasan_populate_pmd(pgd_t *pgd, unsigned long vaddr, unsigned
* it entirely, memblock could allocate a page at a physical address
* where KASAN is not populated yet and then we'd get a page fault.
*/
- set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
+ set_pud(pud, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
+}
+
+static void __init kasan_populate_pud(pgd_t *pgd,
+ unsigned long vaddr, unsigned long end,
+ bool early)
+{
+ phys_addr_t phys_addr;
+ pud_t *pudp, *base_pud;
+ unsigned long next;
+
+ if (early) {
+ /*
+ * We can't use pgd_page_vaddr here as it would return a linear
+ * mapping address but it is not mapped yet, but when populating
+ * early_pg_dir, we need the physical address and when populating
+ * swapper_pg_dir, we need the kernel virtual address so use
+ * pt_ops facility.
+ */
+ base_pud = pt_ops.get_pud_virt(pfn_to_phys(_pgd_pfn(*pgd)));
+ } else {
+ base_pud = (pud_t *)pgd_page_vaddr(*pgd);
+ if (base_pud == lm_alias(kasan_early_shadow_pud))
+ base_pud = memblock_alloc(PTRS_PER_PUD * sizeof(pud_t), PAGE_SIZE);
+ }
+
+ pudp = base_pud + pud_index(vaddr);
+
+ do {
+ next = pud_addr_end(vaddr, end);
+
+ if (pud_none(*pudp) && IS_ALIGNED(vaddr, PUD_SIZE) && (next - vaddr) >= PUD_SIZE) {
+ if (early) {
+ phys_addr = __pa(((uintptr_t)kasan_early_shadow_pmd));
+ set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_TABLE));
+ continue;
+ } else {
+ phys_addr = memblock_phys_alloc(PUD_SIZE, PUD_SIZE);
+ if (phys_addr) {
+ set_pud(pudp, pfn_pud(PFN_DOWN(phys_addr), PAGE_KERNEL));
+ continue;
+ }
+ }
+ }
+
+ kasan_populate_pmd(pudp, vaddr, next);
+ } while (pudp++, vaddr = next, vaddr != end);
+
+ /*
+ * Wait for the whole PGD to be populated before setting the PGD in
+ * the page table, otherwise, if we did set the PGD before populating
+ * it entirely, memblock could allocate a page at a physical address
+ * where KASAN is not populated yet and then we'd get a page fault.
+ */
+ if (!early)
+ set_pgd(pgd, pfn_pgd(PFN_DOWN(__pa(base_pud)), PAGE_TABLE));
}
-static void __init kasan_populate_pgd(unsigned long vaddr, unsigned long end)
+#define kasan_early_shadow_pgd_next (pgtable_l4_enabled ? \
+ (uintptr_t)kasan_early_shadow_pud : \
+ (uintptr_t)kasan_early_shadow_pmd)
+#define kasan_populate_pgd_next(pgdp, vaddr, next, early) \
+ (pgtable_l4_enabled ? \
+ kasan_populate_pud(pgdp, vaddr, next, early) : \
+ kasan_populate_pmd((pud_t *)pgdp, vaddr, next))
+
+static void __init kasan_populate_pgd(pgd_t *pgdp,
+ unsigned long vaddr, unsigned long end,
+ bool early)
{
phys_addr_t phys_addr;
- pgd_t *pgdp = pgd_offset_k(vaddr);
unsigned long next;
do {
next = pgd_addr_end(vaddr, end);
- /*
- * pgdp can't be none since kasan_early_init initialized all KASAN
- * shadow region with kasan_early_shadow_pmd: if this is stillthe case,
- * that means we can try to allocate a hugepage as a replacement.
- */
- if (pgd_page_vaddr(*pgdp) == (unsigned long)lm_alias(kasan_early_shadow_pmd) &&
- IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) {
- phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE);
- if (phys_addr) {
- set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL));
+ if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE) {
+ if (early) {
+ phys_addr = __pa((uintptr_t)kasan_early_shadow_pgd_next);
+ set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_TABLE));
continue;
+ } else if (pgd_page_vaddr(*pgdp) ==
+ (unsigned long)lm_alias(kasan_early_shadow_pgd_next)) {
+ /*
+ * pgdp can't be none since kasan_early_init
+ * initialized all KASAN shadow region with
+ * kasan_early_shadow_pud: if this is still the
+ * case, that means we can try to allocate a
+ * hugepage as a replacement.
+ */
+ phys_addr = memblock_phys_alloc(PGDIR_SIZE, PGDIR_SIZE);
+ if (phys_addr) {
+ set_pgd(pgdp, pfn_pgd(PFN_DOWN(phys_addr), PAGE_KERNEL));
+ continue;
+ }
}
}
- kasan_populate_pmd(pgdp, vaddr, next);
+ kasan_populate_pgd_next(pgdp, vaddr, next, early);
} while (pgdp++, vaddr = next, vaddr != end);
}
+asmlinkage void __init kasan_early_init(void)
+{
+ uintptr_t i;
+
+ BUILD_BUG_ON(KASAN_SHADOW_OFFSET !=
+ KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT)));
+
+ for (i = 0; i < PTRS_PER_PTE; ++i)
+ set_pte(kasan_early_shadow_pte + i,
+ mk_pte(virt_to_page(kasan_early_shadow_page),
+ PAGE_KERNEL));
+
+ for (i = 0; i < PTRS_PER_PMD; ++i)
+ set_pmd(kasan_early_shadow_pmd + i,
+ pfn_pmd(PFN_DOWN
+ (__pa((uintptr_t)kasan_early_shadow_pte)),
+ PAGE_TABLE));
+
+ if (pgtable_l4_enabled) {
+ for (i = 0; i < PTRS_PER_PUD; ++i)
+ set_pud(kasan_early_shadow_pud + i,
+ pfn_pud(PFN_DOWN
+ (__pa(((uintptr_t)kasan_early_shadow_pmd))),
+ PAGE_TABLE));
+ }
+
+ kasan_populate_pgd(early_pg_dir + pgd_index(KASAN_SHADOW_START),
+ KASAN_SHADOW_START, KASAN_SHADOW_END, true);
+
+ local_flush_tlb_all();
+}
+
+void __init kasan_swapper_init(void)
+{
+ kasan_populate_pgd(pgd_offset_k(KASAN_SHADOW_START),
+ KASAN_SHADOW_START, KASAN_SHADOW_END, true);
+
+ local_flush_tlb_all();
+}
+
static void __init kasan_populate(void *start, void *end)
{
unsigned long vaddr = (unsigned long)start & PAGE_MASK;
unsigned long vend = PAGE_ALIGN((unsigned long)end);
- kasan_populate_pgd(vaddr, vend);
+ kasan_populate_pgd(pgd_offset_k(vaddr), vaddr, vend, false);
local_flush_tlb_all();
memset(start, KASAN_SHADOW_INIT, end - start);
}
+static void __init kasan_shallow_populate_pud(pgd_t *pgdp,
+ unsigned long vaddr, unsigned long end,
+ bool kasan_populate)
+{
+ unsigned long next;
+ pud_t *pudp, *base_pud;
+ pmd_t *base_pmd;
+ bool is_kasan_pmd;
+
+ base_pud = (pud_t *)pgd_page_vaddr(*pgdp);
+ pudp = base_pud + pud_index(vaddr);
+
+ if (kasan_populate)
+ memcpy(base_pud, (void *)kasan_early_shadow_pgd_next,
+ sizeof(pud_t) * PTRS_PER_PUD);
+
+ do {
+ next = pud_addr_end(vaddr, end);
+ is_kasan_pmd = (pud_pgtable(*pudp) == lm_alias(kasan_early_shadow_pmd));
+
+ if (is_kasan_pmd) {
+ base_pmd = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
+ set_pud(pudp, pfn_pud(PFN_DOWN(__pa(base_pmd)), PAGE_TABLE));
+ }
+ } while (pudp++, vaddr = next, vaddr != end);
+}
+
static void __init kasan_shallow_populate_pgd(unsigned long vaddr, unsigned long end)
{
unsigned long next;
void *p;
pgd_t *pgd_k = pgd_offset_k(vaddr);
+ bool is_kasan_pgd_next;
do {
next = pgd_addr_end(vaddr, end);
- if (pgd_page_vaddr(*pgd_k) == (unsigned long)lm_alias(kasan_early_shadow_pmd)) {
+ is_kasan_pgd_next = (pgd_page_vaddr(*pgd_k) ==
+ (unsigned long)lm_alias(kasan_early_shadow_pgd_next));
+
+ if (is_kasan_pgd_next) {
p = memblock_alloc(PAGE_SIZE, PAGE_SIZE);
set_pgd(pgd_k, pfn_pgd(PFN_DOWN(__pa(p)), PAGE_TABLE));
}
+
+ if (IS_ALIGNED(vaddr, PGDIR_SIZE) && (next - vaddr) >= PGDIR_SIZE)
+ continue;
+
+ kasan_shallow_populate_pud(pgd_k, vaddr, next, is_kasan_pgd_next);
} while (pgd_k++, vaddr = next, vaddr != end);
}
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index 64f8201237c2..37ed760d007c 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -32,7 +32,6 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
unsigned long size, unsigned long stride)
{
struct cpumask *cmask = mm_cpumask(mm);
- struct cpumask hmask;
unsigned int cpuid;
bool broadcast;
@@ -46,9 +45,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
unsigned long asid = atomic_long_read(&mm->context.id);
if (broadcast) {
- riscv_cpuid_to_hartid_mask(cmask, &hmask);
- sbi_remote_sfence_vma_asid(cpumask_bits(&hmask),
- start, size, asid);
+ sbi_remote_sfence_vma_asid(cmask, start, size, asid);
} else if (size <= stride) {
local_flush_tlb_page_asid(start, asid);
} else {
@@ -56,9 +53,7 @@ static void __sbi_tlb_flush_range(struct mm_struct *mm, unsigned long start,
}
} else {
if (broadcast) {
- riscv_cpuid_to_hartid_mask(cmask, &hmask);
- sbi_remote_sfence_vma(cpumask_bits(&hmask),
- start, size);
+ sbi_remote_sfence_vma(cmask, start, size);
} else if (size <= stride) {
local_flush_tlb_page(start);
} else {
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index 293dd6e171ed..0bcda99d1d68 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -497,7 +497,7 @@ static int add_exception_handler(const struct bpf_insn *insn,
offset = pc - (long)&ex->insn;
if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN))
return -ERANGE;
- ex->insn = pc;
+ ex->insn = offset;
/*
* Since the extable follows the program, the fixup offset is always
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f6a9475cbc8c..be9f39fd06df 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -127,7 +127,6 @@ config S390
select GENERIC_CPU_AUTOPROBE
select GENERIC_CPU_VULNERABILITIES
select GENERIC_ENTRY
- select GENERIC_FIND_FIRST_BIT
select GENERIC_GETTIMEOFDAY
select GENERIC_PTDUMP
select GENERIC_SMP_IDLE_THREAD
@@ -946,6 +945,9 @@ config S390_GUEST
endmenu
+config S390_MODULES_SANITY_TEST_HELPERS
+ def_bool n
+
menu "Selftests"
config S390_UNWIND_SELFTEST
@@ -972,4 +974,16 @@ config S390_KPROBES_SANITY_TEST
Say N if you are unsure.
+config S390_MODULES_SANITY_TEST
+ def_tristate n
+ depends on KUNIT
+ default KUNIT_ALL_TESTS
+ prompt "Enable s390 specific modules tests"
+ select S390_MODULES_SANITY_TEST_HELPERS
+ help
+ This option enables an s390 specific modules test. This option is
+ not useful for distributions or general kernels, but only for
+ kernel developers working on architecture code.
+
+ Say N if you are unsure.
endmenu
diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig
index 354e51dcb3e2..498bed9b261b 100644
--- a/arch/s390/configs/debug_defconfig
+++ b/arch/s390/configs/debug_defconfig
@@ -63,6 +63,7 @@ CONFIG_APPLDATA_BASE=y
CONFIG_KVM=m
CONFIG_S390_UNWIND_SELFTEST=m
CONFIG_S390_KPROBES_SANITY_TEST=m
+CONFIG_S390_MODULES_SANITY_TEST=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
CONFIG_STATIC_KEYS_SELFTEST=y
@@ -96,8 +97,6 @@ CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
CONFIG_CMA_DEBUG=y
CONFIG_CMA_DEBUGFS=y
CONFIG_CMA_SYSFS=y
@@ -110,6 +109,7 @@ CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
CONFIG_GUP_TEST=y
+CONFIG_ANON_VMA_NAME=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -117,7 +117,6 @@ CONFIG_UNIX=y
CONFIG_UNIX_DIAG=m
CONFIG_XFRM_USER=m
CONFIG_NET_KEY=m
-CONFIG_NET_SWITCHDEV=y
CONFIG_SMC=m
CONFIG_SMC_DIAG=m
CONFIG_INET=y
@@ -186,7 +185,6 @@ CONFIG_NF_CT_NETLINK_TIMEOUT=m
CONFIG_NF_TABLES=m
CONFIG_NF_TABLES_INET=y
CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
CONFIG_NFT_LOG=m
CONFIG_NFT_LIMIT=m
CONFIG_NFT_NAT=m
@@ -392,6 +390,7 @@ CONFIG_OPENVSWITCH=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
CONFIG_NETLINK_DIAG=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_CGROUP_NET_PRIO=y
CONFIG_NET_PKTGEN=m
CONFIG_PCI=y
@@ -401,6 +400,7 @@ CONFIG_PCI_IOV=y
CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_S390=y
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
CONFIG_CONNECTOR=y
CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
@@ -502,6 +502,7 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_DEC is not set
# CONFIG_NET_VENDOR_DLINK is not set
# CONFIG_NET_VENDOR_EMULEX is not set
+# CONFIG_NET_VENDOR_ENGLEDER is not set
# CONFIG_NET_VENDOR_EZCHIP is not set
# CONFIG_NET_VENDOR_GOOGLE is not set
# CONFIG_NET_VENDOR_HUAWEI is not set
@@ -512,7 +513,6 @@ CONFIG_NLMON=m
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
-CONFIG_MLX5_ESWITCH=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
@@ -543,6 +543,7 @@ CONFIG_MLX5_ESWITCH=y
# CONFIG_NET_VENDOR_SYNOPSYS is not set
# CONFIG_NET_VENDOR_TEHUTI is not set
# CONFIG_NET_VENDOR_TI is not set
+# CONFIG_NET_VENDOR_VERTEXCOM is not set
# CONFIG_NET_VENDOR_VIA is not set
# CONFIG_NET_VENDOR_WIZNET is not set
# CONFIG_NET_VENDOR_XILINX is not set
@@ -593,6 +594,7 @@ CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
+# CONFIG_SURFACE_PLATFORMS is not set
CONFIG_S390_CCW_IOMMU=y
CONFIG_S390_AP_IOMMU=y
CONFIG_EXT4_FS=y
@@ -757,9 +759,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_STATS=y
-CONFIG_CRYPTO_LIB_BLAKE2S=m
-CONFIG_CRYPTO_LIB_CURVE25519=m
-CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_ZCRYPT=m
CONFIG_PKEY=m
CONFIG_CRYPTO_PAES_S390=m
@@ -775,6 +774,8 @@ CONFIG_CRYPTO_GHASH_S390=m
CONFIG_CRYPTO_CRC32_S390=y
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_CORDIC=m
+CONFIG_CRYPTO_LIB_CURVE25519=m
+CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRC32_SELFTEST=y
CONFIG_CRC4=m
CONFIG_CRC7=m
@@ -808,7 +809,6 @@ CONFIG_SLUB_DEBUG_ON=y
CONFIG_SLUB_STATS=y
CONFIG_DEBUG_STACK_USAGE=y
CONFIG_DEBUG_VM=y
-CONFIG_DEBUG_VM_VMACACHE=y
CONFIG_DEBUG_VM_PGFLAGS=y
CONFIG_DEBUG_MEMORY_INIT=y
CONFIG_MEMORY_NOTIFIER_ERROR_INJECT=m
@@ -820,12 +820,11 @@ CONFIG_PANIC_ON_OOPS=y
CONFIG_DETECT_HUNG_TASK=y
CONFIG_WQ_WATCHDOG=y
CONFIG_TEST_LOCKUP=m
-CONFIG_DEBUG_TIMEKEEPING=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCK_STAT=y
-CONFIG_DEBUG_LOCKDEP=y
CONFIG_DEBUG_ATOMIC_SLEEP=y
CONFIG_DEBUG_LOCKING_API_SELFTESTS=y
+CONFIG_DEBUG_IRQFLAGS=y
CONFIG_DEBUG_SG=y
CONFIG_DEBUG_NOTIFIERS=y
CONFIG_BUG_ON_DATA_CORRUPTION=y
diff --git a/arch/s390/configs/defconfig b/arch/s390/configs/defconfig
index 8dee6c3782f3..61e36b999f67 100644
--- a/arch/s390/configs/defconfig
+++ b/arch/s390/configs/defconfig
@@ -61,6 +61,7 @@ CONFIG_APPLDATA_BASE=y
CONFIG_KVM=m
CONFIG_S390_UNWIND_SELFTEST=m
CONFIG_S390_KPROBES_SANITY_TEST=m
+CONFIG_S390_MODULES_SANITY_TEST=m
CONFIG_KPROBES=y
CONFIG_JUMP_LABEL=y
# CONFIG_GCC_PLUGINS is not set
@@ -91,8 +92,6 @@ CONFIG_MEMORY_HOTPLUG=y
CONFIG_MEMORY_HOTREMOVE=y
CONFIG_KSM=y
CONFIG_TRANSPARENT_HUGEPAGE=y
-CONFIG_CLEANCACHE=y
-CONFIG_FRONTSWAP=y
CONFIG_CMA_SYSFS=y
CONFIG_CMA_AREAS=7
CONFIG_MEM_SOFT_DIRTY=y
@@ -102,6 +101,7 @@ CONFIG_ZSMALLOC_STAT=y
CONFIG_DEFERRED_STRUCT_PAGE_INIT=y
CONFIG_IDLE_PAGE_TRACKING=y
CONFIG_PERCPU_STATS=y
+CONFIG_ANON_VMA_NAME=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m
@@ -109,7 +109,6 @@ CONFIG_UNIX=y
CONFIG_UNIX_DIAG=m
CONFIG_XFRM_USER=m
CONFIG_NET_KEY=m
-CONFIG_NET_SWITCHDEV=y
CONFIG_SMC=m
CONFIG_SMC_DIAG=m
CONFIG_INET=y
@@ -178,7 +177,6 @@ CONFIG_NF_CT_NETLINK_TIMEOUT=m
CONFIG_NF_TABLES=m
CONFIG_NF_TABLES_INET=y
CONFIG_NFT_CT=m
-CONFIG_NFT_COUNTER=m
CONFIG_NFT_LOG=m
CONFIG_NFT_LIMIT=m
CONFIG_NFT_NAT=m
@@ -383,6 +381,7 @@ CONFIG_OPENVSWITCH=m
CONFIG_VSOCKETS=m
CONFIG_VIRTIO_VSOCKETS=m
CONFIG_NETLINK_DIAG=m
+CONFIG_NET_SWITCHDEV=y
CONFIG_CGROUP_NET_PRIO=y
CONFIG_NET_PKTGEN=m
CONFIG_PCI=y
@@ -392,6 +391,7 @@ CONFIG_HOTPLUG_PCI=y
CONFIG_HOTPLUG_PCI_S390=y
CONFIG_UEVENT_HELPER=y
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
CONFIG_CONNECTOR=y
CONFIG_ZRAM=y
CONFIG_BLK_DEV_LOOP=m
@@ -493,6 +493,7 @@ CONFIG_NLMON=m
# CONFIG_NET_VENDOR_DEC is not set
# CONFIG_NET_VENDOR_DLINK is not set
# CONFIG_NET_VENDOR_EMULEX is not set
+# CONFIG_NET_VENDOR_ENGLEDER is not set
# CONFIG_NET_VENDOR_EZCHIP is not set
# CONFIG_NET_VENDOR_GOOGLE is not set
# CONFIG_NET_VENDOR_HUAWEI is not set
@@ -503,7 +504,6 @@ CONFIG_NLMON=m
CONFIG_MLX4_EN=m
CONFIG_MLX5_CORE=m
CONFIG_MLX5_CORE_EN=y
-CONFIG_MLX5_ESWITCH=y
# CONFIG_NET_VENDOR_MICREL is not set
# CONFIG_NET_VENDOR_MICROCHIP is not set
# CONFIG_NET_VENDOR_MICROSEMI is not set
@@ -534,6 +534,7 @@ CONFIG_MLX5_ESWITCH=y
# CONFIG_NET_VENDOR_SYNOPSYS is not set
# CONFIG_NET_VENDOR_TEHUTI is not set
# CONFIG_NET_VENDOR_TI is not set
+# CONFIG_NET_VENDOR_VERTEXCOM is not set
# CONFIG_NET_VENDOR_VIA is not set
# CONFIG_NET_VENDOR_WIZNET is not set
# CONFIG_NET_VENDOR_XILINX is not set
@@ -583,6 +584,7 @@ CONFIG_VIRTIO_BALLOON=m
CONFIG_VIRTIO_INPUT=y
CONFIG_VHOST_NET=m
CONFIG_VHOST_VSOCK=m
+# CONFIG_SURFACE_PLATFORMS is not set
CONFIG_S390_CCW_IOMMU=y
CONFIG_S390_AP_IOMMU=y
CONFIG_EXT4_FS=y
@@ -744,9 +746,6 @@ CONFIG_CRYPTO_USER_API_SKCIPHER=m
CONFIG_CRYPTO_USER_API_RNG=m
CONFIG_CRYPTO_USER_API_AEAD=m
CONFIG_CRYPTO_STATS=y
-CONFIG_CRYPTO_LIB_BLAKE2S=m
-CONFIG_CRYPTO_LIB_CURVE25519=m
-CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_ZCRYPT=m
CONFIG_PKEY=m
CONFIG_CRYPTO_PAES_S390=m
@@ -763,6 +762,8 @@ CONFIG_CRYPTO_CRC32_S390=y
CONFIG_CRYPTO_DEV_VIRTIO=m
CONFIG_CORDIC=m
CONFIG_PRIME_NUMBERS=m
+CONFIG_CRYPTO_LIB_CURVE25519=m
+CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m
CONFIG_CRC4=m
CONFIG_CRC7=m
CONFIG_CRC8=m
diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig
index eed3b9acfa71..c55c668dc3c7 100644
--- a/arch/s390/configs/zfcpdump_defconfig
+++ b/arch/s390/configs/zfcpdump_defconfig
@@ -1,6 +1,7 @@
# CONFIG_SWAP is not set
CONFIG_NO_HZ_IDLE=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BPF_SYSCALL=y
# CONFIG_CPU_ISOLATION is not set
# CONFIG_UTS_NS is not set
# CONFIG_TIME_NS is not set
@@ -34,6 +35,7 @@ CONFIG_NET=y
# CONFIG_PCPU_DEV_REFCNT is not set
# CONFIG_ETHTOOL_NETLINK is not set
CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_SAFE=y
CONFIG_BLK_DEV_RAM=y
# CONFIG_DCSSBLK is not set
# CONFIG_DASD is not set
@@ -58,6 +60,7 @@ CONFIG_ZFCP=y
# CONFIG_HID is not set
# CONFIG_VIRTIO_MENU is not set
# CONFIG_VHOST_MENU is not set
+# CONFIG_SURFACE_PLATFORMS is not set
# CONFIG_IOMMU_SUPPORT is not set
# CONFIG_DNOTIFY is not set
# CONFIG_INOTIFY_USER is not set
diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c
index 33f973ff9744..e8f15dbb89d0 100644
--- a/arch/s390/hypfs/hypfs_vm.c
+++ b/arch/s390/hypfs/hypfs_vm.c
@@ -20,6 +20,7 @@
static char local_guest[] = " ";
static char all_guests[] = "* ";
+static char *all_groups = all_guests;
static char *guest_query;
struct diag2fc_data {
@@ -62,10 +63,11 @@ static int diag2fc(int size, char* query, void *addr)
memcpy(parm_list.userid, query, NAME_LEN);
ASCEBC(parm_list.userid, NAME_LEN);
- parm_list.addr = (unsigned long) addr ;
+ memcpy(parm_list.aci_grp, all_groups, NAME_LEN);
+ ASCEBC(parm_list.aci_grp, NAME_LEN);
+ parm_list.addr = (unsigned long)addr;
parm_list.size = size;
parm_list.fmt = 0x02;
- memset(parm_list.aci_grp, 0x40, NAME_LEN);
rc = -1;
diag_stat_inc(DIAG_STAT_X2FC);
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 5a530c552c23..1d40630128a5 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -387,7 +387,6 @@ static inline int fls(unsigned int word)
#endif /* CONFIG_HAVE_MARCH_Z9_109_FEATURES */
#include <asm-generic/bitops/ffz.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/le.h>
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 0d90cbeb89b4..e3f12db46cfc 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -109,7 +109,9 @@ struct hws_basic_entry {
unsigned int AS:2; /* 29-30 PSW address-space control */
unsigned int I:1; /* 31 entry valid or invalid */
unsigned int CL:2; /* 32-33 Configuration Level */
- unsigned int:14;
+ unsigned int H:1; /* 34 Host Indicator */
+ unsigned int LS:1; /* 35 Limited Sampling */
+ unsigned int:12;
unsigned int prim_asn:16; /* primary ASN */
unsigned long long ia; /* Instruction Address */
unsigned long long gpp; /* Guest Program Parameter */
diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h
index ce550d06abc3..d74e26b48604 100644
--- a/arch/s390/include/asm/uaccess.h
+++ b/arch/s390/include/asm/uaccess.h
@@ -47,53 +47,87 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n);
int __put_user_bad(void) __attribute__((noreturn));
int __get_user_bad(void) __attribute__((noreturn));
+union oac {
+ unsigned int val;
+ struct {
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac1;
+ struct {
+ unsigned short key : 4;
+ unsigned short : 4;
+ unsigned short as : 2;
+ unsigned short : 4;
+ unsigned short k : 1;
+ unsigned short a : 1;
+ } oac2;
+ };
+};
+
#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES
-#define __put_get_user_asm(to, from, size, insn) \
-({ \
- int __rc; \
- \
- asm volatile( \
- insn " 0,%[spec]\n" \
- "0: mvcos %[_to],%[_from],%[_size]\n" \
- "1: xr %[rc],%[rc]\n" \
- "2:\n" \
- ".pushsection .fixup, \"ax\"\n" \
- "3: lhi %[rc],%[retval]\n" \
- " jg 2b\n" \
- ".popsection\n" \
- EX_TABLE(0b,3b) EX_TABLE(1b,3b) \
- : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \
- : [_size] "d" (size), [_from] "Q" (*(from)), \
- [retval] "K" (-EFAULT), [spec] "K" (0x81UL) \
- : "cc", "0"); \
- __rc; \
+#define __put_get_user_asm(to, from, size, oac_spec) \
+({ \
+ int __rc; \
+ \
+ asm volatile( \
+ " lr 0,%[spec]\n" \
+ "0: mvcos %[_to],%[_from],%[_size]\n" \
+ "1: xr %[rc],%[rc]\n" \
+ "2:\n" \
+ ".pushsection .fixup, \"ax\"\n" \
+ "3: lhi %[rc],%[retval]\n" \
+ " jg 2b\n" \
+ ".popsection\n" \
+ EX_TABLE(0b,3b) EX_TABLE(1b,3b) \
+ : [rc] "=&d" (__rc), [_to] "+Q" (*(to)) \
+ : [_size] "d" (size), [_from] "Q" (*(from)), \
+ [retval] "K" (-EFAULT), [spec] "d" (oac_spec.val) \
+ : "cc", "0"); \
+ __rc; \
})
+#define __put_user_asm(to, from, size) \
+ __put_get_user_asm(to, from, size, ((union oac) { \
+ .oac1.as = PSW_BITS_AS_SECONDARY, \
+ .oac1.a = 1 \
+ }))
+
+#define __get_user_asm(to, from, size) \
+ __put_get_user_asm(to, from, size, ((union oac) { \
+ .oac2.as = PSW_BITS_AS_SECONDARY, \
+ .oac2.a = 1 \
+ })) \
+
static __always_inline int __put_user_fn(void *x, void __user *ptr, unsigned long size)
{
int rc;
switch (size) {
case 1:
- rc = __put_get_user_asm((unsigned char __user *)ptr,
- (unsigned char *)x,
- size, "llilh");
+ rc = __put_user_asm((unsigned char __user *)ptr,
+ (unsigned char *)x,
+ size);
break;
case 2:
- rc = __put_get_user_asm((unsigned short __user *)ptr,
- (unsigned short *)x,
- size, "llilh");
+ rc = __put_user_asm((unsigned short __user *)ptr,
+ (unsigned short *)x,
+ size);
break;
case 4:
- rc = __put_get_user_asm((unsigned int __user *)ptr,
- (unsigned int *)x,
- size, "llilh");
+ rc = __put_user_asm((unsigned int __user *)ptr,
+ (unsigned int *)x,
+ size);
break;
case 8:
- rc = __put_get_user_asm((unsigned long __user *)ptr,
- (unsigned long *)x,
- size, "llilh");
+ rc = __put_user_asm((unsigned long __user *)ptr,
+ (unsigned long *)x,
+ size);
break;
default:
__put_user_bad();
@@ -108,24 +142,24 @@ static __always_inline int __get_user_fn(void *x, const void __user *ptr, unsign
switch (size) {
case 1:
- rc = __put_get_user_asm((unsigned char *)x,
- (unsigned char __user *)ptr,
- size, "lghi");
+ rc = __get_user_asm((unsigned char *)x,
+ (unsigned char __user *)ptr,
+ size);
break;
case 2:
- rc = __put_get_user_asm((unsigned short *)x,
- (unsigned short __user *)ptr,
- size, "lghi");
+ rc = __get_user_asm((unsigned short *)x,
+ (unsigned short __user *)ptr,
+ size);
break;
case 4:
- rc = __put_get_user_asm((unsigned int *)x,
- (unsigned int __user *)ptr,
- size, "lghi");
+ rc = __get_user_asm((unsigned int *)x,
+ (unsigned int __user *)ptr,
+ size);
break;
case 8:
- rc = __put_get_user_asm((unsigned long *)x,
- (unsigned long __user *)ptr,
- size, "lghi");
+ rc = __get_user_asm((unsigned long *)x,
+ (unsigned long __user *)ptr,
+ size);
break;
default:
__get_user_bad();
diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c
index d52d85367bf7..b032e556eeb7 100644
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -33,7 +33,7 @@
#define DEBUGP(fmt , ...)
#endif
-#define PLT_ENTRY_SIZE 20
+#define PLT_ENTRY_SIZE 22
void *module_alloc(unsigned long size)
{
@@ -341,27 +341,26 @@ static int apply_rela(Elf_Rela *rela, Elf_Addr base, Elf_Sym *symtab,
case R_390_PLTOFF32: /* 32 bit offset from GOT to PLT. */
case R_390_PLTOFF64: /* 16 bit offset from GOT to PLT. */
if (info->plt_initialized == 0) {
- unsigned int insn[5];
- unsigned int *ip = me->core_layout.base +
- me->arch.plt_offset +
- info->plt_offset;
-
- insn[0] = 0x0d10e310; /* basr 1,0 */
- insn[1] = 0x100a0004; /* lg 1,10(1) */
+ unsigned char insn[PLT_ENTRY_SIZE];
+ char *plt_base;
+ char *ip;
+
+ plt_base = me->core_layout.base + me->arch.plt_offset;
+ ip = plt_base + info->plt_offset;
+ *(int *)insn = 0x0d10e310; /* basr 1,0 */
+ *(int *)&insn[4] = 0x100c0004; /* lg 1,12(1) */
if (IS_ENABLED(CONFIG_EXPOLINE) && !nospec_disable) {
- unsigned int *ij;
- ij = me->core_layout.base +
- me->arch.plt_offset +
- me->arch.plt_size - PLT_ENTRY_SIZE;
- insn[2] = 0xa7f40000 + /* j __jump_r1 */
- (unsigned int)(u16)
- (((unsigned long) ij - 8 -
- (unsigned long) ip) / 2);
+ char *jump_r1;
+
+ jump_r1 = plt_base + me->arch.plt_size -
+ PLT_ENTRY_SIZE;
+ /* brcl 0xf,__jump_r1 */
+ *(short *)&insn[8] = 0xc0f4;
+ *(int *)&insn[10] = (jump_r1 - (ip + 8)) / 2;
} else {
- insn[2] = 0x07f10000; /* br %r1 */
+ *(int *)&insn[8] = 0x07f10000; /* br %r1 */
}
- insn[3] = (unsigned int) (val >> 32);
- insn[4] = (unsigned int) val;
+ *(long *)&insn[14] = val;
write(ip, insn, sizeof(insn));
info->plt_initialized = 1;
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 0c9e894913dc..651a51914e34 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -264,7 +264,14 @@ static int notrace s390_validate_registers(union mci mci, int umode)
/* Validate vector registers */
union ctlreg0 cr0;
- if (!mci.vr) {
+ /*
+ * The vector validity must only be checked if not running a
+ * KVM guest. For KVM guests the machine check is forwarded by
+ * KVM and it is the responsibility of the guest to take
+ * appropriate actions. The host vector or FPU values have been
+ * saved by KVM and will be restored by KVM.
+ */
+ if (!mci.vr && !test_cpu_flag(CIF_MCCK_GUEST)) {
/*
* Vector registers can't be restored. If the kernel
* currently uses vector registers the system is
@@ -307,11 +314,21 @@ static int notrace s390_validate_registers(union mci mci, int umode)
if (cr2.gse) {
if (!mci.gs) {
/*
- * Guarded storage register can't be restored and
- * the current processes uses guarded storage.
- * It has to be terminated.
+ * 2 cases:
+ * - machine check in kernel or userspace
+ * - machine check while running SIE (KVM guest)
+ * For kernel or userspace the userspace values of
+ * guarded storage control can not be recreated, the
+ * process must be terminated.
+ * For SIE the guest values of guarded storage can not
+ * be recreated. This is either due to a bug or due to
+ * GS being disabled in the guest. The guest will be
+ * notified by KVM code and the guests machine check
+ * handling must take care of this. The host values
+ * are saved by KVM and are not affected.
*/
- kill_task = 1;
+ if (!test_cpu_flag(CIF_MCCK_GUEST))
+ kill_task = 1;
} else {
load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area);
}
diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c
index 30f0242de4a5..8ee48672233f 100644
--- a/arch/s390/kernel/perf_cpum_cf_common.c
+++ b/arch/s390/kernel/perf_cpum_cf_common.c
@@ -178,7 +178,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
case CPUMF_CTR_SET_CRYPTO:
if (info->csvn >= 1 && info->csvn <= 5)
ctrset_size = 16;
- else if (info->csvn == 6)
+ else if (info->csvn == 6 || info->csvn == 7)
ctrset_size = 20;
break;
case CPUMF_CTR_SET_EXT:
@@ -188,7 +188,7 @@ size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset,
ctrset_size = 48;
else if (info->csvn >= 3 && info->csvn <= 5)
ctrset_size = 128;
- else if (info->csvn == 6)
+ else if (info->csvn == 6 || info->csvn == 7)
ctrset_size = 160;
break;
case CPUMF_CTR_SET_MT_DIAG:
diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c
index 37265f551a11..52c1fe23b823 100644
--- a/arch/s390/kernel/perf_cpum_cf_events.c
+++ b/arch/s390/kernel/perf_cpum_cf_events.c
@@ -344,7 +344,7 @@ static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = {
NULL,
};
-static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = {
+static struct attribute *cpumcf_svn_67_pmu_event_attr[] __initdata = {
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES),
CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS),
@@ -715,8 +715,8 @@ __init const struct attribute_group **cpumf_cf_event_group(void)
case 1 ... 5:
csvn = cpumcf_svn_12345_pmu_event_attr;
break;
- case 6:
- csvn = cpumcf_svn_6_pmu_event_attr;
+ case 6 ... 7:
+ csvn = cpumcf_svn_67_pmu_event_attr;
break;
default:
csvn = none;
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index db62def4ef28..332a49965130 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1179,7 +1179,7 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt,
sample = (struct hws_basic_entry *) *sdbt;
while ((unsigned long *) sample < (unsigned long *) te) {
/* Check for an empty sample */
- if (!sample->def)
+ if (!sample->def || sample->LS)
break;
/* Update perf event period */
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 9c6d45d0d345..577f1ead6a51 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1990,7 +1990,7 @@ static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
while (ofs >= ms->npages && (mnode = rb_next(mnode))) {
ms = container_of(mnode, struct kvm_memory_slot, gfn_node[slots->node_idx]);
- ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0);
+ ofs = find_first_bit(kvm_second_dirty_bitmap(ms), ms->npages);
}
return ms->base_gfn + ofs;
}
diff --git a/arch/s390/lib/Makefile b/arch/s390/lib/Makefile
index 707cd4622c13..69feb8ed3312 100644
--- a/arch/s390/lib/Makefile
+++ b/arch/s390/lib/Makefile
@@ -17,4 +17,7 @@ KASAN_SANITIZE_uaccess.o := n
obj-$(CONFIG_S390_UNWIND_SELFTEST) += test_unwind.o
CFLAGS_test_unwind.o += -fno-optimize-sibling-calls
+obj-$(CONFIG_S390_MODULES_SANITY_TEST) += test_modules.o
+obj-$(CONFIG_S390_MODULES_SANITY_TEST_HELPERS) += test_modules_helpers.o
+
lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
diff --git a/arch/s390/lib/test_modules.c b/arch/s390/lib/test_modules.c
new file mode 100644
index 000000000000..d056baa8fbb0
--- /dev/null
+++ b/arch/s390/lib/test_modules.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <kunit/test.h>
+#include <linux/module.h>
+
+#include "test_modules.h"
+
+#define DECLARE_RETURN(i) int test_modules_return_ ## i(void)
+REPEAT_10000(DECLARE_RETURN);
+
+/*
+ * Test that modules with many relocations are loaded properly.
+ */
+static void test_modules_many_vmlinux_relocs(struct kunit *test)
+{
+ int result = 0;
+
+#define CALL_RETURN(i) result += test_modules_return_ ## i()
+ REPEAT_10000(CALL_RETURN);
+ KUNIT_ASSERT_EQ(test, result, 49995000);
+}
+
+static struct kunit_case modules_testcases[] = {
+ KUNIT_CASE(test_modules_many_vmlinux_relocs),
+ {}
+};
+
+static struct kunit_suite modules_test_suite = {
+ .name = "modules_test_s390",
+ .test_cases = modules_testcases,
+};
+
+kunit_test_suites(&modules_test_suite);
+
+MODULE_LICENSE("GPL");
diff --git a/arch/s390/lib/test_modules.h b/arch/s390/lib/test_modules.h
new file mode 100644
index 000000000000..43b5e4b4af3e
--- /dev/null
+++ b/arch/s390/lib/test_modules.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+#ifndef TEST_MODULES_H
+#define TEST_MODULES_H
+
+#define __REPEAT_10000_3(f, x) \
+ f(x ## 0); \
+ f(x ## 1); \
+ f(x ## 2); \
+ f(x ## 3); \
+ f(x ## 4); \
+ f(x ## 5); \
+ f(x ## 6); \
+ f(x ## 7); \
+ f(x ## 8); \
+ f(x ## 9)
+#define __REPEAT_10000_2(f, x) \
+ __REPEAT_10000_3(f, x ## 0); \
+ __REPEAT_10000_3(f, x ## 1); \
+ __REPEAT_10000_3(f, x ## 2); \
+ __REPEAT_10000_3(f, x ## 3); \
+ __REPEAT_10000_3(f, x ## 4); \
+ __REPEAT_10000_3(f, x ## 5); \
+ __REPEAT_10000_3(f, x ## 6); \
+ __REPEAT_10000_3(f, x ## 7); \
+ __REPEAT_10000_3(f, x ## 8); \
+ __REPEAT_10000_3(f, x ## 9)
+#define __REPEAT_10000_1(f, x) \
+ __REPEAT_10000_2(f, x ## 0); \
+ __REPEAT_10000_2(f, x ## 1); \
+ __REPEAT_10000_2(f, x ## 2); \
+ __REPEAT_10000_2(f, x ## 3); \
+ __REPEAT_10000_2(f, x ## 4); \
+ __REPEAT_10000_2(f, x ## 5); \
+ __REPEAT_10000_2(f, x ## 6); \
+ __REPEAT_10000_2(f, x ## 7); \
+ __REPEAT_10000_2(f, x ## 8); \
+ __REPEAT_10000_2(f, x ## 9)
+#define REPEAT_10000(f) \
+ __REPEAT_10000_1(f, 0); \
+ __REPEAT_10000_1(f, 1); \
+ __REPEAT_10000_1(f, 2); \
+ __REPEAT_10000_1(f, 3); \
+ __REPEAT_10000_1(f, 4); \
+ __REPEAT_10000_1(f, 5); \
+ __REPEAT_10000_1(f, 6); \
+ __REPEAT_10000_1(f, 7); \
+ __REPEAT_10000_1(f, 8); \
+ __REPEAT_10000_1(f, 9)
+
+#endif
diff --git a/arch/s390/lib/test_modules_helpers.c b/arch/s390/lib/test_modules_helpers.c
new file mode 100644
index 000000000000..1670349a03eb
--- /dev/null
+++ b/arch/s390/lib/test_modules_helpers.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include <linux/export.h>
+
+#include "test_modules.h"
+
+#define DEFINE_RETURN(i) \
+ int test_modules_return_ ## i(void) \
+ { \
+ return 1 ## i - 10000; \
+ } \
+ EXPORT_SYMBOL_GPL(test_modules_return_ ## i)
+REPEAT_10000(DEFINE_RETURN);
diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c
index a596e69d3c47..8a5d21461889 100644
--- a/arch/s390/lib/uaccess.c
+++ b/arch/s390/lib/uaccess.c
@@ -62,10 +62,14 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr
unsigned long size)
{
unsigned long tmp1, tmp2;
+ union oac spec = {
+ .oac2.as = PSW_BITS_AS_SECONDARY,
+ .oac2.a = 1,
+ };
tmp1 = -4096UL;
asm volatile(
- " lghi 0,%[spec]\n"
+ " lr 0,%[spec]\n"
"0: .insn ss,0xc80000000000,0(%0,%2),0(%1),0\n"
"6: jz 4f\n"
"1: algr %0,%3\n"
@@ -84,7 +88,7 @@ static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr
"5:\n"
EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b)
: "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : [spec] "K" (0x81UL)
+ : [spec] "d" (spec.val)
: "cc", "memory", "0");
return size;
}
@@ -135,10 +139,14 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x,
unsigned long size)
{
unsigned long tmp1, tmp2;
+ union oac spec = {
+ .oac1.as = PSW_BITS_AS_SECONDARY,
+ .oac1.a = 1,
+ };
tmp1 = -4096UL;
asm volatile(
- " llilh 0,%[spec]\n"
+ " lr 0,%[spec]\n"
"0: .insn ss,0xc80000000000,0(%0,%1),0(%2),0\n"
"6: jz 4f\n"
"1: algr %0,%3\n"
@@ -157,7 +165,7 @@ static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x,
"5:\n"
EX_TABLE(0b,2b) EX_TABLE(3b,5b) EX_TABLE(6b,2b) EX_TABLE(7b,5b)
: "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2)
- : [spec] "K" (0x81UL)
+ : [spec] "d" (spec.val)
: "cc", "memory", "0");
return size;
}
@@ -207,10 +215,14 @@ EXPORT_SYMBOL(raw_copy_to_user);
static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size)
{
unsigned long tmp1, tmp2;
+ union oac spec = {
+ .oac1.as = PSW_BITS_AS_SECONDARY,
+ .oac1.a = 1,
+ };
tmp1 = -4096UL;
asm volatile(
- " llilh 0,%[spec]\n"
+ " lr 0,%[spec]\n"
"0: .insn ss,0xc80000000000,0(%0,%1),0(%4),0\n"
" jz 4f\n"
"1: algr %0,%2\n"
@@ -228,7 +240,7 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size
"5:\n"
EX_TABLE(0b,2b) EX_TABLE(3b,5b)
: "+a" (size), "+a" (to), "+a" (tmp1), "=a" (tmp2)
- : "a" (empty_zero_page), [spec] "K" (0x81UL)
+ : "a" (empty_zero_page), [spec] "d" (spec.val)
: "cc", "memory", "0");
return size;
}
diff --git a/arch/sh/include/asm/bitops.h b/arch/sh/include/asm/bitops.h
index 3b6c7b5b7ec9..10ceb0d6b5a9 100644
--- a/arch/sh/include/asm/bitops.h
+++ b/arch/sh/include/asm/bitops.h
@@ -68,6 +68,5 @@ static inline unsigned long __ffs(unsigned long word)
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/le.h>
-#include <asm-generic/bitops/find.h>
#endif /* __ASM_SH_BITOPS_H */
diff --git a/arch/sh/mm/alignment.c b/arch/sh/mm/alignment.c
index fb517b82a87b..3a76a766f423 100644
--- a/arch/sh/mm/alignment.c
+++ b/arch/sh/mm/alignment.c
@@ -140,7 +140,7 @@ static int alignment_proc_open(struct inode *inode, struct file *file)
static ssize_t alignment_proc_write(struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
- int *data = PDE_DATA(file_inode(file));
+ int *data = pde_data(file_inode(file));
char mode;
if (count > 0) {
@@ -161,7 +161,7 @@ static const struct proc_ops alignment_proc_ops = {
};
/*
- * This needs to be done after sysctl_init, otherwise sys/ will be
+ * This needs to be done after sysctl_init_bases(), otherwise sys/ will be
* overwritten. Actually, this shouldn't be in sys/ at all since
* it isn't a sysctl, and it doesn't contain sysctl information.
* We now locate it in /proc/cpu/alignment instead.
diff --git a/arch/sparc/include/asm/bitops_32.h b/arch/sparc/include/asm/bitops_32.h
index 0ceff3b915a8..889afa9f990f 100644
--- a/arch/sparc/include/asm/bitops_32.h
+++ b/arch/sparc/include/asm/bitops_32.h
@@ -100,7 +100,6 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/lock.h>
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic.h>
diff --git a/arch/sparc/include/asm/bitops_64.h b/arch/sparc/include/asm/bitops_64.h
index ca7ea5913494..005a8ae858f1 100644
--- a/arch/sparc/include/asm/bitops_64.h
+++ b/arch/sparc/include/asm/bitops_64.h
@@ -52,8 +52,6 @@ unsigned int __arch_hweight8(unsigned int w);
#include <asm-generic/bitops/lock.h>
#endif /* __KERNEL__ */
-#include <asm-generic/bitops/find.h>
-
#ifdef __KERNEL__
#include <asm-generic/bitops/le.h>
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 2672dd03faf3..666f81e617ea 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -126,6 +126,8 @@
#define SO_RESERVE_MEM 0x0052
+#define SO_TXREHASH 0x0053
+
#if !defined(__KERNEL__)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e0e0d00cf103..995f2dc28631 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -137,7 +137,6 @@ config X86
select GENERIC_CPU_VULNERABILITIES
select GENERIC_EARLY_IOREMAP
select GENERIC_ENTRY
- select GENERIC_FIND_FIRST_BIT
select GENERIC_IOMAP
select GENERIC_IRQ_EFFECTIVE_AFF_MASK if SMP
select GENERIC_IRQ_MATRIX_ALLOCATOR if X86_LOCAL_APIC
@@ -188,6 +187,7 @@ config X86
select HAVE_CONTEXT_TRACKING_OFFSTACK if HAVE_CONTEXT_TRACKING
select HAVE_C_RECORDMCOUNT
select HAVE_OBJTOOL_MCOUNT if STACK_VALIDATION
+ select HAVE_BUILDTIME_MCOUNT_SORT
select HAVE_DEBUG_KMEMLEAK
select HAVE_DMA_CONTIGUOUS
select HAVE_DYNAMIC_FTRACE
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index fd9f908debe5..c91434056c29 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -6236,6 +6236,19 @@ __init int intel_pmu_init(void)
pmu->num_counters = x86_pmu.num_counters;
pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
}
+
+ /*
+ * Quirk: For some Alder Lake machine, when all E-cores are disabled in
+ * a BIOS, the leaf 0xA will enumerate all counters of P-cores. However,
+ * the X86_FEATURE_HYBRID_CPU is still set. The above codes will
+ * mistakenly add extra counters for P-cores. Correct the number of
+ * counters here.
+ */
+ if ((pmu->num_counters > 8) || (pmu->num_counters_fixed > 4)) {
+ pmu->num_counters = x86_pmu.num_counters;
+ pmu->num_counters_fixed = x86_pmu.num_counters_fixed;
+ }
+
pmu->max_pebs_events = min_t(unsigned, MAX_PEBS_EVENTS, pmu->num_counters);
pmu->unconstrained = (struct event_constraint)
__EVENT_CONSTRAINT(0, (1ULL << pmu->num_counters) - 1,
@@ -6340,6 +6353,8 @@ __init int intel_pmu_init(void)
}
if (x86_pmu.lbr_nr) {
+ intel_pmu_lbr_init();
+
pr_cont("%d-deep LBR, ", x86_pmu.lbr_nr);
/* only support branch_stack snapshot for perfmon >= v2 */
diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c
index 8043213b75a5..669c2be14784 100644
--- a/arch/x86/events/intel/lbr.c
+++ b/arch/x86/events/intel/lbr.c
@@ -8,14 +8,6 @@
#include "../perf_event.h"
-static const enum {
- LBR_EIP_FLAGS = 1,
- LBR_TSX = 2,
-} lbr_desc[LBR_FORMAT_MAX_KNOWN + 1] = {
- [LBR_FORMAT_EIP_FLAGS] = LBR_EIP_FLAGS,
- [LBR_FORMAT_EIP_FLAGS2] = LBR_EIP_FLAGS | LBR_TSX,
-};
-
/*
* Intel LBR_SELECT bits
* Intel Vol3a, April 2011, Section 16.7 Table 16-10
@@ -243,7 +235,7 @@ void intel_pmu_lbr_reset_64(void)
for (i = 0; i < x86_pmu.lbr_nr; i++) {
wrmsrl(x86_pmu.lbr_from + i, 0);
wrmsrl(x86_pmu.lbr_to + i, 0);
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ if (x86_pmu.lbr_has_info)
wrmsrl(x86_pmu.lbr_info + i, 0);
}
}
@@ -305,11 +297,10 @@ enum {
*/
static inline bool lbr_from_signext_quirk_needed(void)
{
- int lbr_format = x86_pmu.intel_cap.lbr_format;
bool tsx_support = boot_cpu_has(X86_FEATURE_HLE) ||
boot_cpu_has(X86_FEATURE_RTM);
- return !tsx_support && (lbr_desc[lbr_format] & LBR_TSX);
+ return !tsx_support && x86_pmu.lbr_has_tsx;
}
static DEFINE_STATIC_KEY_FALSE(lbr_from_quirk_key);
@@ -427,12 +418,12 @@ rdlbr_all(struct lbr_entry *lbr, unsigned int idx, bool need_info)
void intel_pmu_lbr_restore(void *ctx)
{
- bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
- int i;
- unsigned lbr_idx, mask;
+ bool need_info = x86_pmu.lbr_has_info;
u64 tos = task_ctx->tos;
+ unsigned lbr_idx, mask;
+ int i;
mask = x86_pmu.lbr_nr - 1;
for (i = 0; i < task_ctx->valid_lbrs; i++) {
@@ -444,7 +435,7 @@ void intel_pmu_lbr_restore(void *ctx)
lbr_idx = (tos - i) & mask;
wrlbr_from(lbr_idx, 0);
wrlbr_to(lbr_idx, 0);
- if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+ if (need_info)
wrlbr_info(lbr_idx, 0);
}
@@ -519,9 +510,9 @@ static void __intel_pmu_lbr_restore(void *ctx)
void intel_pmu_lbr_save(void *ctx)
{
- bool need_info = x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO;
struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
struct x86_perf_task_context *task_ctx = ctx;
+ bool need_info = x86_pmu.lbr_has_info;
unsigned lbr_idx, mask;
u64 tos;
int i;
@@ -816,7 +807,6 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
{
bool need_info = false, call_stack = false;
unsigned long mask = x86_pmu.lbr_nr - 1;
- int lbr_format = x86_pmu.intel_cap.lbr_format;
u64 tos = intel_pmu_lbr_tos();
int i;
int out = 0;
@@ -831,9 +821,7 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
for (i = 0; i < num; i++) {
unsigned long lbr_idx = (tos - i) & mask;
u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
- int skip = 0;
u16 cycles = 0;
- int lbr_flags = lbr_desc[lbr_format];
from = rdlbr_from(lbr_idx, NULL);
to = rdlbr_to(lbr_idx, NULL);
@@ -845,37 +833,39 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
if (call_stack && !from)
break;
- if (lbr_format == LBR_FORMAT_INFO && need_info) {
- u64 info;
-
- info = rdlbr_info(lbr_idx, NULL);
- mis = !!(info & LBR_INFO_MISPRED);
- pred = !mis;
- in_tx = !!(info & LBR_INFO_IN_TX);
- abort = !!(info & LBR_INFO_ABORT);
- cycles = (info & LBR_INFO_CYCLES);
- }
-
- if (lbr_format == LBR_FORMAT_TIME) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
- skip = 1;
- cycles = ((to >> 48) & LBR_INFO_CYCLES);
-
- to = (u64)((((s64)to) << 16) >> 16);
- }
-
- if (lbr_flags & LBR_EIP_FLAGS) {
- mis = !!(from & LBR_FROM_FLAG_MISPRED);
- pred = !mis;
- skip = 1;
- }
- if (lbr_flags & LBR_TSX) {
- in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
- abort = !!(from & LBR_FROM_FLAG_ABORT);
- skip = 3;
+ if (x86_pmu.lbr_has_info) {
+ if (need_info) {
+ u64 info;
+
+ info = rdlbr_info(lbr_idx, NULL);
+ mis = !!(info & LBR_INFO_MISPRED);
+ pred = !mis;
+ cycles = (info & LBR_INFO_CYCLES);
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(info & LBR_INFO_IN_TX);
+ abort = !!(info & LBR_INFO_ABORT);
+ }
+ }
+ } else {
+ int skip = 0;
+
+ if (x86_pmu.lbr_from_flags) {
+ mis = !!(from & LBR_FROM_FLAG_MISPRED);
+ pred = !mis;
+ skip = 1;
+ }
+ if (x86_pmu.lbr_has_tsx) {
+ in_tx = !!(from & LBR_FROM_FLAG_IN_TX);
+ abort = !!(from & LBR_FROM_FLAG_ABORT);
+ skip = 3;
+ }
+ from = (u64)((((s64)from) << skip) >> skip);
+
+ if (x86_pmu.lbr_to_cycles) {
+ cycles = ((to >> 48) & LBR_INFO_CYCLES);
+ to = (u64)((((s64)to) << 16) >> 16);
+ }
}
- from = (u64)((((s64)from) << skip) >> skip);
/*
* Some CPUs report duplicated abort records,
@@ -903,37 +893,40 @@ void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
cpuc->lbr_stack.hw_idx = tos;
}
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_mispred);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_cycles);
+static DEFINE_STATIC_KEY_FALSE(x86_lbr_type);
+
static __always_inline int get_lbr_br_type(u64 info)
{
- if (!static_cpu_has(X86_FEATURE_ARCH_LBR) || !x86_pmu.lbr_br_type)
- return 0;
+ int type = 0;
- return (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+ if (static_branch_likely(&x86_lbr_type))
+ type = (info & LBR_INFO_BR_TYPE) >> LBR_INFO_BR_TYPE_OFFSET;
+
+ return type;
}
static __always_inline bool get_lbr_mispred(u64 info)
{
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
- return 0;
+ bool mispred = 0;
- return !!(info & LBR_INFO_MISPRED);
-}
+ if (static_branch_likely(&x86_lbr_mispred))
+ mispred = !!(info & LBR_INFO_MISPRED);
-static __always_inline bool get_lbr_predicted(u64 info)
-{
- if (static_cpu_has(X86_FEATURE_ARCH_LBR) && !x86_pmu.lbr_mispred)
- return 0;
-
- return !(info & LBR_INFO_MISPRED);
+ return mispred;
}
static __always_inline u16 get_lbr_cycles(u64 info)
{
+ u16 cycles = info & LBR_INFO_CYCLES;
+
if (static_cpu_has(X86_FEATURE_ARCH_LBR) &&
- !(x86_pmu.lbr_timed_lbr && info & LBR_INFO_CYC_CNT_VALID))
- return 0;
+ (!static_branch_likely(&x86_lbr_cycles) ||
+ !(info & LBR_INFO_CYC_CNT_VALID)))
+ cycles = 0;
- return info & LBR_INFO_CYCLES;
+ return cycles;
}
static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
@@ -961,7 +954,7 @@ static void intel_pmu_store_lbr(struct cpu_hw_events *cpuc,
e->from = from;
e->to = to;
e->mispred = get_lbr_mispred(info);
- e->predicted = get_lbr_predicted(info);
+ e->predicted = !e->mispred;
e->in_tx = !!(info & LBR_INFO_IN_TX);
e->abort = !!(info & LBR_INFO_ABORT);
e->cycles = get_lbr_cycles(info);
@@ -1120,7 +1113,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
if ((br_type & PERF_SAMPLE_BRANCH_NO_CYCLES) &&
(br_type & PERF_SAMPLE_BRANCH_NO_FLAGS) &&
- (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO))
+ x86_pmu.lbr_has_info)
reg->config |= LBR_NO_INFO;
return 0;
@@ -1706,6 +1699,38 @@ void intel_pmu_lbr_init_knl(void)
x86_pmu.intel_cap.lbr_format = LBR_FORMAT_EIP_FLAGS;
}
+void intel_pmu_lbr_init(void)
+{
+ switch (x86_pmu.intel_cap.lbr_format) {
+ case LBR_FORMAT_EIP_FLAGS2:
+ x86_pmu.lbr_has_tsx = 1;
+ fallthrough;
+ case LBR_FORMAT_EIP_FLAGS:
+ x86_pmu.lbr_from_flags = 1;
+ break;
+
+ case LBR_FORMAT_INFO:
+ x86_pmu.lbr_has_tsx = 1;
+ fallthrough;
+ case LBR_FORMAT_INFO2:
+ x86_pmu.lbr_has_info = 1;
+ break;
+
+ case LBR_FORMAT_TIME:
+ x86_pmu.lbr_from_flags = 1;
+ x86_pmu.lbr_to_cycles = 1;
+ break;
+ }
+
+ if (x86_pmu.lbr_has_info) {
+ /*
+ * Only used in combination with baseline pebs.
+ */
+ static_branch_enable(&x86_lbr_mispred);
+ static_branch_enable(&x86_lbr_cycles);
+ }
+}
+
/*
* LBR state size is variable based on the max number of registers.
* This calculates the expected state size, which should match
@@ -1726,6 +1751,9 @@ static bool is_arch_lbr_xsave_available(void)
* Check the LBR state with the corresponding software structure.
* Disable LBR XSAVES support if the size doesn't match.
*/
+ if (xfeature_size(XFEATURE_LBR) == 0)
+ return false;
+
if (WARN_ON(xfeature_size(XFEATURE_LBR) != get_lbr_state_size()))
return false;
@@ -1765,6 +1793,12 @@ void __init intel_pmu_arch_lbr_init(void)
x86_pmu.lbr_br_type = ecx.split.lbr_br_type;
x86_pmu.lbr_nr = lbr_nr;
+ if (x86_pmu.lbr_mispred)
+ static_branch_enable(&x86_lbr_mispred);
+ if (x86_pmu.lbr_timed_lbr)
+ static_branch_enable(&x86_lbr_cycles);
+ if (x86_pmu.lbr_br_type)
+ static_branch_enable(&x86_lbr_type);
arch_lbr_xsave = is_arch_lbr_xsave_available();
if (arch_lbr_xsave) {
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index f1ba6ab2e97e..e497da9bf427 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -1762,7 +1762,7 @@ static const struct intel_uncore_init_fun rkl_uncore_init __initconst = {
static const struct intel_uncore_init_fun adl_uncore_init __initconst = {
.cpu_init = adl_uncore_cpu_init,
- .mmio_init = tgl_uncore_mmio_init,
+ .mmio_init = adl_uncore_mmio_init,
};
static const struct intel_uncore_init_fun icx_uncore_init __initconst = {
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index b9687980aab6..2adeaf4de4df 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -584,10 +584,11 @@ void snb_uncore_cpu_init(void);
void nhm_uncore_cpu_init(void);
void skl_uncore_cpu_init(void);
void icl_uncore_cpu_init(void);
-void adl_uncore_cpu_init(void);
void tgl_uncore_cpu_init(void);
+void adl_uncore_cpu_init(void);
void tgl_uncore_mmio_init(void);
void tgl_l_uncore_mmio_init(void);
+void adl_uncore_mmio_init(void);
int snb_pci2phy_map_init(int devid);
/* uncore_snbep.c */
diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 3049c646fa20..6ddadb482f68 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -494,8 +494,8 @@ void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box)
writel(0, box->io_addr);
}
-static void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
- struct perf_event *event)
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 6d735611c281..cfaf558bdb6b 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -139,6 +139,8 @@ void intel_generic_uncore_mmio_disable_box(struct intel_uncore_box *box);
void intel_generic_uncore_mmio_enable_box(struct intel_uncore_box *box);
void intel_generic_uncore_mmio_disable_event(struct intel_uncore_box *box,
struct perf_event *event);
+void intel_generic_uncore_mmio_enable_event(struct intel_uncore_box *box,
+ struct perf_event *event);
void intel_generic_uncore_pci_init_box(struct intel_uncore_box *box);
void intel_generic_uncore_pci_disable_box(struct intel_uncore_box *box);
diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c
index 0f63706cdadf..f698a55bde81 100644
--- a/arch/x86/events/intel/uncore_snb.c
+++ b/arch/x86/events/intel/uncore_snb.c
@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/* Nehalem/SandBridge/Haswell/Broadwell/Skylake uncore support */
#include "uncore.h"
+#include "uncore_discovery.h"
/* Uncore IMC PCI IDs */
#define PCI_DEVICE_ID_INTEL_SNB_IMC 0x0100
@@ -64,6 +65,20 @@
#define PCI_DEVICE_ID_INTEL_RKL_2_IMC 0x4c53
#define PCI_DEVICE_ID_INTEL_ADL_1_IMC 0x4660
#define PCI_DEVICE_ID_INTEL_ADL_2_IMC 0x4641
+#define PCI_DEVICE_ID_INTEL_ADL_3_IMC 0x4601
+#define PCI_DEVICE_ID_INTEL_ADL_4_IMC 0x4602
+#define PCI_DEVICE_ID_INTEL_ADL_5_IMC 0x4609
+#define PCI_DEVICE_ID_INTEL_ADL_6_IMC 0x460a
+#define PCI_DEVICE_ID_INTEL_ADL_7_IMC 0x4621
+#define PCI_DEVICE_ID_INTEL_ADL_8_IMC 0x4623
+#define PCI_DEVICE_ID_INTEL_ADL_9_IMC 0x4629
+#define PCI_DEVICE_ID_INTEL_ADL_10_IMC 0x4637
+#define PCI_DEVICE_ID_INTEL_ADL_11_IMC 0x463b
+#define PCI_DEVICE_ID_INTEL_ADL_12_IMC 0x4648
+#define PCI_DEVICE_ID_INTEL_ADL_13_IMC 0x4649
+#define PCI_DEVICE_ID_INTEL_ADL_14_IMC 0x4650
+#define PCI_DEVICE_ID_INTEL_ADL_15_IMC 0x4668
+#define PCI_DEVICE_ID_INTEL_ADL_16_IMC 0x4670
/* SNB event control */
#define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff
@@ -155,6 +170,7 @@
DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7");
DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15");
+DEFINE_UNCORE_FORMAT_ATTR(chmask, chmask, "config:8-11");
DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18");
DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23");
DEFINE_UNCORE_FORMAT_ATTR(cmask5, cmask, "config:24-28");
@@ -1334,6 +1350,62 @@ static const struct pci_device_id tgl_uncore_pci_ids[] = {
PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_2_IMC),
.driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
},
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_3_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_4_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_5_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_6_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_7_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_8_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_9_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_10_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_11_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_12_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_13_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_14_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_15_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
+ { /* IMC */
+ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ADL_16_IMC),
+ .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0),
+ },
{ /* end: all zeroes */ }
};
@@ -1390,7 +1462,8 @@ static struct pci_dev *tgl_uncore_get_mc_dev(void)
#define TGL_UNCORE_MMIO_IMC_MEM_OFFSET 0x10000
#define TGL_UNCORE_PCI_IMC_MAP_SIZE 0xe000
-static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+static void __uncore_imc_init_box(struct intel_uncore_box *box,
+ unsigned int base_offset)
{
struct pci_dev *pdev = tgl_uncore_get_mc_dev();
struct intel_uncore_pmu *pmu = box->pmu;
@@ -1417,11 +1490,17 @@ static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
addr |= ((resource_size_t)mch_bar << 32);
#endif
+ addr += base_offset;
box->io_addr = ioremap(addr, type->mmio_map_size);
if (!box->io_addr)
pr_warn("perf uncore: Failed to ioremap for %s.\n", type->name);
}
+static void tgl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, 0);
+}
+
static struct intel_uncore_ops tgl_uncore_imc_freerunning_ops = {
.init_box = tgl_uncore_imc_freerunning_init_box,
.exit_box = uncore_mmio_exit_box,
@@ -1469,3 +1548,136 @@ void tgl_uncore_mmio_init(void)
}
/* end of Tiger Lake MMIO uncore support */
+
+/* Alder Lake MMIO uncore support */
+#define ADL_UNCORE_IMC_BASE 0xd900
+#define ADL_UNCORE_IMC_MAP_SIZE 0x200
+#define ADL_UNCORE_IMC_CTR 0xe8
+#define ADL_UNCORE_IMC_CTRL 0xd0
+#define ADL_UNCORE_IMC_GLOBAL_CTL 0xc0
+#define ADL_UNCORE_IMC_BOX_CTL 0xc4
+#define ADL_UNCORE_IMC_FREERUNNING_BASE 0xd800
+#define ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE 0x100
+
+#define ADL_UNCORE_IMC_CTL_FRZ (1 << 0)
+#define ADL_UNCORE_IMC_CTL_RST_CTRL (1 << 1)
+#define ADL_UNCORE_IMC_CTL_RST_CTRS (1 << 2)
+#define ADL_UNCORE_IMC_CTL_INT (ADL_UNCORE_IMC_CTL_RST_CTRL | \
+ ADL_UNCORE_IMC_CTL_RST_CTRS)
+
+static void adl_uncore_imc_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_BASE);
+
+ /* The global control in MC1 can control both MCs. */
+ if (box->io_addr && (box->pmu->pmu_idx == 1))
+ writel(ADL_UNCORE_IMC_CTL_INT, box->io_addr + ADL_UNCORE_IMC_GLOBAL_CTL);
+}
+
+static void adl_uncore_mmio_disable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(ADL_UNCORE_IMC_CTL_FRZ, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+static void adl_uncore_mmio_enable_box(struct intel_uncore_box *box)
+{
+ if (!box->io_addr)
+ return;
+
+ writel(0, box->io_addr + uncore_mmio_box_ctl(box));
+}
+
+static struct intel_uncore_ops adl_uncore_mmio_ops = {
+ .init_box = adl_uncore_imc_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .disable_box = adl_uncore_mmio_disable_box,
+ .enable_box = adl_uncore_mmio_enable_box,
+ .disable_event = intel_generic_uncore_mmio_disable_event,
+ .enable_event = intel_generic_uncore_mmio_enable_event,
+ .read_counter = uncore_mmio_read_counter,
+};
+
+#define ADL_UNC_CTL_CHMASK_MASK 0x00000f00
+#define ADL_UNC_IMC_EVENT_MASK (SNB_UNC_CTL_EV_SEL_MASK | \
+ ADL_UNC_CTL_CHMASK_MASK | \
+ SNB_UNC_CTL_EDGE_DET)
+
+static struct attribute *adl_uncore_imc_formats_attr[] = {
+ &format_attr_event.attr,
+ &format_attr_chmask.attr,
+ &format_attr_edge.attr,
+ NULL,
+};
+
+static const struct attribute_group adl_uncore_imc_format_group = {
+ .name = "format",
+ .attrs = adl_uncore_imc_formats_attr,
+};
+
+static struct intel_uncore_type adl_uncore_imc = {
+ .name = "imc",
+ .num_counters = 5,
+ .num_boxes = 2,
+ .perf_ctr_bits = 64,
+ .perf_ctr = ADL_UNCORE_IMC_CTR,
+ .event_ctl = ADL_UNCORE_IMC_CTRL,
+ .event_mask = ADL_UNC_IMC_EVENT_MASK,
+ .box_ctl = ADL_UNCORE_IMC_BOX_CTL,
+ .mmio_offset = 0,
+ .mmio_map_size = ADL_UNCORE_IMC_MAP_SIZE,
+ .ops = &adl_uncore_mmio_ops,
+ .format_group = &adl_uncore_imc_format_group,
+};
+
+enum perf_adl_uncore_imc_freerunning_types {
+ ADL_MMIO_UNCORE_IMC_DATA_TOTAL,
+ ADL_MMIO_UNCORE_IMC_DATA_READ,
+ ADL_MMIO_UNCORE_IMC_DATA_WRITE,
+ ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX
+};
+
+static struct freerunning_counters adl_uncore_imc_freerunning[] = {
+ [ADL_MMIO_UNCORE_IMC_DATA_TOTAL] = { 0x40, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_READ] = { 0x58, 0x0, 0x0, 1, 64 },
+ [ADL_MMIO_UNCORE_IMC_DATA_WRITE] = { 0xA0, 0x0, 0x0, 1, 64 },
+};
+
+static void adl_uncore_imc_freerunning_init_box(struct intel_uncore_box *box)
+{
+ __uncore_imc_init_box(box, ADL_UNCORE_IMC_FREERUNNING_BASE);
+}
+
+static struct intel_uncore_ops adl_uncore_imc_freerunning_ops = {
+ .init_box = adl_uncore_imc_freerunning_init_box,
+ .exit_box = uncore_mmio_exit_box,
+ .read_counter = uncore_mmio_read_counter,
+ .hw_config = uncore_freerunning_hw_config,
+};
+
+static struct intel_uncore_type adl_uncore_imc_free_running = {
+ .name = "imc_free_running",
+ .num_counters = 3,
+ .num_boxes = 2,
+ .num_freerunning_types = ADL_MMIO_UNCORE_IMC_FREERUNNING_TYPE_MAX,
+ .mmio_map_size = ADL_UNCORE_IMC_FREERUNNING_MAP_SIZE,
+ .freerunning = adl_uncore_imc_freerunning,
+ .ops = &adl_uncore_imc_freerunning_ops,
+ .event_descs = tgl_uncore_imc_events,
+ .format_group = &tgl_uncore_imc_format_group,
+};
+
+static struct intel_uncore_type *adl_mmio_uncores[] = {
+ &adl_uncore_imc,
+ &adl_uncore_imc_free_running,
+ NULL
+};
+
+void adl_uncore_mmio_init(void)
+{
+ uncore_mmio_uncores = adl_mmio_uncores;
+}
+
+/* end of Alder Lake MMIO uncore support */
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c
index 3660f698fb2a..ed869443efb2 100644
--- a/arch/x86/events/intel/uncore_snbep.c
+++ b/arch/x86/events/intel/uncore_snbep.c
@@ -5482,7 +5482,7 @@ static struct intel_uncore_type icx_uncore_imc = {
.fixed_ctr_bits = 48,
.fixed_ctr = SNR_IMC_MMIO_PMON_FIXED_CTR,
.fixed_ctl = SNR_IMC_MMIO_PMON_FIXED_CTL,
- .event_descs = hswep_uncore_imc_events,
+ .event_descs = snr_uncore_imc_events,
.perf_ctr = SNR_IMC_MMIO_PMON_CTR0,
.event_ctl = SNR_IMC_MMIO_PMON_CTL0,
.event_mask = SNBEP_PMON_RAW_EVENT_MASK,
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 9d376e528dfc..150261d929b9 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -215,7 +215,8 @@ enum {
LBR_FORMAT_EIP_FLAGS2 = 0x04,
LBR_FORMAT_INFO = 0x05,
LBR_FORMAT_TIME = 0x06,
- LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME,
+ LBR_FORMAT_INFO2 = 0x07,
+ LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_INFO2,
};
enum {
@@ -840,6 +841,11 @@ struct x86_pmu {
bool lbr_double_abort; /* duplicated lbr aborts */
bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */
+ unsigned int lbr_has_info:1;
+ unsigned int lbr_has_tsx:1;
+ unsigned int lbr_from_flags:1;
+ unsigned int lbr_to_cycles:1;
+
/*
* Intel Architectural LBR CPUID Enumeration
*/
@@ -1392,6 +1398,8 @@ void intel_pmu_lbr_init_skl(void);
void intel_pmu_lbr_init_knl(void);
+void intel_pmu_lbr_init(void);
+
void intel_pmu_arch_lbr_init(void);
void intel_pmu_pebs_data_source_nhm(void);
diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c
index 85feafacc445..77e3a47af5ad 100644
--- a/arch/x86/events/rapl.c
+++ b/arch/x86/events/rapl.c
@@ -536,11 +536,14 @@ static struct perf_msr intel_rapl_spr_msrs[] = {
* - perf_msr_probe(PERF_RAPL_MAX)
* - want to use same event codes across both architectures
*/
-static struct perf_msr amd_rapl_msrs[PERF_RAPL_MAX] = {
- [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr },
+static struct perf_msr amd_rapl_msrs[] = {
+ [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 },
+ [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
+ [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 },
+ [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 },
+ [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, 0, false, 0 },
};
-
static int rapl_cpu_offline(unsigned int cpu)
{
struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 0367efdc5b7a..a288ecd230ab 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -380,8 +380,6 @@ static __always_inline int fls64(__u64 x)
#include <asm-generic/bitops/fls64.h>
#endif
-#include <asm-generic/bitops/find.h>
-
#include <asm-generic/bitops/sched.h>
#include <asm/arch_hweight.h>
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index f658bb4dbb74..631d5040b31e 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -55,6 +55,7 @@ KVM_X86_OP_NULL(tlb_remote_flush)
KVM_X86_OP_NULL(tlb_remote_flush_with_range)
KVM_X86_OP(tlb_flush_gva)
KVM_X86_OP(tlb_flush_guest)
+KVM_X86_OP(vcpu_pre_run)
KVM_X86_OP(run)
KVM_X86_OP_NULL(handle_exit)
KVM_X86_OP_NULL(skip_emulated_instruction)
@@ -98,8 +99,6 @@ KVM_X86_OP(handle_exit_irqoff)
KVM_X86_OP_NULL(request_immediate_exit)
KVM_X86_OP(sched_in)
KVM_X86_OP_NULL(update_cpu_dirty_logging)
-KVM_X86_OP_NULL(pre_block)
-KVM_X86_OP_NULL(post_block)
KVM_X86_OP_NULL(vcpu_blocking)
KVM_X86_OP_NULL(vcpu_unblocking)
KVM_X86_OP_NULL(update_pi_irte)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0677b9ea01c9..6e7c545bc7ee 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1381,6 +1381,7 @@ struct kvm_x86_ops {
*/
void (*tlb_flush_guest)(struct kvm_vcpu *vcpu);
+ int (*vcpu_pre_run)(struct kvm_vcpu *vcpu);
enum exit_fastpath_completion (*run)(struct kvm_vcpu *vcpu);
int (*handle_exit)(struct kvm_vcpu *vcpu,
enum exit_fastpath_completion exit_fastpath);
@@ -1454,18 +1455,6 @@ struct kvm_x86_ops {
const struct kvm_pmu_ops *pmu_ops;
const struct kvm_x86_nested_ops *nested_ops;
- /*
- * Architecture specific hooks for vCPU blocking due to
- * HLT instruction.
- * Returns for .pre_block():
- * - 0 means continue to block the vCPU.
- * - 1 means we cannot block the vCPU since some event
- * happens during this period, such as, 'ON' bit in
- * posted-interrupts descriptor is set.
- */
- int (*pre_block)(struct kvm_vcpu *vcpu);
- void (*post_block)(struct kvm_vcpu *vcpu);
-
void (*vcpu_blocking)(struct kvm_vcpu *vcpu);
void (*vcpu_unblocking)(struct kvm_vcpu *vcpu);
@@ -1494,7 +1483,8 @@ struct kvm_x86_ops {
int (*get_msr_feature)(struct kvm_msr_entry *entry);
- bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
+ bool (*can_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len);
bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu);
int (*enable_direct_tlbflush)(struct kvm_vcpu *vcpu);
@@ -1507,6 +1497,7 @@ struct kvm_x86_ops {
};
struct kvm_x86_nested_ops {
+ void (*leave_nested)(struct kvm_vcpu *vcpu);
int (*check_events)(struct kvm_vcpu *vcpu);
bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
void (*triple_fault)(struct kvm_vcpu *vcpu);
@@ -1872,7 +1863,6 @@ int kvm_cpu_has_extint(struct kvm_vcpu *v);
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
unsigned long ipi_bitmap_high, u32 min,
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 2da3316bb559..bf6e96011dfe 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -452,6 +452,9 @@ struct kvm_sync_regs {
#define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001
+/* attributes for system fd (group 0) */
+#define KVM_X86_XCOMP_GUEST_SUPP 0
+
struct kvm_vmx_nested_state_data {
__u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
__u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE];
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index c132daabe615..3e6f6b448f6a 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -760,9 +760,9 @@ void __init lapic_update_legacy_vectors(void)
void __init lapic_assign_system_vectors(void)
{
- unsigned int i, vector = 0;
+ unsigned int i, vector;
- for_each_set_bit_from(vector, system_vectors, NR_VECTORS)
+ for_each_set_bit(vector, system_vectors, NR_VECTORS)
irq_matrix_assign_system(vector_matrix, vector, false);
if (nr_legacy_irqs() > 1)
diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c
index a1e2f41796dc..9f4b508886dd 100644
--- a/arch/x86/kernel/cpu/mce/amd.c
+++ b/arch/x86/kernel/cpu/mce/amd.c
@@ -423,7 +423,7 @@ static void threshold_restart_bank(void *_tr)
u32 hi, lo;
/* sysfs write might race against an offline operation */
- if (this_cpu_read(threshold_banks))
+ if (!this_cpu_read(threshold_banks) && !tr->set_lvt_off)
return;
rdmsr(tr->b->address, lo, hi);
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index bb9a46a804bf..baafbb37be67 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -486,6 +486,7 @@ static void intel_ppin_init(struct cpuinfo_x86 *c)
case INTEL_FAM6_BROADWELL_X:
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_ICELAKE_X:
+ case INTEL_FAM6_ICELAKE_D:
case INTEL_FAM6_SAPPHIRERAPIDS_X:
case INTEL_FAM6_XEON_PHI_KNL:
case INTEL_FAM6_XEON_PHI_KNM:
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index fd2d3ab38ebb..dc7da08bc700 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -515,6 +515,7 @@ static const struct intel_early_ops gen11_early_ops __initconst = {
.stolen_size = gen9_stolen_size,
};
+/* Intel integrated GPUs for which we need to reserve "stolen memory" */
static const struct pci_device_id intel_early_ids[] __initconst = {
INTEL_I830_IDS(&i830_early_ops),
INTEL_I845G_IDS(&i845_early_ops),
@@ -592,6 +593,13 @@ static void __init intel_graphics_quirks(int num, int slot, int func)
u16 device;
int i;
+ /*
+ * Reserve "stolen memory" for an integrated GPU. If we've already
+ * found one, there's nothing to do for other (discrete) GPUs.
+ */
+ if (resource_size(&intel_graphics_stolen_res))
+ return;
+
device = read_pci_config_16(num, slot, func, PCI_DEVICE_ID);
for (i = 0; i < ARRAY_SIZE(intel_early_ids); i++) {
@@ -704,7 +712,7 @@ static struct chipset early_qrk[] __initdata = {
{ PCI_VENDOR_ID_INTEL, 0x3406, PCI_CLASS_BRIDGE_HOST,
PCI_BASE_CLASS_BRIDGE, 0, intel_remapping_check },
{ PCI_VENDOR_ID_INTEL, PCI_ANY_ID, PCI_CLASS_DISPLAY_VGA, PCI_ANY_ID,
- QFLAG_APPLY_ONCE, intel_graphics_quirks },
+ 0, intel_graphics_quirks },
/*
* HPET on the current version of the Baytrail platform has accuracy
* problems: it will halt in deep idle state - so we disable it.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 882213df3713..71f336425e58 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1435,8 +1435,12 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
hpet_rtc_timer_reinit();
memset(&curr_time, 0, sizeof(struct rtc_time));
- if (hpet_rtc_flags & (RTC_UIE | RTC_AIE))
- mc146818_get_time(&curr_time);
+ if (hpet_rtc_flags & (RTC_UIE | RTC_AIE)) {
+ if (unlikely(mc146818_get_time(&curr_time) < 0)) {
+ pr_err_ratelimited("unable to read current time from RTC\n");
+ return IRQ_HANDLED;
+ }
+ }
if (hpet_rtc_flags & RTC_UIE &&
curr_time.tm_sec != hpet_prev_update_sec) {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index c55e57b30e81..28be02adc669 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -119,6 +119,29 @@ static int kvm_check_cpuid(struct kvm_vcpu *vcpu,
return fpu_enable_guest_xfd_features(&vcpu->arch.guest_fpu, xfeatures);
}
+/* Check whether the supplied CPUID data is equal to what is already set for the vCPU. */
+static int kvm_cpuid_check_equal(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *orig;
+ int i;
+
+ if (nent != vcpu->arch.cpuid_nent)
+ return -EINVAL;
+
+ for (i = 0; i < nent; i++) {
+ orig = &vcpu->arch.cpuid_entries[i];
+ if (e2[i].function != orig->function ||
+ e2[i].index != orig->index ||
+ e2[i].flags != orig->flags ||
+ e2[i].eax != orig->eax || e2[i].ebx != orig->ebx ||
+ e2[i].ecx != orig->ecx || e2[i].edx != orig->edx)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
{
u32 function;
@@ -145,14 +168,21 @@ static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu)
}
}
-static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+static struct kvm_cpuid_entry2 *__kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu,
+ struct kvm_cpuid_entry2 *entries, int nent)
{
u32 base = vcpu->arch.kvm_cpuid_base;
if (!base)
return NULL;
- return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0);
+ return cpuid_entry2_find(entries, nent, base | KVM_CPUID_FEATURES, 0);
+}
+
+static struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu)
+{
+ return __kvm_find_kvm_cpuid_features(vcpu, vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent);
}
void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
@@ -167,11 +197,28 @@ void kvm_update_pv_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.pv_cpuid.features = best->eax;
}
-void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+/*
+ * Calculate guest's supported XCR0 taking into account guest CPUID data and
+ * supported_xcr0 (comprised of host configuration and KVM_SUPPORTED_XCR0).
+ */
+static u64 cpuid_get_supported_xcr0(struct kvm_cpuid_entry2 *entries, int nent)
{
struct kvm_cpuid_entry2 *best;
- best = kvm_find_cpuid_entry(vcpu, 1, 0);
+ best = cpuid_entry2_find(entries, nent, 0xd, 0);
+ if (!best)
+ return 0;
+
+ return (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
+}
+
+static void __kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *entries,
+ int nent)
+{
+ struct kvm_cpuid_entry2 *best;
+ u64 guest_supported_xcr0 = cpuid_get_supported_xcr0(entries, nent);
+
+ best = cpuid_entry2_find(entries, nent, 1, 0);
if (best) {
/* Update OSXSAVE bit */
if (boot_cpu_has(X86_FEATURE_XSAVE))
@@ -182,32 +229,52 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE);
}
- best = kvm_find_cpuid_entry(vcpu, 7, 0);
+ best = cpuid_entry2_find(entries, nent, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
kvm_read_cr4_bits(vcpu, X86_CR4_PKE));
- best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
+ best = cpuid_entry2_find(entries, nent, 0xD, 0);
if (best)
best->ebx = xstate_required_size(vcpu->arch.xcr0, false);
- best = kvm_find_cpuid_entry(vcpu, 0xD, 1);
+ best = cpuid_entry2_find(entries, nent, 0xD, 1);
if (best && (cpuid_entry_has(best, X86_FEATURE_XSAVES) ||
cpuid_entry_has(best, X86_FEATURE_XSAVEC)))
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
- best = kvm_find_kvm_cpuid_features(vcpu);
+ best = __kvm_find_kvm_cpuid_features(vcpu, entries, nent);
if (kvm_hlt_in_guest(vcpu->kvm) && best &&
(best->eax & (1 << KVM_FEATURE_PV_UNHALT)))
best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT);
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT)) {
- best = kvm_find_cpuid_entry(vcpu, 0x1, 0);
+ best = cpuid_entry2_find(entries, nent, 0x1, 0);
if (best)
cpuid_entry_change(best, X86_FEATURE_MWAIT,
vcpu->arch.ia32_misc_enable_msr &
MSR_IA32_MISC_ENABLE_MWAIT);
}
+
+ /*
+ * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+ * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+ * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
+ * at the time of EENTER, thus adjust the allowed XFRM by the guest's
+ * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
+ * '1' even on CPUs that don't support XSAVE.
+ */
+ best = cpuid_entry2_find(entries, nent, 0x12, 0x1);
+ if (best) {
+ best->ecx &= guest_supported_xcr0 & 0xffffffff;
+ best->edx &= guest_supported_xcr0 >> 32;
+ best->ecx |= XFEATURE_MASK_FPSSE;
+ }
+}
+
+void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
+{
+ __kvm_update_cpuid_runtime(vcpu, vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
@@ -226,27 +293,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_apic_set_version(vcpu);
}
- best = kvm_find_cpuid_entry(vcpu, 0xD, 0);
- if (!best)
- vcpu->arch.guest_supported_xcr0 = 0;
- else
- vcpu->arch.guest_supported_xcr0 =
- (best->eax | ((u64)best->edx << 32)) & supported_xcr0;
-
- /*
- * Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
- * the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
- * requested XCR0 value. The enclave's XFRM must be a subset of XCRO
- * at the time of EENTER, thus adjust the allowed XFRM by the guest's
- * supported XCR0. Similar to XCR0 handling, FP and SSE are forced to
- * '1' even on CPUs that don't support XSAVE.
- */
- best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
- if (best) {
- best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff;
- best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
- best->ecx |= XFEATURE_MASK_FPSSE;
- }
+ vcpu->arch.guest_supported_xcr0 =
+ cpuid_get_supported_xcr0(vcpu->arch.cpuid_entries, vcpu->arch.cpuid_nent);
kvm_update_pv_runtime(vcpu);
@@ -298,6 +346,28 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
{
int r;
+ __kvm_update_cpuid_runtime(vcpu, e2, nent);
+
+ /*
+ * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
+ * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
+ * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
+ * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
+ * the core vCPU model on the fly. It would've been better to forbid any
+ * KVM_SET_CPUID{,2} calls after KVM_RUN altogether but unfortunately
+ * some VMMs (e.g. QEMU) reuse vCPU fds for CPU hotplug/unplug and do
+ * KVM_SET_CPUID{,2} again. To support this legacy behavior, check
+ * whether the supplied CPUID data is equal to what's already set.
+ */
+ if (vcpu->arch.last_vmentry_cpu != -1) {
+ r = kvm_cpuid_check_equal(vcpu, e2, nent);
+ if (r)
+ return r;
+
+ kvfree(e2);
+ return 0;
+ }
+
r = kvm_check_cpuid(vcpu, e2, nent);
if (r)
return r;
@@ -307,7 +377,6 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
vcpu->arch.cpuid_nent = nent;
kvm_update_kvm_cpuid_base(vcpu);
- kvm_update_cpuid_runtime(vcpu);
kvm_vcpu_after_set_cpuid(vcpu);
return 0;
@@ -795,10 +864,10 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
perf_get_x86_pmu_capability(&cap);
/*
- * Only support guest architectural pmu on a host
- * with architectural pmu.
+ * The guest architecture pmu is only supported if the architecture
+ * pmu exists on the host and the module parameters allow it.
*/
- if (!cap.version)
+ if (!cap.version || !enable_pmu)
memset(&cap, 0, sizeof(cap));
eax.split.version_id = min(cap.version, 2);
@@ -837,13 +906,14 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
}
break;
case 0xd: {
- u64 guest_perm = xstate_get_guest_group_perm();
+ u64 permitted_xcr0 = supported_xcr0 & xstate_get_guest_group_perm();
+ u64 permitted_xss = supported_xss;
- entry->eax &= supported_xcr0 & guest_perm;
- entry->ebx = xstate_required_size(supported_xcr0, false);
+ entry->eax &= permitted_xcr0;
+ entry->ebx = xstate_required_size(permitted_xcr0, false);
entry->ecx = entry->ebx;
- entry->edx &= (supported_xcr0 & guest_perm) >> 32;
- if (!supported_xcr0)
+ entry->edx &= permitted_xcr0 >> 32;
+ if (!permitted_xcr0)
break;
entry = do_host_cpuid(array, function, 1);
@@ -852,20 +922,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
cpuid_entry_override(entry, CPUID_D_1_EAX);
if (entry->eax & (F(XSAVES)|F(XSAVEC)))
- entry->ebx = xstate_required_size(supported_xcr0 | supported_xss,
+ entry->ebx = xstate_required_size(permitted_xcr0 | permitted_xss,
true);
else {
- WARN_ON_ONCE(supported_xss != 0);
+ WARN_ON_ONCE(permitted_xss != 0);
entry->ebx = 0;
}
- entry->ecx &= supported_xss;
- entry->edx &= supported_xss >> 32;
+ entry->ecx &= permitted_xss;
+ entry->edx &= permitted_xss >> 32;
for (i = 2; i < 64; ++i) {
bool s_state;
- if (supported_xcr0 & BIT_ULL(i))
+ if (permitted_xcr0 & BIT_ULL(i))
s_state = false;
- else if (supported_xss & BIT_ULL(i))
+ else if (permitted_xss & BIT_ULL(i))
s_state = true;
else
continue;
@@ -879,13 +949,16 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
* invalid sub-leafs. Only valid sub-leafs should
* reach this point, and they should have a non-zero
* save state size. Furthermore, check whether the
- * processor agrees with supported_xcr0/supported_xss
+ * processor agrees with permitted_xcr0/permitted_xss
* on whether this is an XCR0- or IA32_XSS-managed area.
*/
if (WARN_ON_ONCE(!entry->eax || (entry->ecx & 0x1) != s_state)) {
--array->nent;
continue;
}
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XFD))
+ entry->ecx &= ~BIT_ULL(2);
entry->edx = 0;
}
break;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index c5028e6b0f96..4662469240bc 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1950,7 +1950,6 @@ void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
{
restart_apic_timer(vcpu->arch.apic);
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
{
@@ -1962,7 +1961,6 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
start_sw_timer(apic);
preempt_enable();
}
-EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
void kvm_lapic_restart_hv_timer(struct kvm_vcpu *vcpu)
{
@@ -2631,7 +2629,7 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
- hrtimer_cancel(&apic->lapic_timer.timer);
+ cancel_apic_timer(apic);
apic->lapic_timer.expired_tscdeadline = 0;
apic_update_lvtt(apic);
apic_manage_nmi_watchdog(apic, kvm_lapic_get_reg(apic, APIC_LVT0));
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 1d275e9d76b5..593093b52395 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5756,6 +5756,7 @@ static bool __kvm_zap_rmaps(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
continue;
flush = slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
+
PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
start, end - 1, true, flush);
}
@@ -5825,15 +5826,27 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
}
/*
- * We can flush all the TLBs out of the mmu lock without TLB
- * corruption since we just change the spte from writable to
- * readonly so that we only need to care the case of changing
- * spte from present to present (changing the spte from present
- * to nonpresent will flush all the TLBs immediately), in other
- * words, the only case we care is mmu_spte_update() where we
- * have checked Host-writable | MMU-writable instead of
- * PT_WRITABLE_MASK, that means it does not depend on PT_WRITABLE_MASK
- * anymore.
+ * Flush TLBs if any SPTEs had to be write-protected to ensure that
+ * guest writes are reflected in the dirty bitmap before the memslot
+ * update completes, i.e. before enabling dirty logging is visible to
+ * userspace.
+ *
+ * Perform the TLB flush outside the mmu_lock to reduce the amount of
+ * time the lock is held. However, this does mean that another CPU can
+ * now grab mmu_lock and encounter a write-protected SPTE while CPUs
+ * still have a writable mapping for the associated GFN in their TLB.
+ *
+ * This is safe but requires KVM to be careful when making decisions
+ * based on the write-protection status of an SPTE. Specifically, KVM
+ * also write-protects SPTEs to monitor changes to guest page tables
+ * during shadow paging, and must guarantee no CPUs can write to those
+ * page before the lock is dropped. As mentioned in the previous
+ * paragraph, a write-protected SPTE is no guarantee that CPU cannot
+ * perform writes. So to determine if a TLB flush is truly required, KVM
+ * will clear a separate software-only bit (MMU-writable) and skip the
+ * flush if-and-only-if this bit was already clear.
+ *
+ * See DEFAULT_SPTE_MMU_WRITEABLE for more details.
*/
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, memslot);
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index 351b04ad62a1..73cfe62fdad1 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -216,6 +216,7 @@ u64 kvm_mmu_changed_pte_notifier_make_spte(u64 old_spte, kvm_pfn_t new_pfn)
new_spte &= ~PT_WRITABLE_MASK;
new_spte &= ~shadow_host_writable_mask;
+ new_spte &= ~shadow_mmu_writable_mask;
new_spte = mark_spte_for_access_track(new_spte);
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index a4af2a42695c..be6a007a4af3 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -60,10 +60,6 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
-/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
-#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
-#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
-
/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
* as not-present for access tracking purposes. We do not save the W bit as the
@@ -79,6 +75,35 @@ static_assert(SPTE_TDP_AD_ENABLED_MASK == 0);
static_assert(!(SPTE_TDP_AD_MASK & SHADOW_ACC_TRACK_SAVED_MASK));
/*
+ * *_SPTE_HOST_WRITEABLE (aka Host-writable) indicates whether the host permits
+ * writes to the guest page mapped by the SPTE. This bit is cleared on SPTEs
+ * that map guest pages in read-only memslots and read-only VMAs.
+ *
+ * Invariants:
+ * - If Host-writable is clear, PT_WRITABLE_MASK must be clear.
+ *
+ *
+ * *_SPTE_MMU_WRITEABLE (aka MMU-writable) indicates whether the shadow MMU
+ * allows writes to the guest page mapped by the SPTE. This bit is cleared when
+ * the guest page mapped by the SPTE contains a page table that is being
+ * monitored for shadow paging. In this case the SPTE can only be made writable
+ * by unsyncing the shadow page under the mmu_lock.
+ *
+ * Invariants:
+ * - If MMU-writable is clear, PT_WRITABLE_MASK must be clear.
+ * - If MMU-writable is set, Host-writable must be set.
+ *
+ * If MMU-writable is set, PT_WRITABLE_MASK is normally set but can be cleared
+ * to track writes for dirty logging. For such SPTEs, KVM will locklessly set
+ * PT_WRITABLE_MASK upon the next write from the guest and record the write in
+ * the dirty log (see fast_page_fault()).
+ */
+
+/* Bits 9 and 10 are ignored by all non-EPT PTEs. */
+#define DEFAULT_SPTE_HOST_WRITEABLE BIT_ULL(9)
+#define DEFAULT_SPTE_MMU_WRITEABLE BIT_ULL(10)
+
+/*
* Low ignored bits are at a premium for EPT, use high ignored bits, taking care
* to not overlap the A/D type mask or the saved access bits of access-tracked
* SPTEs when A/D bits are disabled.
@@ -316,8 +341,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
static inline bool spte_can_locklessly_be_made_writable(u64 spte)
{
- return (spte & shadow_host_writable_mask) &&
- (spte & shadow_mmu_writable_mask);
+ if (spte & shadow_mmu_writable_mask) {
+ WARN_ON_ONCE(!(spte & shadow_host_writable_mask));
+ return true;
+ }
+
+ WARN_ON_ONCE(spte & PT_WRITABLE_MASK);
+ return false;
}
static inline u64 get_mmio_spte_generation(u64 spte)
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7b1bc816b7c3..bc9e3553fba2 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1442,12 +1442,12 @@ static bool write_protect_gfn(struct kvm *kvm, struct kvm_mmu_page *root,
!is_last_spte(iter.old_spte, iter.level))
continue;
- if (!is_writable_pte(iter.old_spte))
- break;
-
new_spte = iter.old_spte &
~(PT_WRITABLE_MASK | shadow_mmu_writable_mask);
+ if (new_spte == iter.old_spte)
+ break;
+
tdp_mmu_set_spte(kvm, &iter, new_spte);
spte_set = true;
}
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 261b39cbef6e..f614f95acc6b 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -13,6 +13,8 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
@@ -109,6 +111,9 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type,
.config = config,
};
+ if (type == PERF_TYPE_HARDWARE && config >= PERF_COUNT_HW_MAX)
+ return;
+
attr.sample_period = get_sample_period(pmc, pmc->counter);
if (in_tx)
@@ -169,12 +174,16 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc)
return true;
}
+static int cmp_u64(const void *a, const void *b)
+{
+ return *(__u64 *)a - *(__u64 *)b;
+}
+
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
unsigned config, type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
- int i;
bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -189,16 +198,13 @@ void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (filter) {
- for (i = 0; i < filter->nevents; i++)
- if (filter->events[i] ==
- (eventsel & AMD64_RAW_EVENT_MASK_NB))
- break;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- i == filter->nevents)
- allow_event = false;
- if (filter->action == KVM_PMU_EVENT_DENY &&
- i < filter->nevents)
- allow_event = false;
+ __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+
+ if (bsearch(&key, filter->events, filter->nevents,
+ sizeof(__u64), cmp_u64))
+ allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
+ else
+ allow_event = filter->action == KVM_PMU_EVENT_DENY;
}
if (!allow_event)
return;
@@ -573,6 +579,11 @@ int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
/* Ensure nevents can't be changed between the user copies. */
*filter = tmp;
+ /*
+ * Sort the in-kernel list so that we can search it with bsearch.
+ */
+ sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index 0e5b49294086..90364d02f22a 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -295,13 +295,16 @@ static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source,
struct kvm_vcpu *vcpu;
unsigned long i;
+ /*
+ * Wake any target vCPUs that are blocking, i.e. waiting for a wake
+ * event. There's no need to signal doorbells, as hardware has handled
+ * vCPUs that were in guest at the time of the IPI, and vCPUs that have
+ * since entered the guest will have processed pending IRQs at VMRUN.
+ */
kvm_for_each_vcpu(i, vcpu, kvm) {
- bool m = kvm_apic_match_dest(vcpu, source,
- icrl & APIC_SHORT_MASK,
- GET_APIC_DEST_FIELD(icrh),
- icrl & APIC_DEST_MASK);
-
- if (m && !avic_vcpu_is_running(vcpu))
+ if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK,
+ GET_APIC_DEST_FIELD(icrh),
+ icrl & APIC_DEST_MASK))
kvm_vcpu_wake_up(vcpu);
}
}
@@ -672,9 +675,22 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
return -1;
kvm_lapic_set_irr(vec, vcpu->arch.apic);
+
+ /*
+ * Pairs with the smp_mb_*() after setting vcpu->guest_mode in
+ * vcpu_enter_guest() to ensure the write to the vIRR is ordered before
+ * the read of guest_mode, which guarantees that either VMRUN will see
+ * and process the new vIRR entry, or that the below code will signal
+ * the doorbell if the vCPU is already running in the guest.
+ */
smp_mb__after_atomic();
- if (avic_vcpu_is_running(vcpu)) {
+ /*
+ * Signal the doorbell to tell hardware to inject the IRQ if the vCPU
+ * is in the guest. If the vCPU is not in the guest, hardware will
+ * automatically process AVIC interrupts at VMRUN.
+ */
+ if (vcpu->mode == IN_GUEST_MODE) {
int cpu = READ_ONCE(vcpu->cpu);
/*
@@ -688,8 +704,13 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
if (cpu != get_cpu())
wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu));
put_cpu();
- } else
+ } else {
+ /*
+ * Wake the vCPU if it was blocking. KVM will then detect the
+ * pending IRQ when checking if the vCPU has a wake event.
+ */
kvm_vcpu_wake_up(vcpu);
+ }
return 0;
}
@@ -957,6 +978,8 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
int h_physical_id = kvm_cpu_get_apicid(cpu);
struct vcpu_svm *svm = to_svm(vcpu);
+ lockdep_assert_preemption_disabled();
+
/*
* Since the host physical APIC id is 8 bits,
* we can support host APIC ID upto 255.
@@ -964,19 +987,25 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
+ /*
+ * No need to update anything if the vCPU is blocking, i.e. if the vCPU
+ * is being scheduled in after being preempted. The CPU entries in the
+ * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'.
+ * If the vCPU was migrated, its new CPU value will be stuffed when the
+ * vCPU unblocks.
+ */
+ if (kvm_vcpu_is_blocking(vcpu))
+ return;
+
entry = READ_ONCE(*(svm->avic_physical_id_cache));
WARN_ON(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK;
entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK);
-
- entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
- if (svm->avic_is_running)
- entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
+ entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
- avic_update_iommu_vcpu_affinity(vcpu, h_physical_id,
- svm->avic_is_running);
+ avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -984,42 +1013,56 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
u64 entry;
struct vcpu_svm *svm = to_svm(vcpu);
+ lockdep_assert_preemption_disabled();
+
entry = READ_ONCE(*(svm->avic_physical_id_cache));
- if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)
- avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
+
+ /* Nothing to do if IsRunning == '0' due to vCPU blocking. */
+ if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK))
+ return;
+
+ avic_update_iommu_vcpu_affinity(vcpu, -1, 0);
entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK;
WRITE_ONCE(*(svm->avic_physical_id_cache), entry);
}
-/*
- * This function is called during VCPU halt/unhalt.
- */
-static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- int cpu = get_cpu();
-
- WARN_ON(cpu != vcpu->cpu);
- svm->avic_is_running = is_run;
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
- if (kvm_vcpu_apicv_active(vcpu)) {
- if (is_run)
- avic_vcpu_load(vcpu, cpu);
- else
- avic_vcpu_put(vcpu);
- }
- put_cpu();
+ preempt_disable();
+
+ /*
+ * Unload the AVIC when the vCPU is about to block, _before_
+ * the vCPU actually blocks.
+ *
+ * Any IRQs that arrive before IsRunning=0 will not cause an
+ * incomplete IPI vmexit on the source, therefore vIRR will also
+ * be checked by kvm_vcpu_check_block() before blocking. The
+ * memory barrier implicit in set_current_state orders writing
+ * IsRunning=0 before reading the vIRR. The processor needs a
+ * matching memory barrier on interrupt delivery between writing
+ * IRR and reading IsRunning; the lack of this barrier might be
+ * the cause of errata #1235).
+ */
+ avic_vcpu_put(vcpu);
+
+ preempt_enable();
}
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu)
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu)
{
- avic_set_running(vcpu, false);
-}
+ int cpu;
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu)
-{
- if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
- kvm_vcpu_update_apicv(vcpu);
- avic_set_running(vcpu, true);
+ if (!kvm_vcpu_apicv_active(vcpu))
+ return;
+
+ cpu = get_cpu();
+ WARN_ON(cpu != vcpu->cpu);
+
+ avic_vcpu_load(vcpu, cpu);
+
+ put_cpu();
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index cf206855ebf0..1218b5a342fc 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -983,9 +983,9 @@ void svm_free_nested(struct vcpu_svm *svm)
/*
* Forcibly leave nested mode in order to be able to reset the VCPU later on.
*/
-void svm_leave_nested(struct vcpu_svm *svm)
+void svm_leave_nested(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu *vcpu = &svm->vcpu;
+ struct vcpu_svm *svm = to_svm(vcpu);
if (is_guest_mode(vcpu)) {
svm->nested.nested_run_pending = 0;
@@ -1411,7 +1411,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
return -EINVAL;
if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE)) {
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
svm_set_gif(svm, !!(kvm_state->flags & KVM_STATE_NESTED_GIF_SET));
return 0;
}
@@ -1478,7 +1478,7 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu,
*/
if (is_guest_mode(vcpu))
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
else
svm->nested.vmcb02.ptr->save = svm->vmcb01.ptr->save;
@@ -1532,6 +1532,7 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
}
struct kvm_x86_nested_ops svm_nested_ops = {
+ .leave_nested = svm_leave_nested,
.check_events = svm_check_nested_events,
.triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 12d8b301065a..5aa45f13b16d 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -101,7 +101,7 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr,
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
- if (!pmu)
+ if (!enable_pmu)
return NULL;
switch (msr) {
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 6a22798eaaee..17b53457d866 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2100,8 +2100,13 @@ void __init sev_hardware_setup(void)
if (!sev_enabled || !npt_enabled)
goto out;
- /* Does the CPU support SEV? */
- if (!boot_cpu_has(X86_FEATURE_SEV))
+ /*
+ * SEV must obviously be supported in hardware. Sanity check that the
+ * CPU supports decode assists, which is mandatory for SEV guests to
+ * support instruction emulation.
+ */
+ if (!boot_cpu_has(X86_FEATURE_SEV) ||
+ WARN_ON_ONCE(!boot_cpu_has(X86_FEATURE_DECODEASSISTS)))
goto out;
/* Retrieve SEV CPUID information */
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 46bcc706f257..6d97629655e3 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -192,10 +192,6 @@ module_param(vgif, int, 0444);
static int lbrv = true;
module_param(lbrv, int, 0444);
-/* enable/disable PMU virtualization */
-bool pmu = true;
-module_param(pmu, bool, 0444);
-
static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);
@@ -294,7 +290,7 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
if (!(efer & EFER_SVME)) {
- svm_leave_nested(svm);
+ svm_leave_nested(vcpu);
svm_set_gif(svm, true);
/* #GP intercept is still needed for vmware backdoor */
if (!enable_vmware_backdoor)
@@ -316,7 +312,11 @@ int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return ret;
}
- if (svm_gp_erratum_intercept)
+ /*
+ * Never intercept #GP for SEV guests, KVM can't
+ * decrypt guest memory to workaround the erratum.
+ */
+ if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
}
}
@@ -873,47 +873,6 @@ static void shrink_ple_window(struct kvm_vcpu *vcpu)
}
}
-/*
- * The default MMIO mask is a single bit (excluding the present bit),
- * which could conflict with the memory encryption bit. Check for
- * memory encryption support and override the default MMIO mask if
- * memory encryption is enabled.
- */
-static __init void svm_adjust_mmio_mask(void)
-{
- unsigned int enc_bit, mask_bit;
- u64 msr, mask;
-
- /* If there is no memory encryption support, use existing mask */
- if (cpuid_eax(0x80000000) < 0x8000001f)
- return;
-
- /* If memory encryption is not enabled, use existing mask */
- rdmsrl(MSR_AMD64_SYSCFG, msr);
- if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
- return;
-
- enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
- mask_bit = boot_cpu_data.x86_phys_bits;
-
- /* Increment the mask bit if it is the same as the encryption bit */
- if (enc_bit == mask_bit)
- mask_bit++;
-
- /*
- * If the mask bit location is below 52, then some bits above the
- * physical addressing limit will always be reserved, so use the
- * rsvd_bits() function to generate the mask. This mask, along with
- * the present bit, will be used to generate a page fault with
- * PFER.RSV = 1.
- *
- * If the mask bit location is 52 (or above), then clear the mask.
- */
- mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
-
- kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
-}
-
static void svm_hardware_teardown(void)
{
int cpu;
@@ -928,198 +887,6 @@ static void svm_hardware_teardown(void)
iopm_base = 0;
}
-static __init void svm_set_cpu_caps(void)
-{
- kvm_set_cpu_caps();
-
- supported_xss = 0;
-
- /* CPUID 0x80000001 and 0x8000000A (SVM features) */
- if (nested) {
- kvm_cpu_cap_set(X86_FEATURE_SVM);
-
- if (nrips)
- kvm_cpu_cap_set(X86_FEATURE_NRIPS);
-
- if (npt_enabled)
- kvm_cpu_cap_set(X86_FEATURE_NPT);
-
- if (tsc_scaling)
- kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
-
- /* Nested VM can receive #VMEXIT instead of triggering #GP */
- kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
- }
-
- /* CPUID 0x80000008 */
- if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
- boot_cpu_has(X86_FEATURE_AMD_SSBD))
- kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
-
- /* AMD PMU PERFCTR_CORE CPUID */
- if (pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
- kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
-
- /* CPUID 0x8000001F (SME/SEV features) */
- sev_set_cpu_caps();
-}
-
-static __init int svm_hardware_setup(void)
-{
- int cpu;
- struct page *iopm_pages;
- void *iopm_va;
- int r;
- unsigned int order = get_order(IOPM_SIZE);
-
- /*
- * NX is required for shadow paging and for NPT if the NX huge pages
- * mitigation is enabled.
- */
- if (!boot_cpu_has(X86_FEATURE_NX)) {
- pr_err_ratelimited("NX (Execute Disable) not supported\n");
- return -EOPNOTSUPP;
- }
- kvm_enable_efer_bits(EFER_NX);
-
- iopm_pages = alloc_pages(GFP_KERNEL, order);
-
- if (!iopm_pages)
- return -ENOMEM;
-
- iopm_va = page_address(iopm_pages);
- memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
- iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
-
- init_msrpm_offsets();
-
- supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
-
- if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
- kvm_enable_efer_bits(EFER_FFXSR);
-
- if (tsc_scaling) {
- if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
- tsc_scaling = false;
- } else {
- pr_info("TSC scaling supported\n");
- kvm_has_tsc_control = true;
- kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
- kvm_tsc_scaling_ratio_frac_bits = 32;
- }
- }
-
- tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
-
- /* Check for pause filtering support */
- if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- pause_filter_count = 0;
- pause_filter_thresh = 0;
- } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
- pause_filter_thresh = 0;
- }
-
- if (nested) {
- printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
- kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
- }
-
- /*
- * KVM's MMU doesn't support using 2-level paging for itself, and thus
- * NPT isn't supported if the host is using 2-level paging since host
- * CR4 is unchanged on VMRUN.
- */
- if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
- npt_enabled = false;
-
- if (!boot_cpu_has(X86_FEATURE_NPT))
- npt_enabled = false;
-
- /* Force VM NPT level equal to the host's paging level */
- kvm_configure_mmu(npt_enabled, get_npt_level(),
- get_npt_level(), PG_LEVEL_1G);
- pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
-
- /* Note, SEV setup consumes npt_enabled. */
- sev_hardware_setup();
-
- svm_hv_hardware_setup();
-
- svm_adjust_mmio_mask();
-
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
- if (nrips) {
- if (!boot_cpu_has(X86_FEATURE_NRIPS))
- nrips = false;
- }
-
- enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
-
- if (enable_apicv) {
- pr_info("AVIC enabled\n");
-
- amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
- }
-
- if (vls) {
- if (!npt_enabled ||
- !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
- !IS_ENABLED(CONFIG_X86_64)) {
- vls = false;
- } else {
- pr_info("Virtual VMLOAD VMSAVE supported\n");
- }
- }
-
- if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
- svm_gp_erratum_intercept = false;
-
- if (vgif) {
- if (!boot_cpu_has(X86_FEATURE_VGIF))
- vgif = false;
- else
- pr_info("Virtual GIF supported\n");
- }
-
- if (lbrv) {
- if (!boot_cpu_has(X86_FEATURE_LBRV))
- lbrv = false;
- else
- pr_info("LBR virtualization supported\n");
- }
-
- if (!pmu)
- pr_info("PMU virtualization is disabled\n");
-
- svm_set_cpu_caps();
-
- /*
- * It seems that on AMD processors PTE's accessed bit is
- * being set by the CPU hardware before the NPF vmexit.
- * This is not expected behaviour and our tests fail because
- * of it.
- * A workaround here is to disable support for
- * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
- * In this case userspace can know if there is support using
- * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
- * it
- * If future AMD CPU models change the behaviour described above,
- * this variable can be changed accordingly
- */
- allow_smaller_maxphyaddr = !npt_enabled;
-
- return 0;
-
-err:
- svm_hardware_teardown();
- return r;
-}
-
static void init_seg(struct vmcb_seg *seg)
{
seg->selector = 0;
@@ -1247,9 +1014,10 @@ static void init_vmcb(struct kvm_vcpu *vcpu)
* Guest access to VMware backdoor ports could legitimately
* trigger #GP because of TSS I/O permission bitmap.
* We intercept those #GP and allow access to them anyway
- * as VMware does.
+ * as VMware does. Don't intercept #GP for SEV guests as KVM can't
+ * decrypt guest memory to decode the faulting instruction.
*/
- if (enable_vmware_backdoor)
+ if (enable_vmware_backdoor && !sev_guest(vcpu->kvm))
set_exception_intercept(svm, GP_VECTOR);
svm_set_intercept(svm, INTERCEPT_INTR);
@@ -1444,12 +1212,6 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu)
if (err)
goto error_free_vmsa_page;
- /* We initialize this flag to true to make sure that the is_running
- * bit would be set the first time the vcpu is loaded.
- */
- if (irqchip_in_kernel(vcpu->kvm) && kvm_apicv_activated(vcpu->kvm))
- svm->avic_is_running = true;
-
svm->msrpm = svm_vcpu_alloc_msrpm();
if (!svm->msrpm) {
err = -ENOMEM;
@@ -2334,10 +2096,6 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (error_code)
goto reinject;
- /* All SVM instructions expect page aligned RAX */
- if (svm->vmcb->save.rax & ~PAGE_MASK)
- goto reinject;
-
/* Decode the instruction for usage later */
if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK)
goto reinject;
@@ -2355,8 +2113,13 @@ static int gp_interception(struct kvm_vcpu *vcpu)
if (!is_guest_mode(vcpu))
return kvm_emulate_instruction(vcpu,
EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE);
- } else
+ } else {
+ /* All SVM instructions expect page aligned RAX */
+ if (svm->vmcb->save.rax & ~PAGE_MASK)
+ goto reinject;
+
return emulate_svm_instr(vcpu, opcode);
+ }
reinject:
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
@@ -3833,6 +3596,11 @@ static void svm_cancel_injection(struct kvm_vcpu *vcpu)
svm_complete_interrupts(vcpu);
}
+static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ return 1;
+}
+
static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR &&
@@ -4490,79 +4258,140 @@ static void svm_enable_smi_window(struct kvm_vcpu *vcpu)
}
}
-static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool svm_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
bool smep, smap, is_user;
unsigned long cr4;
+ u64 error_code;
+
+ /* Emulation is always possible when KVM has access to all guest state. */
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ /* #UD and #GP should never be intercepted for SEV guests. */
+ WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD |
+ EMULTYPE_TRAP_UD_FORCED |
+ EMULTYPE_VMWARE_GP));
/*
- * When the guest is an SEV-ES guest, emulation is not possible.
+ * Emulation is impossible for SEV-ES guests as KVM doesn't have access
+ * to guest register state.
*/
if (sev_es_guest(vcpu->kvm))
return false;
/*
+ * Emulation is possible if the instruction is already decoded, e.g.
+ * when completing I/O after returning from userspace.
+ */
+ if (emul_type & EMULTYPE_NO_DECODE)
+ return true;
+
+ /*
+ * Emulation is possible for SEV guests if and only if a prefilled
+ * buffer containing the bytes of the intercepted instruction is
+ * available. SEV guest memory is encrypted with a guest specific key
+ * and cannot be decrypted by KVM, i.e. KVM would read cyphertext and
+ * decode garbage.
+ *
+ * Inject #UD if KVM reached this point without an instruction buffer.
+ * In practice, this path should never be hit by a well-behaved guest,
+ * e.g. KVM doesn't intercept #UD or #GP for SEV guests, but this path
+ * is still theoretically reachable, e.g. via unaccelerated fault-like
+ * AVIC access, and needs to be handled by KVM to avoid putting the
+ * guest into an infinite loop. Injecting #UD is somewhat arbitrary,
+ * but its the least awful option given lack of insight into the guest.
+ */
+ if (unlikely(!insn)) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return false;
+ }
+
+ /*
+ * Emulate for SEV guests if the insn buffer is not empty. The buffer
+ * will be empty if the DecodeAssist microcode cannot fetch bytes for
+ * the faulting instruction because the code fetch itself faulted, e.g.
+ * the guest attempted to fetch from emulated MMIO or a guest page
+ * table used to translate CS:RIP resides in emulated MMIO.
+ */
+ if (likely(insn_len))
+ return true;
+
+ /*
* Detect and workaround Errata 1096 Fam_17h_00_0Fh.
*
* Errata:
- * When CPU raise #NPF on guest data access and vCPU CR4.SMAP=1, it is
- * possible that CPU microcode implementing DecodeAssist will fail
- * to read bytes of instruction which caused #NPF. In this case,
- * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
- * return 0 instead of the correct guest instruction bytes.
+ * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+ * possible that CPU microcode implementing DecodeAssist will fail to
+ * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly
+ * be '0'. This happens because microcode reads CS:RIP using a _data_
+ * loap uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
+ * gives up and does not fill the instruction bytes buffer.
*
- * This happens because CPU microcode reading instruction bytes
- * uses a special opcode which attempts to read data using CPL=0
- * privileges. The microcode reads CS:RIP and if it hits a SMAP
- * fault, it gives up and returns no instruction bytes.
+ * As above, KVM reaches this point iff the VM is an SEV guest, the CPU
+ * supports DecodeAssist, a #NPF was raised, KVM's page fault handler
+ * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the
+ * GuestIntrBytes field of the VMCB.
*
- * Detection:
- * We reach here in case CPU supports DecodeAssist, raised #NPF and
- * returned 0 in GuestIntrBytes field of the VMCB.
- * First, errata can only be triggered in case vCPU CR4.SMAP=1.
- * Second, if vCPU CR4.SMEP=1, errata could only be triggered
- * in case vCPU CPL==3 (Because otherwise guest would have triggered
- * a SMEP fault instead of #NPF).
- * Otherwise, vCPU CR4.SMEP=0, errata could be triggered by any vCPU CPL.
- * As most guests enable SMAP if they have also enabled SMEP, use above
- * logic in order to attempt minimize false-positive of detecting errata
- * while still preserving all cases semantic correctness.
+ * This does _not_ mean that the erratum has been encountered, as the
+ * DecodeAssist will also fail if the load for CS:RIP hits a legitimate
+ * #PF, e.g. if the guest attempt to execute from emulated MMIO and
+ * encountered a reserved/not-present #PF.
*
- * Workaround:
- * To determine what instruction the guest was executing, the hypervisor
- * will have to decode the instruction at the instruction pointer.
+ * To hit the erratum, the following conditions must be true:
+ * 1. CR4.SMAP=1 (obviously).
+ * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot
+ * have been hit as the guest would have encountered a SMEP
+ * violation #PF, not a #NPF.
+ * 3. The #NPF is not due to a code fetch, in which case failure to
+ * retrieve the instruction bytes is legitimate (see abvoe).
*
- * In non SEV guest, hypervisor will be able to read the guest
- * memory to decode the instruction pointer when insn_len is zero
- * so we return true to indicate that decoding is possible.
- *
- * But in the SEV guest, the guest memory is encrypted with the
- * guest specific key and hypervisor will not be able to decode the
- * instruction pointer so we will not able to workaround it. Lets
- * print the error and request to kill the guest.
+ * In addition, don't apply the erratum workaround if the #NPF occurred
+ * while translating guest page tables (see below).
*/
- if (likely(!insn || insn_len))
- return true;
-
- /*
- * If RIP is invalid, go ahead with emulation which will cause an
- * internal error exit.
- */
- if (!kvm_vcpu_gfn_to_memslot(vcpu, kvm_rip_read(vcpu) >> PAGE_SHIFT))
- return true;
+ error_code = to_svm(vcpu)->vmcb->control.exit_info_1;
+ if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK))
+ goto resume_guest;
cr4 = kvm_read_cr4(vcpu);
smep = cr4 & X86_CR4_SMEP;
smap = cr4 & X86_CR4_SMAP;
is_user = svm_get_cpl(vcpu) == 3;
if (smap && (!smep || is_user)) {
- if (!sev_guest(vcpu->kvm))
- return true;
-
pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
- kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+
+ /*
+ * If the fault occurred in userspace, arbitrarily inject #GP
+ * to avoid killing the guest and to hopefully avoid confusing
+ * the guest kernel too much, e.g. injecting #PF would not be
+ * coherent with respect to the guest's page tables. Request
+ * triple fault if the fault occurred in the kernel as there's
+ * no fault that KVM can inject without confusing the guest.
+ * In practice, the triple fault is moot as no sane SEV kernel
+ * will execute from user memory while also running with SMAP=1.
+ */
+ if (is_user)
+ kvm_inject_gp(vcpu, 0);
+ else
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
}
+resume_guest:
+ /*
+ * If the erratum was not hit, simply resume the guest and let it fault
+ * again. While awful, e.g. the vCPU may get stuck in an infinite loop
+ * if the fault is at CPL=0, it's the lesser of all evils. Exiting to
+ * userspace will kill the guest, and letting the emulator read garbage
+ * will yield random behavior and potentially corrupt the guest.
+ *
+ * Simply resuming the guest is technically not a violation of the SEV
+ * architecture. AMD's APM states that all code fetches and page table
+ * accesses for SEV guest are encrypted, regardless of the C-Bit. The
+ * APM also states that encrypted accesses to MMIO are "ignored", but
+ * doesn't explicitly define "ignored", i.e. doing nothing and letting
+ * the guest spin is technically "ignoring" the access.
+ */
return false;
}
@@ -4629,8 +4458,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.prepare_guest_switch = svm_prepare_guest_switch,
.vcpu_load = svm_vcpu_load,
.vcpu_put = svm_vcpu_put,
- .vcpu_blocking = svm_vcpu_blocking,
- .vcpu_unblocking = svm_vcpu_unblocking,
+ .vcpu_blocking = avic_vcpu_blocking,
+ .vcpu_unblocking = avic_vcpu_unblocking,
.update_exception_bitmap = svm_update_exception_bitmap,
.get_msr_feature = svm_get_msr_feature,
@@ -4662,6 +4491,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.tlb_flush_gva = svm_flush_tlb_gva,
.tlb_flush_guest = svm_flush_tlb,
+ .vcpu_pre_run = svm_vcpu_pre_run,
.run = svm_vcpu_run,
.handle_exit = handle_exit,
.skip_emulated_instruction = skip_emulated_instruction,
@@ -4742,6 +4572,243 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector,
};
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_AMD64_SYSCFG, msr);
+ if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
+static __init void svm_set_cpu_caps(void)
+{
+ kvm_set_cpu_caps();
+
+ supported_xss = 0;
+
+ /* CPUID 0x80000001 and 0x8000000A (SVM features) */
+ if (nested) {
+ kvm_cpu_cap_set(X86_FEATURE_SVM);
+
+ if (nrips)
+ kvm_cpu_cap_set(X86_FEATURE_NRIPS);
+
+ if (npt_enabled)
+ kvm_cpu_cap_set(X86_FEATURE_NPT);
+
+ if (tsc_scaling)
+ kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR);
+
+ /* Nested VM can receive #VMEXIT instead of triggering #GP */
+ kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK);
+ }
+
+ /* CPUID 0x80000008 */
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD);
+
+ /* AMD PMU PERFCTR_CORE CPUID */
+ if (enable_pmu && boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
+ kvm_cpu_cap_set(X86_FEATURE_PERFCTR_CORE);
+
+ /* CPUID 0x8000001F (SME/SEV features) */
+ sev_set_cpu_caps();
+}
+
+static __init int svm_hardware_setup(void)
+{
+ int cpu;
+ struct page *iopm_pages;
+ void *iopm_va;
+ int r;
+ unsigned int order = get_order(IOPM_SIZE);
+
+ /*
+ * NX is required for shadow paging and for NPT if the NX huge pages
+ * mitigation is enabled.
+ */
+ if (!boot_cpu_has(X86_FEATURE_NX)) {
+ pr_err_ratelimited("NX (Execute Disable) not supported\n");
+ return -EOPNOTSUPP;
+ }
+ kvm_enable_efer_bits(EFER_NX);
+
+ iopm_pages = alloc_pages(GFP_KERNEL, order);
+
+ if (!iopm_pages)
+ return -ENOMEM;
+
+ iopm_va = page_address(iopm_pages);
+ memset(iopm_va, 0xff, PAGE_SIZE * (1 << order));
+ iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT;
+
+ init_msrpm_offsets();
+
+ supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR);
+
+ if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
+ kvm_enable_efer_bits(EFER_FFXSR);
+
+ if (tsc_scaling) {
+ if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
+ tsc_scaling = false;
+ } else {
+ pr_info("TSC scaling supported\n");
+ kvm_has_tsc_control = true;
+ kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX;
+ kvm_tsc_scaling_ratio_frac_bits = 32;
+ }
+ }
+
+ tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX);
+
+ /* Check for pause filtering support */
+ if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
+ pause_filter_count = 0;
+ pause_filter_thresh = 0;
+ } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) {
+ pause_filter_thresh = 0;
+ }
+
+ if (nested) {
+ printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
+ kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
+ }
+
+ /*
+ * KVM's MMU doesn't support using 2-level paging for itself, and thus
+ * NPT isn't supported if the host is using 2-level paging since host
+ * CR4 is unchanged on VMRUN.
+ */
+ if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE))
+ npt_enabled = false;
+
+ if (!boot_cpu_has(X86_FEATURE_NPT))
+ npt_enabled = false;
+
+ /* Force VM NPT level equal to the host's paging level */
+ kvm_configure_mmu(npt_enabled, get_npt_level(),
+ get_npt_level(), PG_LEVEL_1G);
+ pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
+
+ /* Note, SEV setup consumes npt_enabled. */
+ sev_hardware_setup();
+
+ svm_hv_hardware_setup();
+
+ svm_adjust_mmio_mask();
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
+ if (nrips) {
+ if (!boot_cpu_has(X86_FEATURE_NRIPS))
+ nrips = false;
+ }
+
+ enable_apicv = avic = avic && npt_enabled && boot_cpu_has(X86_FEATURE_AVIC);
+
+ if (enable_apicv) {
+ pr_info("AVIC enabled\n");
+
+ amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier);
+ } else {
+ svm_x86_ops.vcpu_blocking = NULL;
+ svm_x86_ops.vcpu_unblocking = NULL;
+ }
+
+ if (vls) {
+ if (!npt_enabled ||
+ !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) ||
+ !IS_ENABLED(CONFIG_X86_64)) {
+ vls = false;
+ } else {
+ pr_info("Virtual VMLOAD VMSAVE supported\n");
+ }
+ }
+
+ if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK))
+ svm_gp_erratum_intercept = false;
+
+ if (vgif) {
+ if (!boot_cpu_has(X86_FEATURE_VGIF))
+ vgif = false;
+ else
+ pr_info("Virtual GIF supported\n");
+ }
+
+ if (lbrv) {
+ if (!boot_cpu_has(X86_FEATURE_LBRV))
+ lbrv = false;
+ else
+ pr_info("LBR virtualization supported\n");
+ }
+
+ if (!enable_pmu)
+ pr_info("PMU virtualization is disabled\n");
+
+ svm_set_cpu_caps();
+
+ /*
+ * It seems that on AMD processors PTE's accessed bit is
+ * being set by the CPU hardware before the NPF vmexit.
+ * This is not expected behaviour and our tests fail because
+ * of it.
+ * A workaround here is to disable support for
+ * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled.
+ * In this case userspace can know if there is support using
+ * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle
+ * it
+ * If future AMD CPU models change the behaviour described above,
+ * this variable can be changed accordingly
+ */
+ allow_smaller_maxphyaddr = !npt_enabled;
+
+ return 0;
+
+err:
+ svm_hardware_teardown();
+ return r;
+}
+
+
static struct kvm_x86_init_ops svm_init_ops __initdata = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 9f153c59f2c8..73525353e424 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -32,7 +32,6 @@
extern u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;
extern bool npt_enabled;
extern bool intercept_smi;
-extern bool pmu;
/*
* Clean bits in VMCB.
@@ -226,7 +225,6 @@ struct vcpu_svm {
u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
- bool avic_is_running;
/*
* Per-vcpu list of struct amd_svm_iommu_ir:
@@ -306,11 +304,6 @@ static inline void vmcb_mark_all_clean(struct vmcb *vmcb)
& ~VMCB_ALWAYS_DIRTY_MASK;
}
-static inline bool vmcb_is_clean(struct vmcb *vmcb, int bit)
-{
- return (vmcb->control.clean & (1 << bit));
-}
-
static inline void vmcb_mark_dirty(struct vmcb *vmcb, int bit)
{
vmcb->control.clean &= ~(1 << bit);
@@ -527,7 +520,7 @@ static inline bool nested_exit_on_nmi(struct vcpu_svm *svm)
int enter_svm_guest_mode(struct kvm_vcpu *vcpu,
u64 vmcb_gpa, struct vmcb *vmcb12, bool from_vmrun);
-void svm_leave_nested(struct vcpu_svm *svm);
+void svm_leave_nested(struct kvm_vcpu *vcpu);
void svm_free_nested(struct vcpu_svm *svm);
int svm_allocate_nested(struct vcpu_svm *svm);
int nested_svm_vmrun(struct kvm_vcpu *vcpu);
@@ -574,17 +567,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL
-static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- u64 *entry = svm->avic_physical_id_cache;
-
- if (!entry)
- return false;
-
- return (READ_ONCE(*entry) & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
-}
-
int avic_ga_log_notifier(u32 ga_tag);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
@@ -605,8 +587,8 @@ int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec);
bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu);
int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
-void svm_vcpu_blocking(struct kvm_vcpu *vcpu);
-void svm_vcpu_unblocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_blocking(struct kvm_vcpu *vcpu);
+void avic_vcpu_unblocking(struct kvm_vcpu *vcpu);
/* sev.c */
diff --git a/arch/x86/kvm/svm/svm_onhyperv.h b/arch/x86/kvm/svm/svm_onhyperv.h
index c53b8bf8d013..489ca56212c6 100644
--- a/arch/x86/kvm/svm/svm_onhyperv.h
+++ b/arch/x86/kvm/svm/svm_onhyperv.h
@@ -46,6 +46,9 @@ static inline void svm_hv_init_vmcb(struct vmcb *vmcb)
if (npt_enabled &&
ms_hyperv.nested_features & HV_X64_NESTED_ENLIGHTENED_TLB)
hve->hv_enlightenments_control.enlightened_npt_tlb = 1;
+
+ if (ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)
+ hve->hv_enlightenments_control.msr_bitmap = 1;
}
static inline void svm_hv_hardware_setup(void)
@@ -83,14 +86,7 @@ static inline void svm_hv_vmcb_dirty_nested_enlightenments(
struct hv_enlightenments *hve =
(struct hv_enlightenments *)vmcb->control.reserved_sw;
- /*
- * vmcb can be NULL if called during early vcpu init.
- * And its okay not to mark vmcb dirty during vcpu init
- * as we mark it dirty unconditionally towards end of vcpu
- * init phase.
- */
- if (vmcb_is_clean(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS) &&
- hve->hv_enlightenments_control.msr_bitmap)
+ if (hve->hv_enlightenments_control.msr_bitmap)
vmcb_mark_dirty(vmcb, VMCB_HV_NESTED_ENLIGHTENMENTS);
}
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index c8029b7845b6..3f430e218375 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -5,6 +5,7 @@
#include <asm/vmx.h>
#include "lapic.h"
+#include "x86.h"
extern bool __read_mostly enable_vpid;
extern bool __read_mostly flexpriority_enabled;
@@ -53,7 +54,6 @@ struct nested_vmx_msrs {
struct vmcs_config {
int size;
- int order;
u32 basic_cap;
u32 revision_id;
u32 pin_based_exec_ctrl;
@@ -389,6 +389,9 @@ static inline u64 vmx_get_perf_capabilities(void)
{
u64 perf_cap = 0;
+ if (!enable_pmu)
+ return perf_cap;
+
if (boot_cpu_has(X86_FEATURE_PDCM))
rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_cap);
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index ba6f99f584ac..87e3dc10edf4 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -12,8 +12,6 @@
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
-#if IS_ENABLED(CONFIG_HYPERV)
-
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
#define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \
{EVMCS1_OFFSET(name), clean_field}
@@ -296,6 +294,7 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
+#if IS_ENABLED(CONFIG_HYPERV)
__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
@@ -362,6 +361,7 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
case MSR_IA32_VMX_PROCBASED_CTLS2:
ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
break;
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 16731d2cf231..8d70f9aea94b 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -59,12 +59,12 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_SHADOW_VMCS | \
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
-#if IS_ENABLED(CONFIG_HYPERV)
-
struct evmcs_field {
u16 offset;
u16 clean_field;
@@ -73,26 +73,56 @@ struct evmcs_field {
extern const struct evmcs_field vmcs_field_to_evmcs_1[];
extern const unsigned int nr_evmcs_1_fields;
-static __always_inline int get_evmcs_offset(unsigned long field,
- u16 *clean_field)
+static __always_inline int evmcs_field_offset(unsigned long field,
+ u16 *clean_field)
{
unsigned int index = ROL16(field, 6);
const struct evmcs_field *evmcs_field;
- if (unlikely(index >= nr_evmcs_1_fields)) {
- WARN_ONCE(1, "KVM: accessing unsupported EVMCS field %lx\n",
- field);
+ if (unlikely(index >= nr_evmcs_1_fields))
return -ENOENT;
- }
evmcs_field = &vmcs_field_to_evmcs_1[index];
+ /*
+ * Use offset=0 to detect holes in eVMCS. This offset belongs to
+ * 'revision_id' but this field has no encoding and is supposed to
+ * be accessed directly.
+ */
+ if (unlikely(!evmcs_field->offset))
+ return -ENOENT;
+
if (clean_field)
*clean_field = evmcs_field->clean_field;
return evmcs_field->offset;
}
+static inline u64 evmcs_read_any(struct hv_enlightened_vmcs *evmcs,
+ unsigned long field, u16 offset)
+{
+ /*
+ * vmcs12_read_any() doesn't care whether the supplied structure
+ * is 'struct vmcs12' or 'struct hv_enlightened_vmcs' as it takes
+ * the exact offset of the required field, use it for convenience
+ * here.
+ */
+ return vmcs12_read_any((void *)evmcs, field, offset);
+}
+
+#if IS_ENABLED(CONFIG_HYPERV)
+
+static __always_inline int get_evmcs_offset(unsigned long field,
+ u16 *clean_field)
+{
+ int offset = evmcs_field_offset(field, clean_field);
+
+ WARN_ONCE(offset < 0, "KVM: accessing unsupported EVMCS field %lx\n",
+ field);
+
+ return offset;
+}
+
static __always_inline void evmcs_write64(unsigned long field, u64 value)
{
u16 clean_field;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index f235f77cbc03..ba34e94049c7 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -7,6 +7,7 @@
#include <asm/mmu_context.h>
#include "cpuid.h"
+#include "evmcs.h"
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
@@ -4851,18 +4852,20 @@ static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
/*
- * We should allocate a shadow vmcs for vmcs01 only when L1
- * executes VMXON and free it when L1 executes VMXOFF.
- * As it is invalid to execute VMXON twice, we shouldn't reach
- * here when vmcs01 already have an allocated shadow vmcs.
+ * KVM allocates a shadow VMCS only when L1 executes VMXON and frees it
+ * when L1 executes VMXOFF or the vCPU is forced out of nested
+ * operation. VMXON faults if the CPU is already post-VMXON, so it
+ * should be impossible to already have an allocated shadow VMCS. KVM
+ * doesn't support virtualization of VMCS shadowing, so vmcs01 should
+ * always be the loaded VMCS.
*/
- WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+ if (WARN_ON(loaded_vmcs != &vmx->vmcs01 || loaded_vmcs->shadow_vmcs))
+ return loaded_vmcs->shadow_vmcs;
+
+ loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+ if (loaded_vmcs->shadow_vmcs)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
- if (!loaded_vmcs->shadow_vmcs) {
- loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
- if (loaded_vmcs->shadow_vmcs)
- vmcs_clear(loaded_vmcs->shadow_vmcs);
- }
return loaded_vmcs->shadow_vmcs;
}
@@ -5099,27 +5102,49 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
if (!nested_vmx_check_permission(vcpu))
return 1;
- /*
- * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
- * any VMREAD sets the ALU flags for VMfailInvalid.
- */
- if (vmx->nested.current_vmptr == INVALID_GPA ||
- (is_guest_mode(vcpu) &&
- get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
- return nested_vmx_failInvalid(vcpu);
-
/* Decode instruction info and find the field to read */
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
- if (offset < 0)
- return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ if (!evmptr_is_valid(vmx->nested.hv_evmcs_vmptr)) {
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is INVALID_GPA,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == INVALID_GPA ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == INVALID_GPA))
+ return nested_vmx_failInvalid(vcpu);
- if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
- copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
+ offset = get_vmcs12_field_offset(field);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- /* Read the field, zero-extended to a u64 value */
- value = vmcs12_read_any(vmcs12, field, offset);
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
+ } else {
+ /*
+ * Hyper-V TLFS (as of 6.0b) explicitly states, that while an
+ * enlightened VMCS is active VMREAD/VMWRITE instructions are
+ * unsupported. Unfortunately, certain versions of Windows 11
+ * don't comply with this requirement which is not enforced in
+ * genuine Hyper-V. Allow VMREAD from an enlightened VMCS as a
+ * workaround, as misbehaving guests will panic on VM-Fail.
+ * Note, enlightened VMCS is incompatible with shadow VMCS so
+ * all VMREADs from L2 should go to L1.
+ */
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ return nested_vmx_failInvalid(vcpu);
+
+ offset = evmcs_field_offset(field, NULL);
+ if (offset < 0)
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
+ /* Read the field, zero-extended to a u64 value */
+ value = evmcs_read_any(vmx->nested.hv_evmcs, field, offset);
+ }
/*
* Now copy part of this value to register or memory, as requested.
@@ -5214,7 +5239,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
field = kvm_register_read(vcpu, (((instr_info) >> 28) & 0xf));
- offset = vmcs_field_to_offset(field);
+ offset = get_vmcs12_field_offset(field);
if (offset < 0)
return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
@@ -6462,7 +6487,7 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
max_idx = 0;
for (i = 0; i < nr_vmcs12_fields; i++) {
/* The vmcs12 table is very, very sparsely populated. */
- if (!vmcs_field_to_offset_table[i])
+ if (!vmcs12_field_offsets[i])
continue;
idx = vmcs_field_index(VMCS12_IDX_TO_ENC(i));
@@ -6771,6 +6796,7 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
}
struct kvm_x86_nested_ops vmx_nested_ops = {
+ .leave_nested = vmx_leave_nested,
.check_events = vmx_check_nested_events,
.hv_timer_pending = nested_vmx_preemption_timer_pending,
.triple_fault = nested_vmx_triple_fault,
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 5e0ac57d6d1b..466d18fc0c5d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -21,7 +21,6 @@
#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
- /* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
[1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
[2] = { 0x3c, 0x01, PERF_COUNT_HW_BUS_CYCLES },
@@ -29,6 +28,7 @@ static struct kvm_event_hw_type_mapping intel_arch_events[] = {
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ /* The above index must match CPUID 0x0A.EBX bit vector */
[7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
@@ -75,11 +75,17 @@ static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
- for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
- if (intel_arch_events[i].eventsel == event_select &&
- intel_arch_events[i].unit_mask == unit_mask &&
- (pmc_is_fixed(pmc) || pmu->available_event_types & (1 << i)))
- break;
+ for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++) {
+ if (intel_arch_events[i].eventsel != event_select ||
+ intel_arch_events[i].unit_mask != unit_mask)
+ continue;
+
+ /* disable event that reported as not present by cpuid */
+ if ((i < 7) && !(pmu->available_event_types & (1 << i)))
+ return PERF_COUNT_HW_MAX + 1;
+
+ break;
+ }
if (i == ARRAY_SIZE(intel_arch_events))
return PERF_COUNT_HW_MAX;
@@ -481,7 +487,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
pmu->reserved_bits = 0xffffffff00200000ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
- if (!entry)
+ if (!entry || !enable_pmu)
return;
eax.full = entry->eax;
edx.full = entry->edx;
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
index 88c53c521094..aa1fe9085d77 100644
--- a/arch/x86/kvm/vmx/posted_intr.c
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -19,7 +19,7 @@
* wake the target vCPUs. vCPUs are removed from the list and the notification
* vector is reset when the vCPU is scheduled in.
*/
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu);
/*
* Protect the per-CPU list with a per-CPU spinlock to handle task migration.
* When a blocking vCPU is awakened _and_ migrated to a different pCPU, the
@@ -27,7 +27,7 @@ static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
* CPU. IRQs must be disabled when taking this lock, otherwise deadlock will
* occur if a wakeup IRQ arrives and attempts to acquire the lock.
*/
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock);
static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
{
@@ -51,7 +51,9 @@ static int pi_try_set_control(struct pi_desc *pi_desc, u64 old, u64 new)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct pi_desc old, new;
+ unsigned long flags;
unsigned int dest;
/*
@@ -62,23 +64,34 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
if (!enable_apicv || !lapic_in_kernel(vcpu))
return;
- /* Nothing to do if PI.SN and PI.NDST both have the desired value. */
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ /*
+ * If the vCPU wasn't on the wakeup list and wasn't migrated, then the
+ * full update can be skipped as neither the vector nor the destination
+ * needs to be changed.
+ */
+ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) {
+ /*
+ * Clear SN if it was set due to being preempted. Again, do
+ * this even if there is no assigned device for simplicity.
+ */
+ if (pi_test_and_clear_sn(pi_desc))
+ goto after_clear_sn;
return;
+ }
+
+ local_irq_save(flags);
/*
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
- * correctly.
+ * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup
+ * list of the _previous_ pCPU, which will not be the same as the
+ * current pCPU if the task was migrated.
*/
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- goto after_clear_sn;
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) {
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_del(&vmx->pi_wakeup_list);
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
}
- /* The full case. Set the new destination and clear SN. */
dest = cpu_physical_id(cpu);
if (!x2apic_mode)
dest = (dest << 8) & 0xFF00;
@@ -86,10 +99,22 @@ void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
do {
old.control = new.control = READ_ONCE(pi_desc->control);
+ /*
+ * Clear SN (as above) and refresh the destination APIC ID to
+ * handle task migration (@cpu != vcpu->cpu).
+ */
new.ndst = dest;
new.sn = 0;
+
+ /*
+ * Restore the notification vector; in the blocking case, the
+ * descriptor was modified on "put" to use the wakeup vector.
+ */
+ new.nv = POSTED_INTR_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control));
+ local_irq_restore(flags);
+
after_clear_sn:
/*
@@ -111,83 +136,25 @@ static bool vmx_can_use_vtd_pi(struct kvm *kvm)
irq_remapping_cap(IRQ_POSTING_CAP);
}
-void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!vmx_can_use_vtd_pi(vcpu->kvm))
- return;
-
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
-}
-
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- /*
- * Remove the vCPU from the wakeup list of the _previous_ pCPU, which
- * will not be the same as the current pCPU if the task was migrated.
- */
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
-
- dest = cpu_physical_id(vcpu->cpu);
- if (!x2apic_mode)
- dest = (dest << 8) & 0xFF00;
-
- WARN(pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the vCPU was blocking");
-
- do {
- old.control = new.control = READ_ONCE(pi_desc->control);
-
- new.ndst = dest;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (pi_try_set_control(pi_desc, old.control, new.control));
-
- vcpu->pre_pcpu = -1;
-}
-
/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
+ * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set
+ * WAKEUP as the notification vector in the PI descriptor.
*/
-int pi_pre_block(struct kvm_vcpu *vcpu)
+static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu)
{
- struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct pi_desc old, new;
unsigned long flags;
- if (!vmx_can_use_vtd_pi(vcpu->kvm) ||
- vmx_interrupt_blocked(vcpu))
- return 0;
-
local_irq_save(flags);
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu, vcpu->cpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->cpu));
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
+ list_add_tail(&vmx->pi_wakeup_list,
+ &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu));
- WARN(pi_desc->sn == 1,
- "Posted Interrupt Suppress Notification set before blocking");
+ WARN(pi_desc->sn, "PI descriptor SN field set before blocking");
do {
old.control = new.control = READ_ONCE(pi_desc->control);
@@ -196,24 +163,37 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
new.nv = POSTED_INTR_WAKEUP_VECTOR;
} while (pi_try_set_control(pi_desc, old.control, new.control));
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc))
- __pi_post_block(vcpu);
+ /*
+ * Send a wakeup IPI to this CPU if an interrupt may have been posted
+ * before the notification vector was updated, in which case the IRQ
+ * will arrive on the non-wakeup vector. An IPI is needed as calling
+ * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not
+ * enabled until it is safe to call try_to_wake_up() on the task being
+ * scheduled out).
+ */
+ if (pi_test_on(&new))
+ apic->send_IPI_self(POSTED_INTR_WAKEUP_VECTOR);
local_irq_restore(flags);
- return (vcpu->pre_pcpu == -1);
}
-void pi_post_block(struct kvm_vcpu *vcpu)
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
- unsigned long flags;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- if (vcpu->pre_pcpu == -1)
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
- local_irq_save(flags);
- __pi_post_block(vcpu);
- local_irq_restore(flags);
+ if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu))
+ pi_enable_wakeup_handler(vcpu);
+
+ /*
+ * Set SN when the vCPU is preempted. Note, the vCPU can both be seen
+ * as blocking and preempted, e.g. if it's preempted between setting
+ * its wait state and manually scheduling out.
+ */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
}
/*
@@ -221,24 +201,23 @@ void pi_post_block(struct kvm_vcpu *vcpu)
*/
void pi_wakeup_handler(void)
{
- struct kvm_vcpu *vcpu;
int cpu = smp_processor_id();
+ struct vcpu_vmx *vmx;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
+ list_for_each_entry(vmx, &per_cpu(wakeup_vcpus_on_cpu, cpu),
+ pi_wakeup_list) {
- if (pi_test_on(pi_desc))
- kvm_vcpu_kick(vcpu);
+ if (pi_test_on(&vmx->pi_desc))
+ kvm_vcpu_wake_up(&vmx->vcpu);
}
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
void __init pi_init_cpu(int cpu)
{
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu));
+ raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu));
}
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -254,7 +233,7 @@ bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
* Bail out of the block loop if the VM has an assigned
* device, but the blocking vCPU didn't reconfigure the
* PI.NV to the wakeup vector, i.e. the assigned device
- * came along after the initial check in pi_pre_block().
+ * came along after the initial check in vmx_vcpu_pi_put().
*/
void vmx_pi_start_assignment(struct kvm *kvm)
{
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
index 36ae035f14aa..eb14e76b84ef 100644
--- a/arch/x86/kvm/vmx/posted_intr.h
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -40,6 +40,12 @@ static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
(unsigned long *)&pi_desc->control);
}
+static inline bool pi_test_and_clear_sn(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
static inline bool pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
{
return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
@@ -88,8 +94,6 @@ static inline bool pi_test_sn(struct pi_desc *pi_desc)
void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
-int pi_pre_block(struct kvm_vcpu *vcpu);
-void pi_post_block(struct kvm_vcpu *vcpu);
void pi_wakeup_handler(void);
void __init pi_init_cpu(int cpu);
bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index cab6ba7a5005..2251b60920f8 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -8,7 +8,7 @@
FIELD(number, name), \
[ROL16(number##_HIGH, 6)] = VMCS12_OFFSET(name) + sizeof(u32)
-const unsigned short vmcs_field_to_offset_table[] = {
+const unsigned short vmcs12_field_offsets[] = {
FIELD(VIRTUAL_PROCESSOR_ID, virtual_processor_id),
FIELD(POSTED_INTR_NV, posted_intr_nv),
FIELD(GUEST_ES_SELECTOR, guest_es_selector),
@@ -151,4 +151,4 @@ const unsigned short vmcs_field_to_offset_table[] = {
FIELD(HOST_RSP, host_rsp),
FIELD(HOST_RIP, host_rip),
};
-const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
+const unsigned int nr_vmcs12_fields = ARRAY_SIZE(vmcs12_field_offsets);
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index 2a45f026ee11..746129ddd5ae 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -361,10 +361,10 @@ static inline void vmx_check_vmcs12_offsets(void)
CHECK_OFFSET(guest_pml_index, 996);
}
-extern const unsigned short vmcs_field_to_offset_table[];
+extern const unsigned short vmcs12_field_offsets[];
extern const unsigned int nr_vmcs12_fields;
-static inline short vmcs_field_to_offset(unsigned long field)
+static inline short get_vmcs12_field_offset(unsigned long field)
{
unsigned short offset;
unsigned int index;
@@ -377,7 +377,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
return -ENOENT;
index = array_index_nospec(index, nr_vmcs12_fields);
- offset = vmcs_field_to_offset_table[index];
+ offset = vmcs12_field_offsets[index];
if (offset == 0)
return -ENOENT;
return offset;
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 1b2e9d8c5cc9..aca3ae2a02f3 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1487,11 +1487,12 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
return 0;
}
-static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
{
/*
* Emulation of instructions in SGX enclaves is impossible as RIP does
- * not point tthe failing instruction, and even if it did, the code
+ * not point at the failing instruction, and even if it did, the code
* stream is inaccessible. Inject #UD instead of exiting to userspace
* so that guest userspace can't DoS the guest simply by triggering
* emulation (enclaves are CPL3 only).
@@ -2603,7 +2604,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
return -EIO;
vmcs_conf->size = vmx_msr_high & 0x1fff;
- vmcs_conf->order = get_order(vmcs_conf->size);
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
vmcs_conf->revision_id = vmx_msr_low;
@@ -2628,7 +2628,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
struct page *pages;
struct vmcs *vmcs;
- pages = __alloc_pages_node(node, flags, vmcs_config.order);
+ pages = __alloc_pages_node(node, flags, 0);
if (!pages)
return NULL;
vmcs = page_address(pages);
@@ -2647,7 +2647,7 @@ struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags)
void free_vmcs(struct vmcs *vmcs)
{
- free_pages((unsigned long)vmcs, vmcs_config.order);
+ free_page((unsigned long)vmcs);
}
/*
@@ -3931,12 +3931,10 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
pt_update_intercept_for_msr(vcpu);
}
-static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
- bool nested)
+static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
+ int pi_vec)
{
#ifdef CONFIG_SMP
- int pi_vec = nested ? POSTED_INTR_NESTED_VECTOR : POSTED_INTR_VECTOR;
-
if (vcpu->mode == IN_GUEST_MODE) {
/*
* The vector of interrupt to be delivered to vcpu had
@@ -3964,10 +3962,15 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
*/
apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
- return true;
+ return;
}
#endif
- return false;
+ /*
+ * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
+ * otherwise do nothing as KVM will grab the highest priority pending
+ * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
+ */
+ kvm_vcpu_wake_up(vcpu);
}
static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
@@ -3997,8 +4000,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
smp_mb__after_atomic();
/* the PIR and ON have been set by L1. */
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, true))
- kvm_vcpu_kick(vcpu);
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
return 0;
}
return -1;
@@ -4035,9 +4037,7 @@ static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
* guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
* posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
*/
- if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
- kvm_vcpu_kick(vcpu);
-
+ kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
return 0;
}
@@ -4094,10 +4094,14 @@ void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
vmcs_write32(HOST_IA32_SYSENTER_CS, low32);
/*
- * If 32-bit syscall is enabled, vmx_vcpu_load_vcms rewrites
- * HOST_IA32_SYSENTER_ESP.
+ * SYSENTER is used for 32-bit system calls on either 32-bit or
+ * 64-bit kernels. It is always zero If neither is allowed, otherwise
+ * vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
+ * have already done so!).
*/
- vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+ if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
+ vmcs_writel(HOST_IA32_SYSENTER_ESP, 0);
+
rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
vmcs_writel(HOST_IA32_SYSENTER_EIP, tmpl); /* 22.2.3 */
@@ -4901,8 +4905,33 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Note, skipping ICEBP also
+ * clears STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
@@ -5397,7 +5426,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
{
gpa_t gpa;
- if (!vmx_can_emulate_instruction(vcpu, NULL, 0))
+ if (!vmx_can_emulate_instruction(vcpu, EMULTYPE_PF, NULL, 0))
return 1;
/*
@@ -5426,6 +5455,14 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
return 1;
}
+static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->emulation_required && !vmx->rmode.vm86_active &&
+ vcpu->arch.exception.pending;
+}
+
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -5445,8 +5482,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
if (!kvm_emulate_instruction(vcpu, 0))
return 0;
- if (vmx->emulation_required && !vmx->rmode.vm86_active &&
- vcpu->arch.exception.pending) {
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
kvm_prepare_emulation_failure_exit(vcpu);
return 0;
}
@@ -5468,6 +5504,16 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
return 1;
}
+static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
+{
+ if (vmx_emulation_required_with_pending_exception(vcpu)) {
+ kvm_prepare_emulation_failure_exit(vcpu);
+ return 0;
+ }
+
+ return 1;
+}
+
static void grow_ple_window(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6928,6 +6974,8 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
vmx = to_vmx(vcpu);
+ INIT_LIST_HEAD(&vmx->pi_wakeup_list);
+
err = -ENOMEM;
vmx->vpid = allocate_vpid();
@@ -7549,25 +7597,6 @@ void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
}
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
-{
- if (pi_pre_block(vcpu))
- return 1;
-
- if (kvm_lapic_hv_timer_in_use(vcpu))
- kvm_lapic_switch_to_sw_timer(vcpu);
-
- return 0;
-}
-
-static void vmx_post_block(struct kvm_vcpu *vcpu)
-{
- if (kvm_x86_ops.set_hv_timer)
- kvm_lapic_switch_to_hv_timer(vcpu);
-
- pi_post_block(vcpu);
-}
-
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
@@ -7710,6 +7739,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.tlb_flush_gva = vmx_flush_tlb_gva,
.tlb_flush_guest = vmx_flush_tlb_guest,
+ .vcpu_pre_run = vmx_vcpu_pre_run,
.run = vmx_vcpu_run,
.handle_exit = vmx_handle_exit,
.skip_emulated_instruction = vmx_skip_emulated_instruction,
@@ -7768,9 +7798,6 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.cpu_dirty_log_size = PML_ENTITY_NUM,
.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
.pmu_ops = &intel_pmu_ops,
.nested_ops = &vmx_nested_ops,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index f8fc7441baea..7f2c82e7f38f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -317,6 +317,9 @@ struct vcpu_vmx {
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
+ /* Used if this vCPU is waiting for PI notification wakeup. */
+ struct list_head pi_wakeup_list;
+
/* Support for a guest hypervisor (nested VMX) */
struct nested_vmx nested;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 76b4803dd3bd..74b53a16f38a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -187,6 +187,11 @@ module_param(force_emulation_prefix, bool, S_IRUGO);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
+/* Enable/disable PMU virtualization */
+bool __read_mostly enable_pmu = true;
+EXPORT_SYMBOL_GPL(enable_pmu);
+module_param(enable_pmu, bool, 0444);
+
/*
* Restoring the host value for MSRs that are only consumed when running in
* usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
@@ -3530,6 +3535,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
if (data & ~supported_xss)
return 1;
vcpu->arch.ia32_xss = data;
+ kvm_update_cpuid_runtime(vcpu);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@@ -4224,6 +4230,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
+ case KVM_CAP_SYS_ATTRIBUTES:
r = 1;
break;
case KVM_CAP_EXIT_HYPERCALL:
@@ -4326,7 +4333,49 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
}
return r;
+}
+
+static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr)
+{
+ void __user *uaddr = (void __user*)(unsigned long)attr->addr;
+
+ if ((u64)(unsigned long)uaddr != attr->addr)
+ return ERR_PTR(-EFAULT);
+ return uaddr;
+}
+
+static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr)
+{
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
+
+ if (attr->group)
+ return -ENXIO;
+
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ if (put_user(supported_xcr0, uaddr))
+ return -EFAULT;
+ return 0;
+ default:
+ return -ENXIO;
+ break;
+ }
+}
+static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr)
+{
+ if (attr->group)
+ return -ENXIO;
+
+ switch (attr->attr) {
+ case KVM_X86_XCOMP_GUEST_SUPP:
+ return 0;
+ default:
+ return -ENXIO;
+ }
}
long kvm_arch_dev_ioctl(struct file *filp,
@@ -4417,6 +4466,22 @@ long kvm_arch_dev_ioctl(struct file *filp,
case KVM_GET_SUPPORTED_HV_CPUID:
r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
break;
+ case KVM_GET_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_get_attr(&attr);
+ break;
+ }
+ case KVM_HAS_DEVICE_ATTR: {
+ struct kvm_device_attr attr;
+ r = -EFAULT;
+ if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
+ break;
+ r = kvm_x86_dev_has_attr(&attr);
+ break;
+ }
default:
r = -EINVAL;
break;
@@ -4855,8 +4920,10 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
vcpu->arch.apic->sipi_vector = events->sipi_vector;
if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
- if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm)
+ if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
kvm_smm_changed(vcpu, events->smi.smm);
+ }
vcpu->arch.smi_pending = events->smi.pending;
@@ -5017,11 +5084,11 @@ static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
int r;
- if ((u64)(unsigned long)uaddr != attr->addr)
- return -EFAULT;
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET:
@@ -5040,12 +5107,12 @@ static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu,
static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
struct kvm_device_attr *attr)
{
- u64 __user *uaddr = (u64 __user *)(unsigned long)attr->addr;
+ u64 __user *uaddr = kvm_get_attr_addr(attr);
struct kvm *kvm = vcpu->kvm;
int r;
- if ((u64)(unsigned long)uaddr != attr->addr)
- return -EFAULT;
+ if (IS_ERR(uaddr))
+ return PTR_ERR(uaddr);
switch (attr->attr) {
case KVM_VCPU_TSC_OFFSET: {
@@ -5230,17 +5297,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
- /*
- * KVM does not correctly handle changing guest CPUID after KVM_RUN, as
- * MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
- * tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
- * faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
- * the core vCPU model on the fly, so fail.
- */
- r = -EINVAL;
- if (vcpu->arch.last_vmentry_cpu != -1)
- goto out;
-
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5251,14 +5307,6 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
- /*
- * KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
- * KVM_SET_CPUID case above.
- */
- r = -EINVAL;
- if (vcpu->arch.last_vmentry_cpu != -1)
- goto out;
-
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -6824,6 +6872,13 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val,
}
EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
+static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
+ void *insn, int insn_len)
+{
+ return static_call(kvm_x86_can_emulate_instruction)(vcpu, emul_type,
+ insn, insn_len);
+}
+
int handle_ud(struct kvm_vcpu *vcpu)
{
static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
@@ -6831,7 +6886,7 @@ int handle_ud(struct kvm_vcpu *vcpu)
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
- if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, NULL, 0)))
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
return 1;
if (force_emulation_prefix &&
@@ -8207,7 +8262,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool writeback = true;
bool write_fault_to_spt;
- if (unlikely(!static_call(kvm_x86_can_emulate_instruction)(vcpu, insn, insn_len)))
+ if (unlikely(!kvm_can_emulate_insn(vcpu, emulation_type, insn, insn_len)))
return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -9720,7 +9775,7 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
-void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
+static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
return;
@@ -9945,10 +10000,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
smp_mb__after_srcu_read_unlock();
/*
- * This handles the case where a posted interrupt was
- * notified with kvm_vcpu_kick. Assigned devices can
- * use the POSTED_INTR_VECTOR even if APICv is disabled,
- * so do it even if APICv is disabled on this vCPU.
+ * Process pending posted interrupts to handle the case where the
+ * notification IRQ arrived in the host, or was never sent (because the
+ * target vCPU wasn't running). Do this regardless of the vCPU's APICv
+ * status, KVM doesn't update assigned devices when APICv is inhibited,
+ * i.e. they can post interrupts even if APICv is temporarily disabled.
*/
if (kvm_lapic_enabled(vcpu))
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
@@ -10113,8 +10169,20 @@ out:
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
- if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops.pre_block || static_call(kvm_x86_pre_block)(vcpu) == 0)) {
+ bool hv_timer;
+
+ if (!kvm_arch_vcpu_runnable(vcpu)) {
+ /*
+ * Switch to the software timer before halt-polling/blocking as
+ * the guest's timer may be a break event for the vCPU, and the
+ * hypervisor timer runs only when the CPU is in guest mode.
+ * Switch before halt-polling so that KVM recognizes an expired
+ * timer before blocking.
+ */
+ hv_timer = kvm_lapic_hv_timer_in_use(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
kvm_vcpu_halt(vcpu);
@@ -10122,8 +10190,8 @@ static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_x86_ops.post_block)
- static_call(kvm_x86_post_block)(vcpu);
+ if (hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@@ -10316,6 +10384,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = -EINTR;
goto out;
}
+ /*
+ * It should be impossible for the hypervisor timer to be in
+ * use before KVM has ever run the vCPU.
+ */
+ WARN_ON_ONCE(kvm_lapic_hv_timer_in_use(vcpu));
kvm_vcpu_block(vcpu);
if (kvm_apic_accept_events(vcpu) < 0) {
r = 0;
@@ -10360,10 +10433,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
} else
WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
- if (kvm_run->immediate_exit)
+ if (kvm_run->immediate_exit) {
r = -EINTR;
- else
- r = vcpu_run(vcpu);
+ goto out;
+ }
+
+ r = static_call(kvm_x86_vcpu_pre_run)(vcpu);
+ if (r <= 0)
+ goto out;
+
+ r = vcpu_run(vcpu);
out:
kvm_put_guest_fpu(vcpu);
@@ -11199,7 +11278,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
vcpu->arch.msr_misc_features_enables = 0;
- vcpu->arch.xcr0 = XFEATURE_MASK_FP;
+ __kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
+ __kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
}
/* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */
@@ -11216,8 +11296,6 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1, 0);
kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
- vcpu->arch.ia32_xss = 0;
-
static_call(kvm_x86_vcpu_reset)(vcpu, init_event);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index bec8ed090abc..635b75f9e145 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -336,6 +336,7 @@ extern u64 host_xcr0;
extern u64 supported_xcr0;
extern u64 host_xss;
extern u64 supported_xss;
+extern bool enable_pmu;
static inline bool kvm_mpx_supported(void)
{
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 0e3f7d6e9fd7..bad57535fad0 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -316,10 +316,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
"\tnotq %0\n"
"\t" LOCK_PREFIX "andq %0, %2\n"
"2:\n"
- "\t.section .fixup,\"ax\"\n"
- "3:\tjmp\t2b\n"
- "\t.previous\n"
- _ASM_EXTABLE_UA(1b, 3b)
+ _ASM_EXTABLE_UA(1b, 2b)
: "=r" (evtchn_pending_sel),
"+m" (vi->evtchn_pending_sel),
"+m" (v->arch.xen.evtchn_pending_sel)
@@ -335,10 +332,7 @@ int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
"\tnotl %0\n"
"\t" LOCK_PREFIX "andl %0, %2\n"
"2:\n"
- "\t.section .fixup,\"ax\"\n"
- "3:\tjmp\t2b\n"
- "\t.previous\n"
- _ASM_EXTABLE_UA(1b, 3b)
+ _ASM_EXTABLE_UA(1b, 2b)
: "=r" (evtchn_pending_sel32),
"+m" (vi->evtchn_pending_sel),
"+m" (v->arch.xen.evtchn_pending_sel)
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 2edd86649468..615a76d70019 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -353,8 +353,8 @@ static void pci_fixup_video(struct pci_dev *pdev)
}
}
}
-DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID,
- PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
+DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_ANY_ID, PCI_ANY_ID,
+ PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video);
static const struct dmi_system_id msi_k8t_dmi_table[] = {
diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig
index 95d26a69088b..40d6a06e41c8 100644
--- a/arch/x86/um/Kconfig
+++ b/arch/x86/um/Kconfig
@@ -8,7 +8,6 @@ endmenu
config UML_X86
def_bool y
- select GENERIC_FIND_FIRST_BIT
config 64BIT
bool "64-bit kernel" if "$(SUBARCH)" = "x86"
diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h
index 3f71d364ba90..cd225896c40f 100644
--- a/arch/xtensa/include/asm/bitops.h
+++ b/arch/xtensa/include/asm/bitops.h
@@ -205,7 +205,6 @@ BIT_OPS(change, "xor", )
#undef BIT_OP
#undef TEST_AND_BIT_OP
-#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/le.h>
#include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 07b642c1916a..8eb6ad1a3a1d 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -208,7 +208,7 @@ static int simdisk_detach(struct simdisk *dev)
static ssize_t proc_read_simdisk(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
- struct simdisk *dev = PDE_DATA(file_inode(file));
+ struct simdisk *dev = pde_data(file_inode(file));
const char *s = dev->filename;
if (s) {
ssize_t n = simple_read_from_buffer(buf, size, ppos,
@@ -225,7 +225,7 @@ static ssize_t proc_write_simdisk(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
char *tmp = memdup_user_nul(buf, count);
- struct simdisk *dev = PDE_DATA(file_inode(file));
+ struct simdisk *dev = pde_data(file_inode(file));
int err;
if (IS_ERR(tmp))