summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig13
-rw-r--r--arch/x86/Makefile10
-rw-r--r--arch/x86/boot/bitops.h4
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S5
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/events/intel/core.c40
-rw-r--r--arch/x86/events/intel/ds.c9
-rw-r--r--arch/x86/events/perf_event.h2
-rw-r--r--arch/x86/include/asm/acrn.h14
-rw-r--r--arch/x86/include/asm/bitops.h54
-rw-r--r--arch/x86/include/asm/cfi.h22
-rw-r--r--arch/x86/include/asm/cpu.h2
-rw-r--r--arch/x86/include/asm/hyperv-tlfs.h22
-rw-r--r--arch/x86/include/asm/intel-family.h3
-rw-r--r--arch/x86/include/asm/kvm-x86-ops.h2
-rw-r--r--arch/x86/include/asm/kvm_host.h39
-rw-r--r--arch/x86/include/asm/linkage.h12
-rw-r--r--arch/x86/include/asm/mc146818rtc.h2
-rw-r--r--arch/x86/include/asm/mce.h1
-rw-r--r--arch/x86/include/asm/microcode.h4
-rw-r--r--arch/x86/include/asm/msr-index.h13
-rw-r--r--arch/x86/include/asm/paravirt.h1
-rw-r--r--arch/x86/include/asm/paravirt_types.h28
-rw-r--r--arch/x86/include/asm/resctrl.h9
-rw-r--r--arch/x86/include/asm/smp.h25
-rw-r--r--arch/x86/include/asm/uaccess.h16
-rw-r--r--arch/x86/include/asm/uaccess_64.h45
-rw-r--r--arch/x86/kernel/Makefile2
-rw-r--r--arch/x86/kernel/alternative.c54
-rw-r--r--arch/x86/kernel/amd_gart_64.c2
-rw-r--r--arch/x86/kernel/aperture_64.c2
-rw-r--r--arch/x86/kernel/apic/apic.c44
-rw-r--r--arch/x86/kernel/cfi.c86
-rw-r--r--arch/x86/kernel/cpu/acrn.c3
-rw-r--r--arch/x86/kernel/cpu/feat_ctl.c2
-rw-r--r--arch/x86/kernel/cpu/mce/apei.c13
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c10
-rw-r--r--arch/x86/kernel/cpu/microcode/core.c5
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c17
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c117
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c75
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h61
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c232
-rw-r--r--arch/x86/kernel/cpu/resctrl/pseudo_lock.c14
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c216
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.c29
-rw-r--r--arch/x86/kernel/cpu/sgx/encl.h2
-rw-r--r--arch/x86/kernel/cpu/sgx/main.c15
-rw-r--r--arch/x86/kernel/dumpstack.c2
-rw-r--r--arch/x86/kernel/early_printk.c14
-rw-r--r--arch/x86/kernel/rtc.c63
-rw-r--r--arch/x86/kernel/traps.c4
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/cpuid.c20
-rw-r--r--arch/x86/kvm/emulate.c31
-rw-r--r--arch/x86/kvm/hyperv.c70
-rw-r--r--arch/x86/kvm/hyperv.h6
-rw-r--r--arch/x86/kvm/lapic.c38
-rw-r--r--arch/x86/kvm/lapic.h9
-rw-r--r--arch/x86/kvm/mmu/mmu.c22
-rw-r--r--arch/x86/kvm/mmu/paging_tmpl.h2
-rw-r--r--arch/x86/kvm/mmu/tdp_mmu.c12
-rw-r--r--arch/x86/kvm/svm/nested.c124
-rw-r--r--arch/x86/kvm/svm/svm.c35
-rw-r--r--arch/x86/kvm/trace.h53
-rw-r--r--arch/x86/kvm/vmx/capabilities.h14
-rw-r--r--arch/x86/kvm/vmx/evmcs.c192
-rw-r--r--arch/x86/kvm/vmx/evmcs.h10
-rw-r--r--arch/x86/kvm/vmx/nested.c470
-rw-r--r--arch/x86/kvm/vmx/nested.h2
-rw-r--r--arch/x86/kvm/vmx/sgx.c2
-rw-r--r--arch/x86/kvm/vmx/vmenter.S24
-rw-r--r--arch/x86/kvm/vmx/vmx.c321
-rw-r--r--arch/x86/kvm/vmx/vmx.h172
-rw-r--r--arch/x86/kvm/vmx/vmx_ops.h2
-rw-r--r--arch/x86/kvm/x86.c576
-rw-r--r--arch/x86/kvm/x86.h16
-rw-r--r--arch/x86/kvm/xen.c1
-rw-r--r--arch/x86/lib/clear_page_64.S138
-rw-r--r--arch/x86/lib/memcpy_64.S3
-rw-r--r--arch/x86/lib/usercopy.c2
-rw-r--r--arch/x86/lib/usercopy_64.c40
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/fault.c10
-rw-r--r--arch/x86/net/bpf_jit_comp.c102
-rw-r--r--arch/x86/platform/efi/efi.c2
-rw-r--r--arch/x86/platform/efi/efi_64.c18
-rw-r--r--arch/x86/platform/efi/efi_thunk_64.S13
-rw-r--r--arch/x86/purgatory/Makefile4
-rw-r--r--arch/x86/tools/relocs.c1
-rw-r--r--arch/x86/xen/enlighten_pv.c3
91 files changed, 2657 insertions, 1392 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f9920f1341c8..088af7c84e5d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -107,6 +107,8 @@ config X86
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
+ select ARCH_SUPPORTS_CFI_CLANG if X86_64
+ select ARCH_USES_CFI_TRAPS if X86_64 && CFI_CLANG
select ARCH_SUPPORTS_LTO_CLANG
select ARCH_SUPPORTS_LTO_CLANG_THIN
select ARCH_USE_BUILTIN_BSWAP
@@ -257,6 +259,7 @@ config X86
select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL
select HAVE_PREEMPT_DYNAMIC_CALL
select HAVE_RSEQ
+ select HAVE_RUST if X86_64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_UACCESS_VALIDATION if HAVE_OBJTOOL
select HAVE_UNSTABLE_SCHED_CLOCK
@@ -284,6 +287,7 @@ config X86
select PROC_PID_ARCH_STATUS if PROC_FS
select HAVE_ARCH_NODE_DEV_GROUP if X86_SGX
imply IMA_SECURE_AND_OR_TRUSTED_BOOT if EFI
+ select HAVE_DYNAMIC_FTRACE_NO_PATCHABLE
config INSTRUCTION_DECODER
def_bool y
@@ -448,6 +452,11 @@ config X86_X2APIC
This allows 32-bit apic IDs (so it can support very large systems),
and accesses the local apic via MSRs not via mmio.
+ Some Intel systems circa 2022 and later are locked into x2APIC mode
+ and can not fall back to the legacy APIC modes if SGX or TDX are
+ enabled in the BIOS. They will be unable to boot without enabling
+ this option.
+
If you don't know what to do here, say N.
config X86_MPPARSE
@@ -1919,7 +1928,7 @@ endchoice
config X86_SGX
bool "Software Guard eXtensions (SGX)"
- depends on X86_64 && CPU_SUP_INTEL
+ depends on X86_64 && CPU_SUP_INTEL && X86_X2APIC
depends on CRYPTO=y
depends on CRYPTO_SHA256=y
select SRCU
@@ -2569,7 +2578,7 @@ menuconfig APM
1) make sure that you have enough swap space and that it is
enabled.
- 2) pass the "no-hlt" option to the kernel
+ 2) pass the "idle=poll" option to the kernel
3) switch on floating point emulation in the kernel and pass
the "no387" option to the kernel
4) pass the "floppy=nodma" option to the kernel
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index bafbd905e6e7..2d7e640674c6 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -68,6 +68,7 @@ export BITS
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
#
KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
+KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
ifeq ($(CONFIG_X86_KERNEL_IBT),y)
#
@@ -155,8 +156,17 @@ else
cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic
KBUILD_CFLAGS += $(cflags-y)
+ rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8
+ rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona
+ rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2
+ rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom
+ rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic
+ KBUILD_RUSTFLAGS += $(rustflags-y)
+
KBUILD_CFLAGS += -mno-red-zone
KBUILD_CFLAGS += -mcmodel=kernel
+ KBUILD_RUSTFLAGS += -Cno-redzone=y
+ KBUILD_RUSTFLAGS += -Ccode-model=kernel
endif
#
diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h
index 02e1dea11d94..8518ae214c9b 100644
--- a/arch/x86/boot/bitops.h
+++ b/arch/x86/boot/bitops.h
@@ -19,13 +19,13 @@
static inline bool constant_test_bit(int nr, const void *addr)
{
- const u32 *p = (const u32 *)addr;
+ const u32 *p = addr;
return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
}
static inline bool variable_test_bit(int nr, const void *addr)
{
bool v;
- const u32 *p = (const u32 *)addr;
+ const u32 *p = addr;
asm("btl %2,%1" CC_SET(c) : CC_OUT(c) (v) : "m" (*p), "Ir" (nr));
return v;
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
index 802d71582689..4a43e072d2d1 100644
--- a/arch/x86/crypto/blowfish-x86_64-asm_64.S
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -6,6 +6,7 @@
*/
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
.file "blowfish-x86_64-asm.S"
.text
@@ -141,7 +142,7 @@ SYM_FUNC_START(__blowfish_enc_blk)
RET;
SYM_FUNC_END(__blowfish_enc_blk)
-SYM_FUNC_START(blowfish_dec_blk)
+SYM_TYPED_FUNC_START(blowfish_dec_blk)
/* input:
* %rdi: ctx
* %rsi: dst
@@ -332,7 +333,7 @@ SYM_FUNC_START(__blowfish_enc_blk_4way)
RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)
-SYM_FUNC_START(blowfish_dec_blk_4way)
+SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
/* input:
* %rdi: ctx
* %rsi: dst
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 12f6c4d714cd..381d3333b996 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -91,7 +91,7 @@ ifneq ($(RETPOLINE_VDSO_CFLAGS),)
endif
endif
-$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(CC_FLAGS_CFI) $(RANDSTRUCT_CFLAGS) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
$(vobjs): KBUILD_AFLAGS += -DBUILD_VDSO
#
@@ -153,6 +153,7 @@ KBUILD_CFLAGS_32 := $(filter-out $(RANDSTRUCT_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_LTO),$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(CC_FLAGS_CFI),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += -fno-stack-protector
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 3939debb27e7..a646a5f9a235 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2102,6 +2102,15 @@ static struct extra_reg intel_tnt_extra_regs[] __read_mostly = {
EVENT_EXTRA_END
};
+EVENT_ATTR_STR(mem-loads, mem_ld_grt, "event=0xd0,umask=0x5,ldlat=3");
+EVENT_ATTR_STR(mem-stores, mem_st_grt, "event=0xd0,umask=0x6");
+
+static struct attribute *grt_mem_attrs[] = {
+ EVENT_PTR(mem_ld_grt),
+ EVENT_PTR(mem_st_grt),
+ NULL
+};
+
static struct extra_reg intel_grt_extra_regs[] __read_mostly = {
/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0),
@@ -5995,6 +6004,36 @@ __init int intel_pmu_init(void)
name = "Tremont";
break;
+ case INTEL_FAM6_ALDERLAKE_N:
+ x86_pmu.mid_ack = true;
+ memcpy(hw_cache_event_ids, glp_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+ memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs,
+ sizeof(hw_cache_extra_regs));
+ hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1;
+
+ x86_pmu.event_constraints = intel_slm_event_constraints;
+ x86_pmu.pebs_constraints = intel_grt_pebs_event_constraints;
+ x86_pmu.extra_regs = intel_grt_extra_regs;
+
+ x86_pmu.pebs_aliases = NULL;
+ x86_pmu.pebs_prec_dist = true;
+ x86_pmu.pebs_block = true;
+ x86_pmu.lbr_pt_coexist = true;
+ x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+ x86_pmu.flags |= PMU_FL_INSTR_LATENCY;
+
+ intel_pmu_pebs_data_source_grt();
+ x86_pmu.pebs_latency_data = adl_latency_data_small;
+ x86_pmu.get_event_constraints = tnt_get_event_constraints;
+ x86_pmu.limit_period = spr_limit_period;
+ td_attr = tnt_events_attrs;
+ mem_attr = grt_mem_attrs;
+ extra_attr = nhm_format_attr;
+ pr_cont("Gracemont events, ");
+ name = "gracemont";
+ break;
+
case INTEL_FAM6_WESTMERE:
case INTEL_FAM6_WESTMERE_EP:
case INTEL_FAM6_WESTMERE_EX:
@@ -6341,7 +6380,6 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_ALDERLAKE:
case INTEL_FAM6_ALDERLAKE_L:
- case INTEL_FAM6_ALDERLAKE_N:
case INTEL_FAM6_RAPTORLAKE:
case INTEL_FAM6_RAPTORLAKE_P:
case INTEL_FAM6_RAPTORLAKE_S:
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 1380ae13ad2b..7839507b3844 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -110,13 +110,18 @@ void __init intel_pmu_pebs_data_source_skl(bool pmem)
__intel_pmu_pebs_data_source_skl(pmem, pebs_data_source);
}
-static void __init intel_pmu_pebs_data_source_grt(u64 *data_source)
+static void __init __intel_pmu_pebs_data_source_grt(u64 *data_source)
{
data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT);
data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM);
data_source[0x08] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOPX, FWD);
}
+void __init intel_pmu_pebs_data_source_grt(void)
+{
+ __intel_pmu_pebs_data_source_grt(pebs_data_source);
+}
+
void __init intel_pmu_pebs_data_source_adl(void)
{
u64 *data_source;
@@ -127,7 +132,7 @@ void __init intel_pmu_pebs_data_source_adl(void)
data_source = x86_pmu.hybrid_pmu[X86_HYBRID_PMU_ATOM_IDX].pebs_data_source;
memcpy(data_source, pebs_data_source, sizeof(pebs_data_source));
- intel_pmu_pebs_data_source_grt(data_source);
+ __intel_pmu_pebs_data_source_grt(data_source);
}
static u64 precise_store_data(u64 status)
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 371967054871..332d2e6d8ae4 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1598,6 +1598,8 @@ void intel_pmu_pebs_data_source_skl(bool pmem);
void intel_pmu_pebs_data_source_adl(void);
+void intel_pmu_pebs_data_source_grt(void);
+
int intel_pmu_setup_lbr_filter(struct perf_event *event);
void intel_pt_interrupt(void);
diff --git a/arch/x86/include/asm/acrn.h b/arch/x86/include/asm/acrn.h
index e003a01b7c67..1dd14381bcb6 100644
--- a/arch/x86/include/asm/acrn.h
+++ b/arch/x86/include/asm/acrn.h
@@ -10,6 +10,15 @@
/* Bit 0 indicates whether guest VM is privileged */
#define ACRN_FEATURE_PRIVILEGED_VM BIT(0)
+/*
+ * Timing Information.
+ * This leaf returns the current TSC frequency in kHz.
+ *
+ * EAX: (Virtual) TSC frequency in kHz.
+ * EBX, ECX, EDX: RESERVED (reserved fields are set to zero).
+ */
+#define ACRN_CPUID_TIMING_INFO 0x40000010
+
void acrn_setup_intr_handler(void (*handler)(void));
void acrn_remove_intr_handler(void);
@@ -21,6 +30,11 @@ static inline u32 acrn_cpuid_base(void)
return 0;
}
+static inline unsigned long acrn_get_tsc_khz(void)
+{
+ return cpuid_eax(ACRN_CPUID_TIMING_INFO);
+}
+
/*
* Hypercalls for ACRN
*
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 0fe9de58af31..2edf68475fec 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -247,17 +247,30 @@ arch_test_bit_acquire(unsigned long nr, const volatile unsigned long *addr)
variable_test_bit(nr, addr);
}
+static __always_inline unsigned long variable__ffs(unsigned long word)
+{
+ asm("rep; bsf %1,%0"
+ : "=r" (word)
+ : "rm" (word));
+ return word;
+}
+
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
-static __always_inline unsigned long __ffs(unsigned long word)
+#define __ffs(word) \
+ (__builtin_constant_p(word) ? \
+ (unsigned long)__builtin_ctzl(word) : \
+ variable__ffs(word))
+
+static __always_inline unsigned long variable_ffz(unsigned long word)
{
asm("rep; bsf %1,%0"
: "=r" (word)
- : "rm" (word));
+ : "r" (~word));
return word;
}
@@ -267,13 +280,10 @@ static __always_inline unsigned long __ffs(unsigned long word)
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
-static __always_inline unsigned long ffz(unsigned long word)
-{
- asm("rep; bsf %1,%0"
- : "=r" (word)
- : "r" (~word));
- return word;
-}
+#define ffz(word) \
+ (__builtin_constant_p(word) ? \
+ (unsigned long)__builtin_ctzl(~word) : \
+ variable_ffz(word))
/*
* __fls: find last set bit in word
@@ -292,18 +302,7 @@ static __always_inline unsigned long __fls(unsigned long word)
#undef ADDR
#ifdef __KERNEL__
-/**
- * ffs - find first set bit in word
- * @x: the word to search
- *
- * This is defined the same way as the libc and compiler builtin ffs
- * routines, therefore differs in spirit from the other bitops.
- *
- * ffs(value) returns 0 if value is 0 or the position of the first
- * set bit if value is nonzero. The first (least significant) bit
- * is at position 1.
- */
-static __always_inline int ffs(int x)
+static __always_inline int variable_ffs(int x)
{
int r;
@@ -334,6 +333,19 @@ static __always_inline int ffs(int x)
}
/**
+ * ffs - find first set bit in word
+ * @x: the word to search
+ *
+ * This is defined the same way as the libc and compiler builtin ffs
+ * routines, therefore differs in spirit from the other bitops.
+ *
+ * ffs(value) returns 0 if value is 0 or the position of the first
+ * set bit if value is nonzero. The first (least significant) bit
+ * is at position 1.
+ */
+#define ffs(x) (__builtin_constant_p(x) ? __builtin_ffs(x) : variable_ffs(x))
+
+/**
* fls - find last set bit in word
* @x: the word to search
*
diff --git a/arch/x86/include/asm/cfi.h b/arch/x86/include/asm/cfi.h
new file mode 100644
index 000000000000..58dacd90daef
--- /dev/null
+++ b/arch/x86/include/asm/cfi.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CFI_H
+#define _ASM_X86_CFI_H
+
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+
+#include <linux/cfi.h>
+
+#ifdef CONFIG_CFI_CLANG
+enum bug_trap_type handle_cfi_failure(struct pt_regs *regs);
+#else
+static inline enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+ return BUG_TRAP_TYPE_NONE;
+}
+#endif /* CONFIG_CFI_CLANG */
+
+#endif /* _ASM_X86_CFI_H */
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index 8cbf623f0ecf..b472ef76826a 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -94,4 +94,6 @@ static inline bool intel_cpu_signatures_match(unsigned int s1, unsigned int p1,
return p1 & p2;
}
+extern u64 x86_read_arch_cap_msr(void);
+
#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/hyperv-tlfs.h b/arch/x86/include/asm/hyperv-tlfs.h
index 0a9407dc0859..3089ec352743 100644
--- a/arch/x86/include/asm/hyperv-tlfs.h
+++ b/arch/x86/include/asm/hyperv-tlfs.h
@@ -138,6 +138,9 @@
#define HV_X64_NESTED_GUEST_MAPPING_FLUSH BIT(18)
#define HV_X64_NESTED_MSR_BITMAP BIT(19)
+/* Nested features #2. These are HYPERV_CPUID_NESTED_FEATURES.EBX bits. */
+#define HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL BIT(0)
+
/*
* This is specific to AMD and specifies that enlightened TLB flush is
* supported. If guest opts in to this feature, ASID invalidations only
@@ -546,7 +549,7 @@ struct hv_enlightened_vmcs {
u64 guest_rip;
u32 hv_clean_fields;
- u32 hv_padding_32;
+ u32 padding32_1;
u32 hv_synthetic_controls;
struct {
u32 nested_flush_hypercall:1;
@@ -554,14 +557,25 @@ struct hv_enlightened_vmcs {
u32 reserved:30;
} __packed hv_enlightenments_control;
u32 hv_vp_id;
-
+ u32 padding32_2;
u64 hv_vm_id;
u64 partition_assist_page;
u64 padding64_4[4];
u64 guest_bndcfgs;
- u64 padding64_5[7];
+ u64 guest_ia32_perf_global_ctrl;
+ u64 guest_ia32_s_cet;
+ u64 guest_ssp;
+ u64 guest_ia32_int_ssp_table_addr;
+ u64 guest_ia32_lbr_ctl;
+ u64 padding64_5[2];
u64 xss_exit_bitmap;
- u64 padding64_6[7];
+ u64 encls_exiting_bitmap;
+ u64 host_ia32_perf_global_ctrl;
+ u64 tsc_multiplier;
+ u64 host_ia32_s_cet;
+ u64 host_ssp;
+ u64 host_ia32_int_ssp_table_addr;
+ u64 padding64_6;
} __packed;
#define HV_VMX_ENLIGHTENED_CLEAN_FIELD_NONE 0
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index aeb38023a703..5d75fe229342 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -115,6 +115,9 @@
#define INTEL_FAM6_RAPTORLAKE_P 0xBA
#define INTEL_FAM6_RAPTORLAKE_S 0xBF
+#define INTEL_FAM6_METEORLAKE 0xAC
+#define INTEL_FAM6_METEORLAKE_L 0xAA
+
/* "Small Core" Processors (Atom) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index 51f777071584..82ba4a564e58 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -67,7 +67,7 @@ KVM_X86_OP(get_interrupt_shadow)
KVM_X86_OP(patch_hypercall)
KVM_X86_OP(inject_irq)
KVM_X86_OP(inject_nmi)
-KVM_X86_OP(queue_exception)
+KVM_X86_OP(inject_exception)
KVM_X86_OP(cancel_injection)
KVM_X86_OP(interrupt_allowed)
KVM_X86_OP(nmi_allowed)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa381ab69a19..61b9dd34d333 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -615,6 +615,8 @@ struct kvm_vcpu_hv {
u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */
u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */
u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */
+ u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */
+ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */
} cpuid_cache;
};
@@ -639,6 +641,16 @@ struct kvm_vcpu_xen {
struct timer_list poll_timer;
};
+struct kvm_queued_exception {
+ bool pending;
+ bool injected;
+ bool has_error_code;
+ u8 vector;
+ u32 error_code;
+ unsigned long payload;
+ bool has_payload;
+};
+
struct kvm_vcpu_arch {
/*
* rip and regs accesses must go through
@@ -738,16 +750,12 @@ struct kvm_vcpu_arch {
u8 event_exit_inst_len;
- struct kvm_queued_exception {
- bool pending;
- bool injected;
- bool has_error_code;
- u8 nr;
- u32 error_code;
- unsigned long payload;
- bool has_payload;
- u8 nested_apf;
- } exception;
+ bool exception_from_userspace;
+
+ /* Exceptions to be injected to the guest. */
+ struct kvm_queued_exception exception;
+ /* Exception VM-Exits to be synthesized to L1. */
+ struct kvm_queued_exception exception_vmexit;
struct kvm_queued_interrupt {
bool injected;
@@ -858,7 +866,6 @@ struct kvm_vcpu_arch {
u32 id;
bool send_user_only;
u32 host_apf_flags;
- unsigned long nested_apf_token;
bool delivery_as_pf_vmexit;
bool pageready_pending;
} apf;
@@ -1524,7 +1531,7 @@ struct kvm_x86_ops {
unsigned char *hypercall_addr);
void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected);
void (*inject_nmi)(struct kvm_vcpu *vcpu);
- void (*queue_exception)(struct kvm_vcpu *vcpu);
+ void (*inject_exception)(struct kvm_vcpu *vcpu);
void (*cancel_injection)(struct kvm_vcpu *vcpu);
int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection);
@@ -1634,10 +1641,10 @@ struct kvm_x86_ops {
struct kvm_x86_nested_ops {
void (*leave_nested)(struct kvm_vcpu *vcpu);
+ bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code);
int (*check_events)(struct kvm_vcpu *vcpu);
- bool (*handle_page_fault_workaround)(struct kvm_vcpu *vcpu,
- struct x86_exception *fault);
- bool (*hv_timer_pending)(struct kvm_vcpu *vcpu);
+ bool (*has_events)(struct kvm_vcpu *vcpu);
void (*triple_fault)(struct kvm_vcpu *vcpu);
int (*get_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
@@ -1863,7 +1870,7 @@ void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long pay
void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
-bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault);
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr);
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 73ca20049835..f484d656d34e 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -43,6 +43,18 @@
#endif /* __ASSEMBLY__ */
+#define __CFI_TYPE(name) \
+ SYM_START(__cfi_##name, SYM_L_LOCAL, SYM_A_NONE) \
+ .fill 11, 1, 0x90 ASM_NL \
+ .byte 0xb8 ASM_NL \
+ .long __kcfi_typeid_##name ASM_NL \
+ SYM_FUNC_END(__cfi_##name)
+
+/* SYM_TYPED_FUNC_START -- use for indirectly called globals, w/ CFI type */
+#define SYM_TYPED_FUNC_START(name) \
+ SYM_TYPED_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
+ ENDBR
+
/* SYM_FUNC_START -- use for global functions */
#define SYM_FUNC_START(name) \
SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) \
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
index 97198001e567..6115bb3d5795 100644
--- a/arch/x86/include/asm/mc146818rtc.h
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -95,7 +95,7 @@ static inline unsigned char current_lock_cmos_reg(void)
unsigned char rtc_cmos_read(unsigned char addr);
void rtc_cmos_write(unsigned char val, unsigned char addr);
-extern int mach_set_rtc_mmss(const struct timespec64 *now);
+extern int mach_set_cmos_time(const struct timespec64 *now);
extern void mach_get_cmos_time(struct timespec64 *now);
#define RTC_IRQ 8
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index cc73061e7255..6e986088817d 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -42,6 +42,7 @@
#define MCI_STATUS_CEC_SHIFT 38 /* Corrected Error Count */
#define MCI_STATUS_CEC_MASK GENMASK_ULL(52,38)
#define MCI_STATUS_CEC(c) (((c) & MCI_STATUS_CEC_MASK) >> MCI_STATUS_CEC_SHIFT)
+#define MCI_STATUS_MSCOD(m) (((m) >> 16) & 0xffff)
/* AMD-specific bits */
#define MCI_STATUS_TCC BIT_ULL(55) /* Task context corrupt */
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 0c3d3440fe27..74ecc2bd6cd0 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,6 +9,7 @@
struct ucode_patch {
struct list_head plist;
void *data; /* Intel uses only this one */
+ unsigned int size;
u32 patch_id;
u16 equiv_cpu;
};
@@ -32,9 +33,6 @@ enum ucode_state {
};
struct microcode_ops {
- enum ucode_state (*request_microcode_user) (int cpu,
- const void __user *buf, size_t size);
-
enum ucode_state (*request_microcode_fw) (int cpu, struct device *,
bool refresh_fw);
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 109c404e6137..10ac52705892 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -155,6 +155,11 @@
* Return Stack Buffer Predictions.
*/
+#define ARCH_CAP_XAPIC_DISABLE BIT(21) /*
+ * IA32_XAPIC_DISABLE_STATUS MSR
+ * supported
+ */
+
#define MSR_IA32_FLUSH_CMD 0x0000010b
#define L1D_FLUSH BIT(0) /*
* Writeback and invalidate the
@@ -1059,4 +1064,12 @@
#define MSR_IA32_HW_FEEDBACK_PTR 0x17d0
#define MSR_IA32_HW_FEEDBACK_CONFIG 0x17d1
+/* x2APIC locked status */
+#define MSR_IA32_XAPIC_DISABLE_STATUS 0xBD
+#define LEGACY_XAPIC_DISABLED BIT(0) /*
+ * x2APIC mode is locked and
+ * disabling x2APIC will cause
+ * a #GP
+ */
+
#endif /* _ASM_X86_MSR_INDEX_H */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 964442b99245..2a0b8dd4ec33 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -743,6 +743,7 @@ extern void default_banner(void);
word 771b; \
.byte ptype; \
.byte 772b-771b; \
+ _ASM_ALIGN; \
.popsection
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 89df6c6617f5..f3d601574730 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,6 +294,7 @@ extern struct paravirt_patch_template pv_ops;
" .byte " type "\n" \
" .byte 772b-771b\n" \
" .short " clobber "\n" \
+ _ASM_ALIGN "\n" \
".popsection\n"
/* Generate patchable code, with the default asm parameters. */
@@ -328,7 +329,7 @@ int paravirt_disable_iospace(void);
* Unfortunately, this is a relatively slow operation for modern CPUs,
* because it cannot necessarily determine what the destination
* address is. In this case, the address is a runtime constant, so at
- * the very least we can patch the call to e a simple direct call, or
+ * the very least we can patch the call to a simple direct call, or,
* ideally, patch an inline implementation into the callsite. (Direct
* calls are essentially free, because the call and return addresses
* are completely predictable.)
@@ -339,10 +340,10 @@ int paravirt_disable_iospace(void);
* on the stack. All caller-save registers (eax,edx,ecx) are expected
* to be modified (either clobbered or used for return values).
* X86_64, on the other hand, already specifies a register-based calling
- * conventions, returning at %rax, with parameters going on %rdi, %rsi,
+ * conventions, returning at %rax, with parameters going in %rdi, %rsi,
* %rdx, and %rcx. Note that for this reason, x86_64 does not need any
* special handling for dealing with 4 arguments, unlike i386.
- * However, x86_64 also have to clobber all caller saved registers, which
+ * However, x86_64 also has to clobber all caller saved registers, which
* unfortunately, are quite a bit (r8 - r11)
*
* The call instruction itself is marked by placing its start address
@@ -360,22 +361,22 @@ int paravirt_disable_iospace(void);
* There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
* It could be extended to more arguments, but there would be little
* to be gained from that. For each number of arguments, there are
- * the two VCALL and CALL variants for void and non-void functions.
+ * two VCALL and CALL variants for void and non-void functions.
*
* When there is a return value, the invoker of the macro must specify
* the return type. The macro then uses sizeof() on that type to
- * determine whether its a 32 or 64 bit value, and places the return
+ * determine whether it's a 32 or 64 bit value and places the return
* in the right register(s) (just %eax for 32-bit, and %edx:%eax for
- * 64-bit). For x86_64 machines, it just returns at %rax regardless of
+ * 64-bit). For x86_64 machines, it just returns in %rax regardless of
* the return value size.
*
- * 64-bit arguments are passed as a pair of adjacent 32-bit arguments
+ * 64-bit arguments are passed as a pair of adjacent 32-bit arguments;
* i386 also passes 64-bit arguments as a pair of adjacent 32-bit arguments
* in low,high order
*
* Small structures are passed and returned in registers. The macro
* calling convention can't directly deal with this, so the wrapper
- * functions must do this.
+ * functions must do it.
*
* These PVOP_* macros are only defined within this header. This
* means that all uses must be wrapped in inline functions. This also
@@ -414,8 +415,17 @@ int paravirt_disable_iospace(void);
"=c" (__ecx)
#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
-/* void functions are still allowed [re]ax for scratch */
+/*
+ * void functions are still allowed [re]ax for scratch.
+ *
+ * The ZERO_CALL_USED REGS feature may end up zeroing out callee-saved
+ * registers. Make sure we model this with the appropriate clobbers.
+ */
+#ifdef CONFIG_ZERO_CALL_USED_REGS
+#define PVOP_VCALLEE_CLOBBERS "=a" (__eax), PVOP_VCALL_CLOBBERS
+#else
#define PVOP_VCALLEE_CLOBBERS "=a" (__eax)
+#endif
#define PVOP_CALLEE_CLOBBERS PVOP_VCALLEE_CLOBBERS
#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
diff --git a/arch/x86/include/asm/resctrl.h b/arch/x86/include/asm/resctrl.h
index d60ed0668a59..d24b04ebf950 100644
--- a/arch/x86/include/asm/resctrl.h
+++ b/arch/x86/include/asm/resctrl.h
@@ -81,6 +81,15 @@ static void __resctrl_sched_in(void)
}
}
+static inline unsigned int resctrl_arch_round_mon_val(unsigned int val)
+{
+ unsigned int scale = boot_cpu_data.x86_cache_occ_scale;
+
+ /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
+ val /= scale;
+ return val * scale;
+}
+
static inline void resctrl_sched_in(void)
{
if (static_branch_likely(&rdt_enable_key))
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 81a0211a372d..a73bced40e24 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -21,16 +21,6 @@ DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_llc_id);
DECLARE_PER_CPU_READ_MOSTLY(u16, cpu_l2c_id);
DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number);
-static inline struct cpumask *cpu_llc_shared_mask(int cpu)
-{
- return per_cpu(cpu_llc_shared_map, cpu);
-}
-
-static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
-{
- return per_cpu(cpu_l2c_shared_map, cpu);
-}
-
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_cpu_to_apicid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u32, x86_cpu_to_acpiid);
DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid);
@@ -172,6 +162,16 @@ extern int safe_smp_processor_id(void);
# define safe_smp_processor_id() smp_processor_id()
#endif
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return per_cpu(cpu_llc_shared_map, cpu);
+}
+
+static inline struct cpumask *cpu_l2c_shared_mask(int cpu)
+{
+ return per_cpu(cpu_l2c_shared_map, cpu);
+}
+
#else /* !CONFIG_SMP */
#define wbinvd_on_cpu(cpu) wbinvd()
static inline int wbinvd_on_all_cpus(void)
@@ -179,6 +179,11 @@ static inline int wbinvd_on_all_cpus(void)
wbinvd();
return 0;
}
+
+static inline struct cpumask *cpu_llc_shared_mask(int cpu)
+{
+ return (struct cpumask *)cpumask_of(0);
+}
#endif /* CONFIG_SMP */
extern unsigned disabled_cpus;
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 913e593a3b45..1ec6a9ea2328 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -448,7 +448,7 @@ do { \
#ifdef CONFIG_X86_32
/*
- * Unlike the normal CMPXCHG, hardcode ECX for both success/fail and error.
+ * Unlike the normal CMPXCHG, use output GPR for both success/fail and error.
* There are only six GPRs available and four (EAX, EBX, ECX, and EDX) are
* hardcoded by CMPXCHG8B, leaving only ESI and EDI. If the compiler uses
* both ESI and EDI for the memory operand, compilation will fail if the error
@@ -461,11 +461,12 @@ do { \
__typeof__(*(_ptr)) __new = (_new); \
asm volatile("\n" \
"1: " LOCK_PREFIX "cmpxchg8b %[ptr]\n" \
- "mov $0, %%ecx\n\t" \
- "setz %%cl\n" \
+ "mov $0, %[result]\n\t" \
+ "setz %b[result]\n" \
"2:\n" \
- _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %%ecx) \
- : [result]"=c" (__result), \
+ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, \
+ %[result]) \
+ : [result] "=q" (__result), \
"+A" (__old), \
[ptr] "+m" (*_ptr) \
: "b" ((u32)__new), \
@@ -502,9 +503,6 @@ strncpy_from_user(char *dst, const char __user *src, long count);
extern __must_check long strnlen_user(const char __user *str, long n);
-unsigned long __must_check clear_user(void __user *mem, unsigned long len);
-unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
-
#ifdef CONFIG_ARCH_HAS_COPY_MC
unsigned long __must_check
copy_mc_to_kernel(void *to, const void *from, unsigned len);
@@ -526,6 +524,8 @@ extern struct movsl_mask {
#define ARCH_HAS_NOCACHE_UACCESS 1
#ifdef CONFIG_X86_32
+unsigned long __must_check clear_user(void __user *mem, unsigned long len);
+unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
# include <asm/uaccess_32.h>
#else
# include <asm/uaccess_64.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 45697e04d771..d13d71af5cf6 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -79,4 +79,49 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
kasan_check_write(dst, size);
return __copy_user_flushcache(dst, src, size);
}
+
+/*
+ * Zero Userspace.
+ */
+
+__must_check unsigned long
+clear_user_original(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_rep_good(void __user *addr, unsigned long len);
+__must_check unsigned long
+clear_user_erms(void __user *addr, unsigned long len);
+
+static __always_inline __must_check unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+ might_fault();
+ stac();
+
+ /*
+ * No memory constraint because it doesn't change any memory gcc
+ * knows about.
+ */
+ asm volatile(
+ "1:\n\t"
+ ALTERNATIVE_3("rep stosb",
+ "call clear_user_erms", ALT_NOT(X86_FEATURE_FSRM),
+ "call clear_user_rep_good", ALT_NOT(X86_FEATURE_ERMS),
+ "call clear_user_original", ALT_NOT(X86_FEATURE_REP_GOOD))
+ "2:\n"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : "+c" (size), "+D" (addr), ASM_CALL_CONSTRAINT
+ : "a" (0)
+ /* rep_good clobbers %rdx */
+ : "rdx");
+
+ clac();
+
+ return size;
+}
+
+static __always_inline unsigned long clear_user(void __user *to, unsigned long n)
+{
+ if (access_ok(to, n))
+ return __clear_user(to, n);
+ return n;
+}
#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a20a5ebfacd7..1286a73ebdbc 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -139,6 +139,8 @@ obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
obj-$(CONFIG_AMD_MEM_ENCRYPT) += sev.o
+obj-$(CONFIG_CFI_CLANG) += cfi.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 62f6b8b7c4a5..5cadcea035e0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -453,6 +453,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
return ret;
i += ret;
+ /*
+ * The compiler is supposed to EMIT an INT3 after every unconditional
+ * JMP instruction due to AMD BTC. However, if the compiler is too old
+ * or SLS isn't enabled, we still need an INT3 after indirect JMPs
+ * even on Intel.
+ */
+ if (op == JMP32_INSN_OPCODE && i < insn->length)
+ bytes[i++] = INT3_INSN_OPCODE;
+
for (; i < insn->length;)
bytes[i++] = BYTES_NOP1;
@@ -1319,22 +1328,23 @@ struct bp_patching_desc {
atomic_t refs;
};
-static struct bp_patching_desc *bp_desc;
+static struct bp_patching_desc bp_desc;
static __always_inline
-struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
+struct bp_patching_desc *try_get_desc(void)
{
- /* rcu_dereference */
- struct bp_patching_desc *desc = __READ_ONCE(*descp);
+ struct bp_patching_desc *desc = &bp_desc;
- if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
+ if (!arch_atomic_inc_not_zero(&desc->refs))
return NULL;
return desc;
}
-static __always_inline void put_desc(struct bp_patching_desc *desc)
+static __always_inline void put_desc(void)
{
+ struct bp_patching_desc *desc = &bp_desc;
+
smp_mb__before_atomic();
arch_atomic_dec(&desc->refs);
}
@@ -1367,15 +1377,15 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
/*
* Having observed our INT3 instruction, we now must observe
- * bp_desc:
+ * bp_desc with non-zero refcount:
*
- * bp_desc = desc INT3
+ * bp_desc.refs = 1 INT3
* WMB RMB
- * write INT3 if (desc)
+ * write INT3 if (bp_desc.refs != 0)
*/
smp_rmb();
- desc = try_get_desc(&bp_desc);
+ desc = try_get_desc();
if (!desc)
return 0;
@@ -1429,7 +1439,7 @@ noinstr int poke_int3_handler(struct pt_regs *regs)
ret = 1;
out_put:
- put_desc(desc);
+ put_desc();
return ret;
}
@@ -1460,18 +1470,20 @@ static int tp_vec_nr;
*/
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
- struct bp_patching_desc desc = {
- .vec = tp,
- .nr_entries = nr_entries,
- .refs = ATOMIC_INIT(1),
- };
unsigned char int3 = INT3_INSN_OPCODE;
unsigned int i;
int do_sync;
lockdep_assert_held(&text_mutex);
- smp_store_release(&bp_desc, &desc); /* rcu_assign_pointer */
+ bp_desc.vec = tp;
+ bp_desc.nr_entries = nr_entries;
+
+ /*
+ * Corresponds to the implicit memory barrier in try_get_desc() to
+ * ensure reading a non-zero refcount provides up to date bp_desc data.
+ */
+ atomic_set_release(&bp_desc.refs, 1);
/*
* Corresponding read barrier in int3 notifier for making sure the
@@ -1559,12 +1571,10 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries
text_poke_sync();
/*
- * Remove and synchronize_rcu(), except we have a very primitive
- * refcount based completion.
+ * Remove and wait for refs to be zero.
*/
- WRITE_ONCE(bp_desc, NULL); /* RCU_INIT_POINTER */
- if (!atomic_dec_and_test(&desc.refs))
- atomic_cond_read_acquire(&desc.refs, !VAL);
+ if (!atomic_dec_and_test(&bp_desc.refs))
+ atomic_cond_read_acquire(&bp_desc.refs, !VAL);
}
static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 194d54eed537..19a0207e529f 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -53,7 +53,7 @@ static u32 *iommu_gatt_base; /* Remapping table */
* of only flushing when an mapping is reused. With it true the GART is
* flushed for every mapping. Problem is that doing the lazy flush seems
* to trigger bugs with some popular PCI cards, in particular 3ware (but
- * has been also also seen with Qlogic at least).
+ * has been also seen with Qlogic at least).
*/
static int iommu_fullflush = 1;
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 7a5630d904b2..4feaa670d578 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -36,7 +36,7 @@
/*
* Using 512M as goal, in case kexec will load kernel_big
* that will do the on-position decompress, and could overlap with
- * with the gart aperture that is used.
+ * the gart aperture that is used.
* Sequence:
* kernel_small
* ==> kexec (with kdump trigger path or gart still enabled)
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 6d303d1d276c..c6876d3ea4b1 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,6 +61,7 @@
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include <asm/irq_regs.h>
+#include <asm/cpu.h>
unsigned int num_processors;
@@ -1751,11 +1752,26 @@ EXPORT_SYMBOL_GPL(x2apic_mode);
enum {
X2APIC_OFF,
- X2APIC_ON,
X2APIC_DISABLED,
+ /* All states below here have X2APIC enabled */
+ X2APIC_ON,
+ X2APIC_ON_LOCKED
};
static int x2apic_state;
+static bool x2apic_hw_locked(void)
+{
+ u64 ia32_cap;
+ u64 msr;
+
+ ia32_cap = x86_read_arch_cap_msr();
+ if (ia32_cap & ARCH_CAP_XAPIC_DISABLE) {
+ rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr);
+ return (msr & LEGACY_XAPIC_DISABLED);
+ }
+ return false;
+}
+
static void __x2apic_disable(void)
{
u64 msr;
@@ -1793,6 +1809,10 @@ static int __init setup_nox2apic(char *str)
apicid);
return 0;
}
+ if (x2apic_hw_locked()) {
+ pr_warn("APIC locked in x2apic mode, can't disable\n");
+ return 0;
+ }
pr_warn("x2apic already enabled.\n");
__x2apic_disable();
}
@@ -1807,10 +1827,18 @@ early_param("nox2apic", setup_nox2apic);
void x2apic_setup(void)
{
/*
- * If x2apic is not in ON state, disable it if already enabled
+ * Try to make the AP's APIC state match that of the BSP, but if the
+ * BSP is unlocked and the AP is locked then there is a state mismatch.
+ * Warn about the mismatch in case a GP fault occurs due to a locked AP
+ * trying to be turned off.
+ */
+ if (x2apic_state != X2APIC_ON_LOCKED && x2apic_hw_locked())
+ pr_warn("x2apic lock mismatch between BSP and AP.\n");
+ /*
+ * If x2apic is not in ON or LOCKED state, disable it if already enabled
* from BIOS.
*/
- if (x2apic_state != X2APIC_ON) {
+ if (x2apic_state < X2APIC_ON) {
__x2apic_disable();
return;
}
@@ -1831,6 +1859,11 @@ static __init void x2apic_disable(void)
if (x2apic_id >= 255)
panic("Cannot disable x2apic, id: %08x\n", x2apic_id);
+ if (x2apic_hw_locked()) {
+ pr_warn("Cannot disable locked x2apic, id: %08x\n", x2apic_id);
+ return;
+ }
+
__x2apic_disable();
register_lapic_address(mp_lapic_addr);
}
@@ -1889,7 +1922,10 @@ void __init check_x2apic(void)
if (x2apic_enabled()) {
pr_info("x2apic: enabled by BIOS, switching to x2apic ops\n");
x2apic_mode = 1;
- x2apic_state = X2APIC_ON;
+ if (x2apic_hw_locked())
+ x2apic_state = X2APIC_ON_LOCKED;
+ else
+ x2apic_state = X2APIC_ON;
} else if (!boot_cpu_has(X86_FEATURE_X2APIC)) {
x2apic_state = X2APIC_DISABLED;
}
diff --git a/arch/x86/kernel/cfi.c b/arch/x86/kernel/cfi.c
new file mode 100644
index 000000000000..8674a5c0c031
--- /dev/null
+++ b/arch/x86/kernel/cfi.c
@@ -0,0 +1,86 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Clang Control Flow Integrity (CFI) support.
+ *
+ * Copyright (C) 2022 Google LLC
+ */
+#include <asm/cfi.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/string.h>
+
+/*
+ * Returns the target address and the expected type when regs->ip points
+ * to a compiler-generated CFI trap.
+ */
+static bool decode_cfi_insn(struct pt_regs *regs, unsigned long *target,
+ u32 *type)
+{
+ char buffer[MAX_INSN_SIZE];
+ struct insn insn;
+ int offset = 0;
+
+ *target = *type = 0;
+
+ /*
+ * The compiler generates the following instruction sequence
+ * for indirect call checks:
+ *
+ *   movl -<id>, %r10d ; 6 bytes
+ * addl -4(%reg), %r10d ; 4 bytes
+ * je .Ltmp1 ; 2 bytes
+ * ud2 ; <- regs->ip
+ * .Ltmp1:
+ *
+ * We can decode the expected type and the target address from the
+ * movl/addl instructions.
+ */
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 12, MAX_INSN_SIZE))
+ return false;
+ if (insn_decode_kernel(&insn, &buffer[offset]))
+ return false;
+ if (insn.opcode.value != 0xBA)
+ return false;
+
+ *type = -(u32)insn.immediate.value;
+
+ if (copy_from_kernel_nofault(buffer, (void *)regs->ip - 6, MAX_INSN_SIZE))
+ return false;
+ if (insn_decode_kernel(&insn, &buffer[offset]))
+ return false;
+ if (insn.opcode.value != 0x3)
+ return false;
+
+ /* Read the target address from the register. */
+ offset = insn_get_modrm_rm_off(&insn, regs);
+ if (offset < 0)
+ return false;
+
+ *target = *(unsigned long *)((void *)regs + offset);
+
+ return true;
+}
+
+/*
+ * Checks if a ud2 trap is because of a CFI failure, and handles the trap
+ * if needed. Returns a bug_trap_type value similarly to report_bug.
+ */
+enum bug_trap_type handle_cfi_failure(struct pt_regs *regs)
+{
+ unsigned long target;
+ u32 type;
+
+ if (!is_cfi_trap(regs->ip))
+ return BUG_TRAP_TYPE_NONE;
+
+ if (!decode_cfi_insn(regs, &target, &type))
+ return report_cfi_failure_noaddr(regs, regs->ip);
+
+ return report_cfi_failure(regs, regs->ip, &target, type);
+}
+
+/*
+ * Ensure that __kcfi_typeid_ symbols are emitted for functions that may
+ * not be indirectly called with all configurations.
+ */
+__ADDRESSABLE(__memcpy)
diff --git a/arch/x86/kernel/cpu/acrn.c b/arch/x86/kernel/cpu/acrn.c
index 23f5f27b5a02..485441b7f030 100644
--- a/arch/x86/kernel/cpu/acrn.c
+++ b/arch/x86/kernel/cpu/acrn.c
@@ -28,6 +28,9 @@ static void __init acrn_init_platform(void)
{
/* Setup the IDT for ACRN hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
+
+ x86_platform.calibrate_tsc = acrn_get_tsc_khz;
+ x86_platform.calibrate_cpu = acrn_get_tsc_khz;
}
static bool acrn_x2apic_available(void)
diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c
index 993697e71854..03851240c3e3 100644
--- a/arch/x86/kernel/cpu/feat_ctl.c
+++ b/arch/x86/kernel/cpu/feat_ctl.c
@@ -1,11 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/tboot.h>
+#include <asm/cpu.h>
#include <asm/cpufeature.h>
#include <asm/msr-index.h>
#include <asm/processor.h>
#include <asm/vmx.h>
-#include "cpu.h"
#undef pr_fmt
#define pr_fmt(fmt) "x86/cpu: " fmt
diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index 717192915f28..8ed341714686 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -29,15 +29,26 @@
void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)
{
struct mce m;
+ int lsb;
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
return;
+ /*
+ * Even if the ->validation_bits are set for address mask,
+ * to be extra safe, check and reject an error radius '0',
+ * and fall back to the default page size.
+ */
+ if (mem_err->validation_bits & CPER_MEM_VALID_PA_MASK)
+ lsb = find_first_bit((void *)&mem_err->physical_addr_mask, PAGE_SHIFT);
+ else
+ lsb = PAGE_SHIFT;
+
mce_setup(&m);
m.bank = -1;
/* Fake a memory read error with unknown channel */
m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | MCI_STATUS_MISCV | 0x9f;
- m.misc = (MCI_MISC_ADDR_PHYS << 6) | PAGE_SHIFT;
+ m.misc = (MCI_MISC_ADDR_PHYS << 6) | lsb;
if (severity >= GHES_SEV_RECOVERABLE)
m.status |= MCI_STATUS_UC;
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index 8b2fcdfa6d31..e7410e98fc1f 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -788,6 +788,7 @@ static int verify_and_add_patch(u8 family, u8 *fw, unsigned int leftover,
kfree(patch);
return -EINVAL;
}
+ patch->size = *patch_size;
mc_hdr = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE);
proc_id = mc_hdr->processor_rev_id;
@@ -869,7 +870,7 @@ load_microcode_amd(bool save, u8 family, const u8 *data, size_t size)
return ret;
memset(amd_ucode_patch, 0, PATCH_MAX_SIZE);
- memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), PATCH_MAX_SIZE));
+ memcpy(amd_ucode_patch, p->data, min_t(u32, p->size, PATCH_MAX_SIZE));
return ret;
}
@@ -924,12 +925,6 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device,
return ret;
}
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- return UCODE_ERROR;
-}
-
static void microcode_fini_cpu_amd(int cpu)
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
@@ -938,7 +933,6 @@ static void microcode_fini_cpu_amd(int cpu)
}
static struct microcode_ops microcode_amd_ops = {
- .request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_amd,
.collect_cpu_info = collect_cpu_info_amd,
.apply_microcode = apply_microcode_amd,
diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c
index ad57e0e4d674..6a41cee242f6 100644
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -491,7 +491,7 @@ wait_for_siblings:
*/
static int microcode_reload_late(void)
{
- int ret;
+ int old = boot_cpu_data.microcode, ret;
pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
pr_err("You should switch to early loading, if possible.\n");
@@ -503,7 +503,8 @@ static int microcode_reload_late(void)
if (ret == 0)
microcode_check();
- pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode);
+ pr_info("Reload completed, microcode revision: 0x%x -> 0x%x\n",
+ old, boot_cpu_data.microcode);
return ret;
}
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 025c8f0cd948..1fcbd671f1df 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -916,24 +916,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device,
return ret;
}
-static enum ucode_state
-request_microcode_user(int cpu, const void __user *buf, size_t size)
-{
- struct iov_iter iter;
- struct iovec iov;
-
- if (is_blacklisted(cpu))
- return UCODE_NFOUND;
-
- iov.iov_base = (void __user *)buf;
- iov.iov_len = size;
- iov_iter_init(&iter, WRITE, &iov, 1, size);
-
- return generic_load_microcode(cpu, &iter);
-}
-
static struct microcode_ops microcode_intel_ops = {
- .request_microcode_user = request_microcode_user,
.request_microcode_fw = request_microcode_fw,
.collect_cpu_info = collect_cpu_info,
.apply_microcode = apply_microcode_intel,
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index bb1c3f5f60c8..de62b0b87ced 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -147,7 +147,6 @@ static inline void cache_alloc_hsw_probe(void)
r->cache.shareable_bits = 0xc0000;
r->cache.min_cbm_bits = 2;
r->alloc_capable = true;
- r->alloc_enabled = true;
rdt_alloc_capable = true;
}
@@ -211,7 +210,6 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
thread_throttle_mode_init();
r->alloc_capable = true;
- r->alloc_enabled = true;
return true;
}
@@ -242,7 +240,6 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
r->data_width = 4;
r->alloc_capable = true;
- r->alloc_enabled = true;
return true;
}
@@ -261,7 +258,6 @@ static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
r->cache.shareable_bits = ebx & r->default_ctrl;
r->data_width = (r->cache.cbm_len + 3) / 4;
r->alloc_capable = true;
- r->alloc_enabled = true;
}
static void rdt_get_cdp_config(int level)
@@ -300,7 +296,7 @@ mba_wrmsr_amd(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
* that can be written to QOS_MSRs.
* There are currently no SKUs which support non linear delay values.
*/
-u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
+static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r)
{
if (r->membw.delay_linear)
return MAX_MBA_BW - bw;
@@ -401,7 +397,7 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
return NULL;
}
-void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
+static void setup_default_ctrlval(struct rdt_resource *r, u32 *dc)
{
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
int i;
@@ -410,12 +406,17 @@ void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm)
* Initialize the Control MSRs to having no control.
* For Cache Allocation: Set all bits in cbm
* For Memory Allocation: Set b/w requested to 100%
- * and the bandwidth in MBps to U32_MAX
*/
- for (i = 0; i < hw_res->num_closid; i++, dc++, dm++) {
+ for (i = 0; i < hw_res->num_closid; i++, dc++)
*dc = r->default_ctrl;
- *dm = MBA_MAX_MBPS;
- }
+}
+
+static void domain_free(struct rdt_hw_domain *hw_dom)
+{
+ kfree(hw_dom->arch_mbm_total);
+ kfree(hw_dom->arch_mbm_local);
+ kfree(hw_dom->ctrl_val);
+ kfree(hw_dom);
}
static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
@@ -423,23 +424,15 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
struct msr_param m;
- u32 *dc, *dm;
+ u32 *dc;
dc = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->ctrl_val),
GFP_KERNEL);
if (!dc)
return -ENOMEM;
- dm = kmalloc_array(hw_res->num_closid, sizeof(*hw_dom->mbps_val),
- GFP_KERNEL);
- if (!dm) {
- kfree(dc);
- return -ENOMEM;
- }
-
hw_dom->ctrl_val = dc;
- hw_dom->mbps_val = dm;
- setup_default_ctrlval(r, dc, dm);
+ setup_default_ctrlval(r, dc);
m.low = 0;
m.high = hw_res->num_closid;
@@ -447,39 +440,31 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
return 0;
}
-static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+/**
+ * arch_domain_mbm_alloc() - Allocate arch private storage for the MBM counters
+ * @num_rmid: The size of the MBM counter array
+ * @hw_dom: The domain that owns the allocated arrays
+ */
+static int arch_domain_mbm_alloc(u32 num_rmid, struct rdt_hw_domain *hw_dom)
{
size_t tsize;
- if (is_llc_occupancy_enabled()) {
- d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
- if (!d->rmid_busy_llc)
- return -ENOMEM;
- INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
- }
if (is_mbm_total_enabled()) {
- tsize = sizeof(*d->mbm_total);
- d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
- if (!d->mbm_total) {
- bitmap_free(d->rmid_busy_llc);
+ tsize = sizeof(*hw_dom->arch_mbm_total);
+ hw_dom->arch_mbm_total = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_total)
return -ENOMEM;
- }
}
if (is_mbm_local_enabled()) {
- tsize = sizeof(*d->mbm_local);
- d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
- if (!d->mbm_local) {
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
+ tsize = sizeof(*hw_dom->arch_mbm_local);
+ hw_dom->arch_mbm_local = kcalloc(num_rmid, tsize, GFP_KERNEL);
+ if (!hw_dom->arch_mbm_local) {
+ kfree(hw_dom->arch_mbm_total);
+ hw_dom->arch_mbm_total = NULL;
return -ENOMEM;
}
}
- if (is_mbm_enabled()) {
- INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
- mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
- }
-
return 0;
}
@@ -502,6 +487,7 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
struct list_head *add_pos = NULL;
struct rdt_hw_domain *hw_dom;
struct rdt_domain *d;
+ int err;
d = rdt_find_domain(r, id, &add_pos);
if (IS_ERR(d)) {
@@ -527,25 +513,22 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
rdt_domain_reconfigure_cdp(r);
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
- kfree(hw_dom);
+ domain_free(hw_dom);
return;
}
- if (r->mon_capable && domain_setup_mon_state(r, d)) {
- kfree(hw_dom->ctrl_val);
- kfree(hw_dom->mbps_val);
- kfree(hw_dom);
+ if (r->mon_capable && arch_domain_mbm_alloc(r->num_rmid, hw_dom)) {
+ domain_free(hw_dom);
return;
}
list_add_tail(&d->list, add_pos);
- /*
- * If resctrl is mounted, add
- * per domain monitor data directories.
- */
- if (static_branch_unlikely(&rdt_mon_enable_key))
- mkdir_mondata_subdir_allrdtgrp(r, d);
+ err = resctrl_online_domain(r, d);
+ if (err) {
+ list_del(&d->list);
+ domain_free(hw_dom);
+ }
}
static void domain_remove_cpu(int cpu, struct rdt_resource *r)
@@ -563,27 +546,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
cpumask_clear_cpu(cpu, &d->cpu_mask);
if (cpumask_empty(&d->cpu_mask)) {
- /*
- * If resctrl is mounted, remove all the
- * per domain monitor data directories.
- */
- if (static_branch_unlikely(&rdt_mon_enable_key))
- rmdir_mondata_subdir_allrdtgrp(r, d->id);
+ resctrl_offline_domain(r, d);
list_del(&d->list);
- if (r->mon_capable && is_mbm_enabled())
- cancel_delayed_work(&d->mbm_over);
- if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
- /*
- * When a package is going down, forcefully
- * decrement rmid->ebusy. There is no way to know
- * that the L3 was flushed and hence may lead to
- * incorrect counts in rare scenarios, but leaving
- * the RMID as busy creates RMID leaks if the
- * package never comes back.
- */
- __check_limbo(d, true);
- cancel_delayed_work(&d->cqm_limbo);
- }
/*
* rdt_domain "d" is going to be freed below, so clear
@@ -591,13 +555,8 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
*/
if (d->plr)
d->plr->d = NULL;
+ domain_free(hw_dom);
- kfree(hw_dom->ctrl_val);
- kfree(hw_dom->mbps_val);
- bitmap_free(d->rmid_busy_llc);
- kfree(d->mbm_total);
- kfree(d->mbm_local);
- kfree(hw_dom);
return;
}
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index 87666275eed9..1dafbdc5ac31 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -61,6 +61,7 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
struct rdt_domain *d)
{
struct resctrl_staged_config *cfg;
+ u32 closid = data->rdtgrp->closid;
struct rdt_resource *r = s->res;
unsigned long bw_val;
@@ -72,6 +73,12 @@ int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s,
if (!bw_validate(data->buf, &bw_val, r))
return -EINVAL;
+
+ if (is_mba_sc(r)) {
+ d->mbps_val[closid] = bw_val;
+ return 0;
+ }
+
cfg->new_ctrl = bw_val;
cfg->have_new_ctrl = true;
@@ -261,14 +268,13 @@ static u32 get_config_index(u32 closid, enum resctrl_conf_type type)
static bool apply_config(struct rdt_hw_domain *hw_dom,
struct resctrl_staged_config *cfg, u32 idx,
- cpumask_var_t cpu_mask, bool mba_sc)
+ cpumask_var_t cpu_mask)
{
struct rdt_domain *dom = &hw_dom->d_resctrl;
- u32 *dc = !mba_sc ? hw_dom->ctrl_val : hw_dom->mbps_val;
- if (cfg->new_ctrl != dc[idx]) {
+ if (cfg->new_ctrl != hw_dom->ctrl_val[idx]) {
cpumask_set_cpu(cpumask_any(&dom->cpu_mask), cpu_mask);
- dc[idx] = cfg->new_ctrl;
+ hw_dom->ctrl_val[idx] = cfg->new_ctrl;
return true;
}
@@ -276,6 +282,27 @@ static bool apply_config(struct rdt_hw_domain *hw_dom,
return false;
}
+int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_domain *d,
+ u32 closid, enum resctrl_conf_type t, u32 cfg_val)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ u32 idx = get_config_index(closid, t);
+ struct msr_param msr_param;
+
+ if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+ return -EINVAL;
+
+ hw_dom->ctrl_val[idx] = cfg_val;
+
+ msr_param.res = r;
+ msr_param.low = idx;
+ msr_param.high = idx + 1;
+ hw_res->msr_update(d, &msr_param, r);
+
+ return 0;
+}
+
int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
@@ -284,14 +311,12 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
enum resctrl_conf_type t;
cpumask_var_t cpu_mask;
struct rdt_domain *d;
- bool mba_sc;
int cpu;
u32 idx;
if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL))
return -ENOMEM;
- mba_sc = is_mba_sc(r);
msr_param.res = NULL;
list_for_each_entry(d, &r->domains, list) {
hw_dom = resctrl_to_arch_dom(d);
@@ -301,7 +326,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
continue;
idx = get_config_index(closid, t);
- if (!apply_config(hw_dom, cfg, idx, cpu_mask, mba_sc))
+ if (!apply_config(hw_dom, cfg, idx, cpu_mask))
continue;
if (!msr_param.res) {
@@ -315,11 +340,7 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid)
}
}
- /*
- * Avoid writing the control msr with control values when
- * MBA software controller is enabled
- */
- if (cpumask_empty(cpu_mask) || mba_sc)
+ if (cpumask_empty(cpu_mask))
goto done;
cpu = get_cpu();
/* Update resource control msr on this CPU if it's in cpu_mask. */
@@ -406,6 +427,14 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
+
+ /*
+ * Writes to mba_sc resources update the software controller,
+ * not the control MSR.
+ */
+ if (is_mba_sc(r))
+ continue;
+
ret = resctrl_arch_update_domains(r, rdtgrp->closid);
if (ret)
goto out;
@@ -433,9 +462,7 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_domain *d,
struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
u32 idx = get_config_index(closid, type);
- if (!is_mba_sc(r))
- return hw_dom->ctrl_val[idx];
- return hw_dom->mbps_val[idx];
+ return hw_dom->ctrl_val[idx];
}
static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid)
@@ -450,8 +477,12 @@ static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int clo
if (sep)
seq_puts(s, ";");
- ctrl_val = resctrl_arch_get_config(r, dom, closid,
- schema->conf_type);
+ if (is_mba_sc(r))
+ ctrl_val = dom->mbps_val[closid];
+ else
+ ctrl_val = resctrl_arch_get_config(r, dom, closid,
+ schema->conf_type);
+
seq_printf(s, r->format_str, dom->id, max_data_width,
ctrl_val);
sep = true;
@@ -518,7 +549,6 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
int rdtgroup_mondata_show(struct seq_file *m, void *arg)
{
struct kernfs_open_file *of = m->private;
- struct rdt_hw_resource *hw_res;
u32 resid, evtid, domid;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
@@ -538,8 +568,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
domid = md.u.domid;
evtid = md.u.evtid;
- hw_res = &rdt_resources_all[resid];
- r = &hw_res->r_resctrl;
+ r = &rdt_resources_all[resid].r_resctrl;
d = rdt_find_domain(r, domid, NULL);
if (IS_ERR_OR_NULL(d)) {
ret = -ENOENT;
@@ -548,12 +577,12 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
mon_event_read(&rr, r, d, rdtgrp, evtid, false);
- if (rr.val & RMID_VAL_ERROR)
+ if (rr.err == -EIO)
seq_puts(m, "Error\n");
- else if (rr.val & RMID_VAL_UNAVAIL)
+ else if (rr.err == -EINVAL)
seq_puts(m, "Unavailable\n");
else
- seq_printf(m, "%llu\n", rr.val * hw_res->mon_scale);
+ seq_printf(m, "%llu\n", rr.val);
out:
rdtgroup_kn_unlock(of->kn);
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 1d647188a43b..5f7128686cfd 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -22,21 +22,12 @@
#define L2_QOS_CDP_ENABLE 0x01ULL
-/*
- * Event IDs are used to program IA32_QM_EVTSEL before reading event
- * counter from IA32_QM_CTR
- */
-#define QOS_L3_OCCUP_EVENT_ID 0x01
-#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02
-#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03
-
#define CQM_LIMBOCHECK_INTERVAL 1000
#define MBM_CNTR_WIDTH_BASE 24
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
-#define MBA_MAX_MBPS U32_MAX
#define MAX_MBA_BW_AMD 0x800
#define MBM_CNTR_WIDTH_OFFSET_AMD 20
@@ -74,7 +65,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
* @list: entry in &rdt_resource->evt_list
*/
struct mon_evt {
- u32 evtid;
+ enum resctrl_event_id evtid;
char *name;
struct list_head list;
};
@@ -91,9 +82,9 @@ struct mon_evt {
union mon_data_bits {
void *priv;
struct {
- unsigned int rid : 10;
- unsigned int evtid : 8;
- unsigned int domid : 14;
+ unsigned int rid : 10;
+ enum resctrl_event_id evtid : 8;
+ unsigned int domid : 14;
} u;
};
@@ -101,12 +92,12 @@ struct rmid_read {
struct rdtgroup *rgrp;
struct rdt_resource *r;
struct rdt_domain *d;
- int evtid;
+ enum resctrl_event_id evtid;
bool first;
+ int err;
u64 val;
};
-extern unsigned int resctrl_cqm_threshold;
extern bool rdt_alloc_capable;
extern bool rdt_mon_capable;
extern unsigned int rdt_mon_features;
@@ -288,35 +279,45 @@ struct rftype {
/**
* struct mbm_state - status for each MBM counter in each domain
- * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
- * @prev_msr: Value of IA32_QM_CTR for this RMID last time we read it
- * @prev_bw_msr:Value of previous IA32_QM_CTR for bandwidth counting
+ * @prev_bw_bytes: Previous bytes value read for bandwidth calculation
* @prev_bw: The most recent bandwidth in MBps
* @delta_bw: Difference between the current and previous bandwidth
* @delta_comp: Indicates whether to compute the delta_bw
*/
struct mbm_state {
- u64 chunks;
- u64 prev_msr;
- u64 prev_bw_msr;
+ u64 prev_bw_bytes;
u32 prev_bw;
u32 delta_bw;
bool delta_comp;
};
/**
+ * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s
+ * return value.
+ * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes)
+ * @prev_msr: Value of IA32_QM_CTR last time it was read for the RMID used to
+ * find this struct.
+ */
+struct arch_mbm_state {
+ u64 chunks;
+ u64 prev_msr;
+};
+
+/**
* struct rdt_hw_domain - Arch private attributes of a set of CPUs that share
* a resource
* @d_resctrl: Properties exposed to the resctrl file system
* @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID)
- * @mbps_val: When mba_sc is enabled, this holds the bandwidth in MBps
+ * @arch_mbm_total: arch private state for MBM total bandwidth
+ * @arch_mbm_local: arch private state for MBM local bandwidth
*
* Members of this structure are accessed via helpers that provide abstraction.
*/
struct rdt_hw_domain {
struct rdt_domain d_resctrl;
u32 *ctrl_val;
- u32 *mbps_val;
+ struct arch_mbm_state *arch_mbm_total;
+ struct arch_mbm_state *arch_mbm_local;
};
static inline struct rdt_hw_domain *resctrl_to_arch_dom(struct rdt_domain *r)
@@ -459,14 +460,6 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable);
for_each_rdt_resource(r) \
if (r->mon_capable)
-#define for_each_alloc_enabled_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->alloc_enabled)
-
-#define for_each_mon_enabled_rdt_resource(r) \
- for_each_rdt_resource(r) \
- if (r->mon_enabled)
-
/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
union cpuid_0x10_1_eax {
struct {
@@ -530,10 +523,6 @@ void free_rmid(u32 rmid);
int rdt_get_mon_l3_config(struct rdt_resource *r);
void mon_event_count(void *info);
int rdtgroup_mondata_show(struct seq_file *m, void *arg);
-void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- unsigned int dom_id);
-void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_domain *d);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_domain *d, struct rdtgroup *rdtgrp,
int evtid, int first);
@@ -542,8 +531,6 @@ void mbm_setup_overflow_handler(struct rdt_domain *dom,
void mbm_handle_overflow(struct work_struct *work);
void __init intel_rdt_mbm_apply_quirk(void);
bool is_mba_sc(struct rdt_resource *r);
-void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm);
-u32 delay_bw_map(unsigned long bw, struct rdt_resource *r);
void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
void cqm_handle_limbo(struct work_struct *work);
bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index eaf25a234ff5..efe0c30d3a12 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -16,8 +16,12 @@
*/
#include <linux/module.h>
+#include <linux/sizes.h>
#include <linux/slab.h>
+
#include <asm/cpu_device_id.h>
+#include <asm/resctrl.h>
+
#include "internal.h"
struct rmid_entry {
@@ -37,8 +41,8 @@ static LIST_HEAD(rmid_free_lru);
* @rmid_limbo_count count of currently unused but (potentially)
* dirty RMIDs.
* This counts RMIDs that no one is currently using but that
- * may have a occupancy value > intel_cqm_threshold. User can change
- * the threshold occupancy value.
+ * may have a occupancy value > resctrl_rmid_realloc_threshold. User can
+ * change the threshold occupancy value.
*/
static unsigned int rmid_limbo_count;
@@ -59,10 +63,15 @@ bool rdt_mon_capable;
unsigned int rdt_mon_features;
/*
- * This is the threshold cache occupancy at which we will consider an
+ * This is the threshold cache occupancy in bytes at which we will consider an
* RMID available for re-allocation.
*/
-unsigned int resctrl_cqm_threshold;
+unsigned int resctrl_rmid_realloc_threshold;
+
+/*
+ * This is the maximum value for the reallocation threshold, in bytes.
+ */
+unsigned int resctrl_rmid_realloc_limit;
#define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5))
@@ -137,9 +146,54 @@ static inline struct rmid_entry *__rmid_entry(u32 rmid)
return entry;
}
-static u64 __rmid_read(u32 rmid, u32 eventid)
+static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_domain *hw_dom,
+ u32 rmid,
+ enum resctrl_event_id eventid)
{
- u64 val;
+ switch (eventid) {
+ case QOS_L3_OCCUP_EVENT_ID:
+ return NULL;
+ case QOS_L3_MBM_TOTAL_EVENT_ID:
+ return &hw_dom->arch_mbm_total[rmid];
+ case QOS_L3_MBM_LOCAL_EVENT_ID:
+ return &hw_dom->arch_mbm_local[rmid];
+ }
+
+ /* Never expect to get here */
+ WARN_ON_ONCE(1);
+
+ return NULL;
+}
+
+void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_domain *d,
+ u32 rmid, enum resctrl_event_id eventid)
+{
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct arch_mbm_state *am;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am)
+ memset(am, 0, sizeof(*am));
+}
+
+static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
+{
+ u64 shift = 64 - width, chunks;
+
+ chunks = (cur_msr << shift) - (prev_msr << shift);
+ return chunks >> shift;
+}
+
+int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_domain *d,
+ u32 rmid, enum resctrl_event_id eventid, u64 *val)
+{
+ struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
+ struct rdt_hw_domain *hw_dom = resctrl_to_arch_dom(d);
+ struct arch_mbm_state *am;
+ u64 msr_val, chunks;
+
+ if (!cpumask_test_cpu(smp_processor_id(), &d->cpu_mask))
+ return -EINVAL;
/*
* As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured
@@ -150,16 +204,26 @@ static u64 __rmid_read(u32 rmid, u32 eventid)
* are error bits.
*/
wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
- rdmsrl(MSR_IA32_QM_CTR, val);
-
- return val;
-}
+ rdmsrl(MSR_IA32_QM_CTR, msr_val);
+
+ if (msr_val & RMID_VAL_ERROR)
+ return -EIO;
+ if (msr_val & RMID_VAL_UNAVAIL)
+ return -EINVAL;
+
+ am = get_arch_mbm_state(hw_dom, rmid, eventid);
+ if (am) {
+ am->chunks += mbm_overflow_count(am->prev_msr, msr_val,
+ hw_res->mbm_width);
+ chunks = get_corrected_mbm_count(rmid, am->chunks);
+ am->prev_msr = msr_val;
+ } else {
+ chunks = msr_val;
+ }
-static bool rmid_dirty(struct rmid_entry *entry)
-{
- u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
+ *val = chunks * hw_res->mon_scale;
- return val >= resctrl_cqm_threshold;
+ return 0;
}
/*
@@ -170,11 +234,11 @@ static bool rmid_dirty(struct rmid_entry *entry)
*/
void __check_limbo(struct rdt_domain *d, bool force_free)
{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rmid_entry *entry;
- struct rdt_resource *r;
u32 crmid = 1, nrmid;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ bool rmid_dirty;
+ u64 val = 0;
/*
* Skip RMID 0 and start from RMID 1 and check all the RMIDs that
@@ -188,7 +252,15 @@ void __check_limbo(struct rdt_domain *d, bool force_free)
break;
entry = __rmid_entry(nrmid);
- if (force_free || !rmid_dirty(entry)) {
+
+ if (resctrl_arch_rmid_read(r, d, entry->rmid,
+ QOS_L3_OCCUP_EVENT_ID, &val)) {
+ rmid_dirty = true;
+ } else {
+ rmid_dirty = (val >= resctrl_rmid_realloc_threshold);
+ }
+
+ if (force_free || !rmid_dirty) {
clear_bit(entry->rmid, d->rmid_busy_llc);
if (!--entry->busy) {
rmid_limbo_count--;
@@ -227,19 +299,19 @@ int alloc_rmid(void)
static void add_rmid_to_limbo(struct rmid_entry *entry)
{
- struct rdt_resource *r;
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
struct rdt_domain *d;
- int cpu;
- u64 val;
-
- r = &rdt_resources_all[RDT_RESOURCE_L3].r_resctrl;
+ int cpu, err;
+ u64 val = 0;
entry->busy = 0;
cpu = get_cpu();
list_for_each_entry(d, &r->domains, list) {
if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
- val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
- if (val <= resctrl_cqm_threshold)
+ err = resctrl_arch_rmid_read(r, d, entry->rmid,
+ QOS_L3_OCCUP_EVENT_ID,
+ &val);
+ if (err || val <= resctrl_rmid_realloc_threshold)
continue;
}
@@ -277,24 +349,18 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
-static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
+static int __mon_event_count(u32 rmid, struct rmid_read *rr)
{
- u64 shift = 64 - width, chunks;
+ struct mbm_state *m;
+ u64 tval = 0;
- chunks = (cur_msr << shift) - (prev_msr << shift);
- return chunks >> shift;
-}
+ if (rr->first)
+ resctrl_arch_reset_rmid(rr->r, rr->d, rmid, rr->evtid);
-static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
-{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
- struct mbm_state *m;
- u64 chunks, tval;
+ rr->err = resctrl_arch_rmid_read(rr->r, rr->d, rmid, rr->evtid, &tval);
+ if (rr->err)
+ return rr->err;
- tval = __rmid_read(rmid, rr->evtid);
- if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
- return tval;
- }
switch (rr->evtid) {
case QOS_L3_OCCUP_EVENT_ID:
rr->val += tval;
@@ -308,48 +374,47 @@ static u64 __mon_event_count(u32 rmid, struct rmid_read *rr)
default:
/*
* Code would never reach here because an invalid
- * event id would fail the __rmid_read.
+ * event id would fail in resctrl_arch_rmid_read().
*/
- return RMID_VAL_ERROR;
+ return -EINVAL;
}
if (rr->first) {
memset(m, 0, sizeof(struct mbm_state));
- m->prev_bw_msr = m->prev_msr = tval;
return 0;
}
- chunks = mbm_overflow_count(m->prev_msr, tval, hw_res->mbm_width);
- m->chunks += chunks;
- m->prev_msr = tval;
-
- rr->val += get_corrected_mbm_count(rmid, m->chunks);
+ rr->val += tval;
return 0;
}
/*
+ * mbm_bw_count() - Update bw count from values previously read by
+ * __mon_event_count().
+ * @rmid: The rmid used to identify the cached mbm_state.
+ * @rr: The struct rmid_read populated by __mon_event_count().
+ *
* Supporting function to calculate the memory bandwidth
- * and delta bandwidth in MBps.
+ * and delta bandwidth in MBps. The chunks value previously read by
+ * __mon_event_count() is compared with the chunks value from the previous
+ * invocation. This must be called once per second to maintain values in MBps.
*/
static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
{
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(rr->r);
struct mbm_state *m = &rr->d->mbm_local[rmid];
- u64 tval, cur_bw, chunks;
+ u64 cur_bw, bytes, cur_bytes;
- tval = __rmid_read(rmid, rr->evtid);
- if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
- return;
+ cur_bytes = rr->val;
+ bytes = cur_bytes - m->prev_bw_bytes;
+ m->prev_bw_bytes = cur_bytes;
- chunks = mbm_overflow_count(m->prev_bw_msr, tval, hw_res->mbm_width);
- cur_bw = (get_corrected_mbm_count(rmid, chunks) * hw_res->mon_scale) >> 20;
+ cur_bw = bytes / SZ_1M;
if (m->delta_comp)
m->delta_bw = abs(cur_bw - m->prev_bw);
m->delta_comp = false;
m->prev_bw = cur_bw;
- m->prev_bw_msr = tval;
}
/*
@@ -361,11 +426,11 @@ void mon_event_count(void *info)
struct rdtgroup *rdtgrp, *entry;
struct rmid_read *rr = info;
struct list_head *head;
- u64 ret_val;
+ int ret;
rdtgrp = rr->rgrp;
- ret_val = __mon_event_count(rdtgrp->mon.rmid, rr);
+ ret = __mon_event_count(rdtgrp->mon.rmid, rr);
/*
* For Ctrl groups read data from child monitor groups and
@@ -377,13 +442,17 @@ void mon_event_count(void *info)
if (rdtgrp->type == RDTCTRL_GROUP) {
list_for_each_entry(entry, head, mon.crdtgrp_list) {
if (__mon_event_count(entry->mon.rmid, rr) == 0)
- ret_val = 0;
+ ret = 0;
}
}
- /* Report error if none of rmid_reads are successful */
- if (ret_val)
- rr->val = ret_val;
+ /*
+ * __mon_event_count() calls for newly created monitor groups may
+ * report -EINVAL/Unavailable if the monitor hasn't seen any traffic.
+ * Discard error if any of the monitor event reads succeeded.
+ */
+ if (ret == 0)
+ rr->err = 0;
}
/*
@@ -420,10 +489,8 @@ void mon_event_count(void *info)
*/
static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
{
- u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
+ u32 closid, rmid, cur_msr_val, new_msr_val;
struct mbm_state *pmbm_data, *cmbm_data;
- struct rdt_hw_resource *hw_r_mba;
- struct rdt_hw_domain *hw_dom_mba;
u32 cur_bw, delta_bw, user_bw;
struct rdt_resource *r_mba;
struct rdt_domain *dom_mba;
@@ -433,8 +500,8 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
if (!is_mbm_local_enabled())
return;
- hw_r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
- r_mba = &hw_r_mba->r_resctrl;
+ r_mba = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+
closid = rgrp->closid;
rmid = rgrp->mon.rmid;
pmbm_data = &dom_mbm->mbm_local[rmid];
@@ -444,16 +511,13 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
pr_warn_once("Failure to get domain for MBA update\n");
return;
}
- hw_dom_mba = resctrl_to_arch_dom(dom_mba);
cur_bw = pmbm_data->prev_bw;
- user_bw = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
+ user_bw = dom_mba->mbps_val[closid];
delta_bw = pmbm_data->delta_bw;
- /*
- * resctrl_arch_get_config() chooses the mbps/ctrl value to return
- * based on is_mba_sc(). For now, reach into the hw_dom.
- */
- cur_msr_val = hw_dom_mba->ctrl_val[closid];
+
+ /* MBA resource doesn't support CDP */
+ cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE);
/*
* For Ctrl groups read data from child monitor groups.
@@ -488,9 +552,7 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
return;
}
- cur_msr = hw_r_mba->msr_base + closid;
- wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
- hw_dom_mba->ctrl_val[closid] = new_msr_val;
+ resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val);
/*
* Delta values are updated dynamically package wise for each
@@ -523,10 +585,12 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
*/
if (is_mbm_total_enabled()) {
rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
+ rr.val = 0;
__mon_event_count(rmid, &rr);
}
if (is_mbm_local_enabled()) {
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
+ rr.val = 0;
__mon_event_count(rmid, &rr);
/*
@@ -686,9 +750,10 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
- unsigned int cl_size = boot_cpu_data.x86_cache_size;
+ unsigned int threshold;
int ret;
+ resctrl_rmid_realloc_limit = boot_cpu_data.x86_cache_size * 1024;
hw_res->mon_scale = boot_cpu_data.x86_cache_occ_scale;
r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
hw_res->mbm_width = MBM_CNTR_WIDTH_BASE;
@@ -705,10 +770,14 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
*
* For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
*/
- resctrl_cqm_threshold = cl_size * 1024 / r->num_rmid;
+ threshold = resctrl_rmid_realloc_limit / r->num_rmid;
- /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
- resctrl_cqm_threshold /= hw_res->mon_scale;
+ /*
+ * Because num_rmid may not be a power of two, round the value
+ * to the nearest multiple of hw_res->mon_scale so it matches a
+ * value the hardware will measure. mon_scale may not be a power of 2.
+ */
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(threshold);
ret = dom_data_init(r);
if (ret)
@@ -717,7 +786,6 @@ int rdt_get_mon_l3_config(struct rdt_resource *r)
l3_mon_evt_init(r);
r->mon_capable = true;
- r->mon_enabled = true;
return 0;
}
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index db813f819ad6..d961ae3ed96e 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -420,6 +420,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
struct pseudo_lock_region *plr = rdtgrp->plr;
u32 rmid_p, closid_p;
unsigned long i;
+ u64 saved_msr;
#ifdef CONFIG_KASAN
/*
* The registers used for local register variables are also used
@@ -463,6 +464,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
* the buffer and evict pseudo-locked memory read earlier from the
* cache.
*/
+ saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL);
__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
closid_p = this_cpu_read(pqr_state.cur_closid);
rmid_p = this_cpu_read(pqr_state.cur_rmid);
@@ -514,7 +516,7 @@ static int pseudo_lock_fn(void *_rdtgrp)
__wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
/* Re-enable the hardware prefetcher(s) */
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr);
local_irq_enable();
plr->thread_done = 1;
@@ -835,7 +837,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
* First determine which cpus have pseudo-locked regions
* associated with them.
*/
- for_each_alloc_enabled_rdt_resource(r) {
+ for_each_alloc_capable_rdt_resource(r) {
list_for_each_entry(d_i, &r->domains, list) {
if (d_i->plr)
cpumask_or(cpu_with_psl, cpu_with_psl,
@@ -871,6 +873,7 @@ bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
static int measure_cycles_lat_fn(void *_plr)
{
struct pseudo_lock_region *plr = _plr;
+ u32 saved_low, saved_high;
unsigned long i;
u64 start, end;
void *mem_r;
@@ -879,6 +882,7 @@ static int measure_cycles_lat_fn(void *_plr)
/*
* Disable hardware prefetchers.
*/
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
mem_r = READ_ONCE(plr->kmem);
/*
@@ -895,7 +899,7 @@ static int measure_cycles_lat_fn(void *_plr)
end = rdtsc_ordered();
trace_pseudo_lock_mem_latency((u32)(end - start));
}
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
local_irq_enable();
plr->thread_done = 1;
wake_up_interruptible(&plr->lock_thread_wq);
@@ -940,6 +944,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
u64 hits_before = 0, hits_after = 0, miss_before = 0, miss_after = 0;
struct perf_event *miss_event, *hit_event;
int hit_pmcnum, miss_pmcnum;
+ u32 saved_low, saved_high;
unsigned int line_size;
unsigned int size;
unsigned long i;
@@ -973,6 +978,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
/*
* Disable hardware prefetchers.
*/
+ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
/* Initialize rest of local variables */
@@ -1031,7 +1037,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr,
*/
rmb();
/* Re-enable hardware prefetchers */
- wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
+ wrmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high);
local_irq_enable();
out_hit:
perf_event_release_kernel(hit_event);
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index f276aff521e8..e5a48f05e787 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1030,10 +1030,7 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
static int max_threshold_occ_show(struct kernfs_open_file *of,
struct seq_file *seq, void *v)
{
- struct rdt_resource *r = of->kn->parent->priv;
- struct rdt_hw_resource *hw_res = resctrl_to_arch_res(r);
-
- seq_printf(seq, "%u\n", resctrl_cqm_threshold * hw_res->mon_scale);
+ seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold);
return 0;
}
@@ -1055,7 +1052,6 @@ static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of,
static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct rdt_hw_resource *hw_res;
unsigned int bytes;
int ret;
@@ -1063,11 +1059,10 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
if (ret)
return ret;
- if (bytes > (boot_cpu_data.x86_cache_size * 1024))
+ if (bytes > resctrl_rmid_realloc_limit)
return -EINVAL;
- hw_res = resctrl_to_arch_res(of->kn->parent->priv);
- resctrl_cqm_threshold = bytes / hw_res->mon_scale;
+ resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes);
return nbytes;
}
@@ -1356,11 +1351,13 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
struct seq_file *s, void *v)
{
struct resctrl_schema *schema;
+ enum resctrl_conf_type type;
struct rdtgroup *rdtgrp;
struct rdt_resource *r;
struct rdt_domain *d;
unsigned int size;
int ret = 0;
+ u32 closid;
bool sep;
u32 ctrl;
@@ -1386,8 +1383,11 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
goto out;
}
+ closid = rdtgrp->closid;
+
list_for_each_entry(schema, &resctrl_schema_all, list) {
r = schema->res;
+ type = schema->conf_type;
sep = false;
seq_printf(s, "%*s:", max_name_width, schema->name);
list_for_each_entry(d, &r->domains, list) {
@@ -1396,9 +1396,12 @@ static int rdtgroup_size_show(struct kernfs_open_file *of,
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
size = 0;
} else {
- ctrl = resctrl_arch_get_config(r, d,
- rdtgrp->closid,
- schema->conf_type);
+ if (is_mba_sc(r))
+ ctrl = d->mbps_val[closid];
+ else
+ ctrl = resctrl_arch_get_config(r, d,
+ closid,
+ type);
if (r->rid == RDT_RESOURCE_MBA)
size = ctrl;
else
@@ -1756,7 +1759,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
if (ret)
goto out_destroy;
- /* loop over enabled controls, these are all alloc_enabled */
+ /* loop over enabled controls, these are all alloc_capable */
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
fflags = r->fflags | RF_CTRL_INFO;
@@ -1765,7 +1768,7 @@ static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
goto out_destroy;
}
- for_each_mon_enabled_rdt_resource(r) {
+ for_each_mon_capable_rdt_resource(r) {
fflags = r->fflags | RF_MON_INFO;
sprintf(name, "%s_MON", r->name);
ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
@@ -1889,26 +1892,61 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
l3_qos_cfg_update(&hw_res->cdp_enabled);
}
+static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_domain *d)
+{
+ u32 num_closid = resctrl_arch_get_num_closid(r);
+ int cpu = cpumask_any(&d->cpu_mask);
+ int i;
+
+ d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!d->mbps_val)
+ return -ENOMEM;
+
+ for (i = 0; i < num_closid; i++)
+ d->mbps_val[i] = MBA_MAX_MBPS;
+
+ return 0;
+}
+
+static void mba_sc_domain_destroy(struct rdt_resource *r,
+ struct rdt_domain *d)
+{
+ kfree(d->mbps_val);
+ d->mbps_val = NULL;
+}
+
/*
- * Enable or disable the MBA software controller
- * which helps user specify bandwidth in MBps.
* MBA software controller is supported only if
* MBM is supported and MBA is in linear scale.
*/
+static bool supports_mba_mbps(void)
+{
+ struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
+
+ return (is_mbm_local_enabled() &&
+ r->alloc_capable && is_mba_linear());
+}
+
+/*
+ * Enable or disable the MBA software controller
+ * which helps user specify bandwidth in MBps.
+ */
static int set_mba_sc(bool mba_sc)
{
struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_MBA].r_resctrl;
- struct rdt_hw_domain *hw_dom;
+ u32 num_closid = resctrl_arch_get_num_closid(r);
struct rdt_domain *d;
+ int i;
- if (!is_mbm_enabled() || !is_mba_linear() ||
- mba_sc == is_mba_sc(r))
+ if (!supports_mba_mbps() || mba_sc == is_mba_sc(r))
return -EINVAL;
r->membw.mba_sc = mba_sc;
+
list_for_each_entry(d, &r->domains, list) {
- hw_dom = resctrl_to_arch_dom(d);
- setup_default_ctrlval(r, hw_dom->ctrl_val, hw_dom->mbps_val);
+ for (i = 0; i < num_closid; i++)
+ d->mbps_val[i] = MBA_MAX_MBPS;
}
return 0;
@@ -2106,7 +2144,7 @@ static int schemata_list_create(void)
struct rdt_resource *r;
int ret = 0;
- for_each_alloc_enabled_rdt_resource(r) {
+ for_each_alloc_capable_rdt_resource(r) {
if (resctrl_arch_get_cdp_enabled(r->rid)) {
ret = schemata_list_add(r, CDP_CODE);
if (ret)
@@ -2261,7 +2299,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param)
ctx->enable_cdpl2 = true;
return 0;
case Opt_mba_mbps:
- if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL)
+ if (!supports_mba_mbps())
return -EINVAL;
ctx->enable_mba_mbps = true;
return 0;
@@ -2452,7 +2490,7 @@ static void rdt_kill_sb(struct super_block *sb)
set_mba_sc(false);
/*Put everything back to default values. */
- for_each_alloc_enabled_rdt_resource(r)
+ for_each_alloc_capable_rdt_resource(r)
reset_all_ctrls(r);
cdp_disable_all();
rmdir_all_sub();
@@ -2499,14 +2537,12 @@ static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
* Remove all subdirectories of mon_data of ctrl_mon groups
* and monitor groups with given domain id.
*/
-void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
+static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ unsigned int dom_id)
{
struct rdtgroup *prgrp, *crgrp;
char name[32];
- if (!r->mon_enabled)
- return;
-
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
sprintf(name, "mon_%s_%02d", r->name, dom_id);
kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
@@ -2565,16 +2601,13 @@ out_destroy:
* Add all subdirectories of mon_data for "ctrl_mon" groups
* and "monitor" groups with given domain id.
*/
-void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
- struct rdt_domain *d)
+static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
+ struct rdt_domain *d)
{
struct kernfs_node *parent_kn;
struct rdtgroup *prgrp, *crgrp;
struct list_head *head;
- if (!r->mon_enabled)
- return;
-
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
parent_kn = prgrp->mon.mon_data_kn;
mkdir_mondata_subdir(parent_kn, d, r, prgrp);
@@ -2642,7 +2675,7 @@ static int mkdir_mondata_all(struct kernfs_node *parent_kn,
* Create the subdirectories for each domain. Note that all events
* in a domain like L3 are grouped into a resource whose domain is L3
*/
- for_each_mon_enabled_rdt_resource(r) {
+ for_each_mon_capable_rdt_resource(r) {
ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
if (ret)
goto out_destroy;
@@ -2786,14 +2819,19 @@ static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid)
}
/* Initialize MBA resource with default values. */
-static void rdtgroup_init_mba(struct rdt_resource *r)
+static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid)
{
struct resctrl_staged_config *cfg;
struct rdt_domain *d;
list_for_each_entry(d, &r->domains, list) {
+ if (is_mba_sc(r)) {
+ d->mbps_val[closid] = MBA_MAX_MBPS;
+ continue;
+ }
+
cfg = &d->staged_config[CDP_NONE];
- cfg->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl;
+ cfg->new_ctrl = r->default_ctrl;
cfg->have_new_ctrl = true;
}
}
@@ -2808,7 +2846,9 @@ static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
list_for_each_entry(s, &resctrl_schema_all, list) {
r = s->res;
if (r->rid == RDT_RESOURCE_MBA) {
- rdtgroup_init_mba(r);
+ rdtgroup_init_mba(r, rdtgrp->closid);
+ if (is_mba_sc(r))
+ continue;
} else {
ret = rdtgroup_init_cat(s, rdtgrp->closid);
if (ret < 0)
@@ -3236,6 +3276,110 @@ out:
return ret;
}
+static void domain_destroy_mon_state(struct rdt_domain *d)
+{
+ bitmap_free(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ kfree(d->mbm_local);
+}
+
+void resctrl_offline_domain(struct rdt_resource *r, struct rdt_domain *d)
+{
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+ mba_sc_domain_destroy(r, d);
+
+ if (!r->mon_capable)
+ return;
+
+ /*
+ * If resctrl is mounted, remove all the
+ * per domain monitor data directories.
+ */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ rmdir_mondata_subdir_allrdtgrp(r, d->id);
+
+ if (is_mbm_enabled())
+ cancel_delayed_work(&d->mbm_over);
+ if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) {
+ /*
+ * When a package is going down, forcefully
+ * decrement rmid->ebusy. There is no way to know
+ * that the L3 was flushed and hence may lead to
+ * incorrect counts in rare scenarios, but leaving
+ * the RMID as busy creates RMID leaks if the
+ * package never comes back.
+ */
+ __check_limbo(d, true);
+ cancel_delayed_work(&d->cqm_limbo);
+ }
+
+ domain_destroy_mon_state(d);
+}
+
+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
+{
+ size_t tsize;
+
+ if (is_llc_occupancy_enabled()) {
+ d->rmid_busy_llc = bitmap_zalloc(r->num_rmid, GFP_KERNEL);
+ if (!d->rmid_busy_llc)
+ return -ENOMEM;
+ }
+ if (is_mbm_total_enabled()) {
+ tsize = sizeof(*d->mbm_total);
+ d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_total) {
+ bitmap_free(d->rmid_busy_llc);
+ return -ENOMEM;
+ }
+ }
+ if (is_mbm_local_enabled()) {
+ tsize = sizeof(*d->mbm_local);
+ d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL);
+ if (!d->mbm_local) {
+ bitmap_free(d->rmid_busy_llc);
+ kfree(d->mbm_total);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+int resctrl_online_domain(struct rdt_resource *r, struct rdt_domain *d)
+{
+ int err;
+
+ lockdep_assert_held(&rdtgroup_mutex);
+
+ if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA)
+ /* RDT_RESOURCE_MBA is never mon_capable */
+ return mba_sc_domain_allocate(r, d);
+
+ if (!r->mon_capable)
+ return 0;
+
+ err = domain_setup_mon_state(r, d);
+ if (err)
+ return err;
+
+ if (is_mbm_enabled()) {
+ INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
+ mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
+ }
+
+ if (is_llc_occupancy_enabled())
+ INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
+
+ /* If resctrl is mounted, add per domain monitor data directories. */
+ if (static_branch_unlikely(&rdt_mon_enable_key))
+ mkdir_mondata_subdir_allrdtgrp(r, d);
+
+ return 0;
+}
+
/*
* rdtgroup_init - rdtgroup initialization
*
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 24c1bb8eb196..1ec20807de1e 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,9 @@
#include "encls.h"
#include "sgx.h"
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+ struct sgx_backing *backing);
+
#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
/*
* 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
@@ -344,8 +347,11 @@ static vm_fault_t sgx_encl_eaug_page(struct vm_area_struct *vma,
}
va_page = sgx_encl_grow(encl, false);
- if (IS_ERR(va_page))
+ if (IS_ERR(va_page)) {
+ if (PTR_ERR(va_page) == -EBUSY)
+ vmret = VM_FAULT_NOPAGE;
goto err_out_epc;
+ }
if (va_page)
list_add(&va_page->list, &encl->va_pages);
@@ -906,15 +912,14 @@ const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl)
static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
pgoff_t index)
{
- struct inode *inode = encl->backing->f_path.dentry->d_inode;
- struct address_space *mapping = inode->i_mapping;
+ struct address_space *mapping = encl->backing->f_mapping;
gfp_t gfpmask = mapping_gfp_mask(mapping);
return shmem_read_mapping_page_gfp(mapping, index, gfpmask);
}
/**
- * sgx_encl_get_backing() - Pin the backing storage
+ * __sgx_encl_get_backing() - Pin the backing storage
* @encl: an enclave pointer
* @page_index: enclave page index
* @backing: data for accessing backing storage for the page
@@ -926,7 +931,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
* 0 on success,
* -errno otherwise.
*/
-static int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
+static int __sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing)
{
pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
@@ -1001,7 +1006,7 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
}
/**
- * sgx_encl_alloc_backing() - allocate a new backing storage page
+ * sgx_encl_alloc_backing() - create a new backing storage page
* @encl: an enclave pointer
* @page_index: enclave page index
* @backing: data for accessing backing storage for the page
@@ -1009,7 +1014,9 @@ static struct mem_cgroup *sgx_encl_get_mem_cgroup(struct sgx_encl *encl)
* When called from ksgxd, sets the active memcg from one of the
* mms in the enclave's mm_list prior to any backing page allocation,
* in order to ensure that shmem page allocations are charged to the
- * enclave.
+ * enclave. Create a backing page for loading data back into an EPC page with
+ * ELDU. This function takes a reference on a new backing page which
+ * must be dropped with a corresponding call to sgx_encl_put_backing().
*
* Return:
* 0 on success,
@@ -1022,7 +1029,7 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
struct mem_cgroup *memcg = set_active_memcg(encl_memcg);
int ret;
- ret = sgx_encl_get_backing(encl, page_index, backing);
+ ret = __sgx_encl_get_backing(encl, page_index, backing);
set_active_memcg(memcg);
mem_cgroup_put(encl_memcg);
@@ -1040,15 +1047,17 @@ int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
* It is the caller's responsibility to ensure that it is appropriate to use
* sgx_encl_lookup_backing() rather than sgx_encl_alloc_backing(). If lookup is
* not used correctly, this will cause an allocation which is not accounted for.
+ * This function takes a reference on an existing backing page which must be
+ * dropped with a corresponding call to sgx_encl_put_backing().
*
* Return:
* 0 on success,
* -errno otherwise.
*/
-int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
+static int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing)
{
- return sgx_encl_get_backing(encl, page_index, backing);
+ return __sgx_encl_get_backing(encl, page_index, backing);
}
/**
diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h
index a65a952116fd..f94ff14c9486 100644
--- a/arch/x86/kernel/cpu/sgx/encl.h
+++ b/arch/x86/kernel/cpu/sgx/encl.h
@@ -107,8 +107,6 @@ bool current_is_ksgxd(void);
void sgx_encl_release(struct kref *ref);
int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm);
const cpumask_t *sgx_encl_cpumask(struct sgx_encl *encl);
-int sgx_encl_lookup_backing(struct sgx_encl *encl, unsigned long page_index,
- struct sgx_backing *backing);
int sgx_encl_alloc_backing(struct sgx_encl *encl, unsigned long page_index,
struct sgx_backing *backing);
void sgx_encl_put_backing(struct sgx_backing *backing);
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 515e2a5f25bb..0aad028f04d4 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -49,9 +49,13 @@ static LIST_HEAD(sgx_dirty_page_list);
* Reset post-kexec EPC pages to the uninitialized state. The pages are removed
* from the input list, and made available for the page allocator. SECS pages
* prepending their children in the input list are left intact.
+ *
+ * Return 0 when sanitization was successful or kthread was stopped, and the
+ * number of unsanitized pages otherwise.
*/
-static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
+static unsigned long __sgx_sanitize_pages(struct list_head *dirty_page_list)
{
+ unsigned long left_dirty = 0;
struct sgx_epc_page *page;
LIST_HEAD(dirty);
int ret;
@@ -59,7 +63,7 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
/* dirty_page_list is thread-local, no need for a lock: */
while (!list_empty(dirty_page_list)) {
if (kthread_should_stop())
- return;
+ return 0;
page = list_first_entry(dirty_page_list, struct sgx_epc_page, list);
@@ -92,12 +96,14 @@ static void __sgx_sanitize_pages(struct list_head *dirty_page_list)
} else {
/* The page is not yet clean - move to the dirty list. */
list_move_tail(&page->list, &dirty);
+ left_dirty++;
}
cond_resched();
}
list_splice(&dirty, dirty_page_list);
+ return left_dirty;
}
static bool sgx_reclaimer_age(struct sgx_epc_page *epc_page)
@@ -395,10 +401,7 @@ static int ksgxd(void *p)
* required for SECS pages, whose child pages blocked EREMOVE.
*/
__sgx_sanitize_pages(&sgx_dirty_page_list);
- __sgx_sanitize_pages(&sgx_dirty_page_list);
-
- /* sanity check: */
- WARN_ON(!list_empty(&sgx_dirty_page_list));
+ WARN_ON(__sgx_sanitize_pages(&sgx_dirty_page_list));
while (!kthread_should_stop()) {
if (try_to_freeze())
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index afae4dd77495..b3dba35f466e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -128,7 +128,7 @@ void show_opcodes(struct pt_regs *regs, const char *loglvl)
/* No access to the user space stack of other tasks. Ignore. */
break;
default:
- printk("%sCode: Unable to access opcode bytes at RIP 0x%lx.\n",
+ printk("%sCode: Unable to access opcode bytes at 0x%lx.\n",
loglvl, prologue);
break;
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index 68b38925a74f..44f937015e1e 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -264,11 +264,11 @@ static __init void early_pci_serial_init(char *s)
bar0 = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
/*
- * Verify it is a UART type device
+ * Verify it is a 16550-UART type device
*/
if (((classcode >> 16 != PCI_CLASS_COMMUNICATION_MODEM) &&
(classcode >> 16 != PCI_CLASS_COMMUNICATION_SERIAL)) ||
- (((classcode >> 8) & 0xff) != 0x02)) /* 16550 I/F at BAR0 */ {
+ (((classcode >> 8) & 0xff) != PCI_SERIAL_16550_COMPATIBLE)) {
if (!force)
return;
}
@@ -276,22 +276,22 @@ static __init void early_pci_serial_init(char *s)
/*
* Determine if it is IO or memory mapped
*/
- if (bar0 & 0x01) {
+ if ((bar0 & PCI_BASE_ADDRESS_SPACE) == PCI_BASE_ADDRESS_SPACE_IO) {
/* it is IO mapped */
serial_in = io_serial_in;
serial_out = io_serial_out;
- early_serial_base = bar0&0xfffffffc;
+ early_serial_base = bar0 & PCI_BASE_ADDRESS_IO_MASK;
write_pci_config(bus, slot, func, PCI_COMMAND,
- cmdreg|PCI_COMMAND_IO);
+ cmdreg|PCI_COMMAND_IO);
} else {
/* It is memory mapped - assume 32-bit alignment */
serial_in = mem32_serial_in;
serial_out = mem32_serial_out;
/* WARNING! assuming the address is always in the first 4G */
early_serial_base =
- (unsigned long)early_ioremap(bar0 & 0xfffffff0, 0x10);
+ (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10);
write_pci_config(bus, slot, func, PCI_COMMAND,
- cmdreg|PCI_COMMAND_MEMORY);
+ cmdreg|PCI_COMMAND_MEMORY);
}
/*
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 586f718b8e95..349046434513 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -4,11 +4,8 @@
*/
#include <linux/platform_device.h>
#include <linux/mc146818rtc.h>
-#include <linux/acpi.h>
-#include <linux/bcd.h>
#include <linux/export.h>
#include <linux/pnp.h>
-#include <linux/of.h>
#include <asm/vsyscall.h>
#include <asm/x86_init.h>
@@ -20,26 +17,23 @@
/*
* This is a special lock that is owned by the CPU and holds the index
* register we are working with. It is required for NMI access to the
- * CMOS/RTC registers. See include/asm-i386/mc146818rtc.h for details.
+ * CMOS/RTC registers. See arch/x86/include/asm/mc146818rtc.h for details.
*/
volatile unsigned long cmos_lock;
EXPORT_SYMBOL(cmos_lock);
#endif /* CONFIG_X86_32 */
-/* For two digit years assume time is always after that */
-#define CMOS_YEARS_OFFS 2000
-
DEFINE_SPINLOCK(rtc_lock);
EXPORT_SYMBOL(rtc_lock);
/*
- * In order to set the CMOS clock precisely, set_rtc_mmss has to be
+ * In order to set the CMOS clock precisely, mach_set_cmos_time has to be
* called 500 ms after the second nowtime has started, because when
* nowtime is written into the registers of the CMOS clock, it will
* jump to the next second precisely 500 ms later. Check the Motorola
* MC146818A or Dallas DS12887 data sheet for details.
*/
-int mach_set_rtc_mmss(const struct timespec64 *now)
+int mach_set_cmos_time(const struct timespec64 *now)
{
unsigned long long nowtime = now->tv_sec;
struct rtc_time tm;
@@ -62,8 +56,7 @@ int mach_set_rtc_mmss(const struct timespec64 *now)
void mach_get_cmos_time(struct timespec64 *now)
{
- unsigned int status, year, mon, day, hour, min, sec, century = 0;
- unsigned long flags;
+ struct rtc_time tm;
/*
* If pm_trace abused the RTC as storage, set the timespec to 0,
@@ -74,51 +67,13 @@ void mach_get_cmos_time(struct timespec64 *now)
return;
}
- spin_lock_irqsave(&rtc_lock, flags);
-
- /*
- * If UIP is clear, then we have >= 244 microseconds before
- * RTC registers will be updated. Spec sheet says that this
- * is the reliable way to read RTC - registers. If UIP is set
- * then the register access might be invalid.
- */
- while ((CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP))
- cpu_relax();
-
- sec = CMOS_READ(RTC_SECONDS);
- min = CMOS_READ(RTC_MINUTES);
- hour = CMOS_READ(RTC_HOURS);
- day = CMOS_READ(RTC_DAY_OF_MONTH);
- mon = CMOS_READ(RTC_MONTH);
- year = CMOS_READ(RTC_YEAR);
-
-#ifdef CONFIG_ACPI
- if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
- acpi_gbl_FADT.century)
- century = CMOS_READ(acpi_gbl_FADT.century);
-#endif
-
- status = CMOS_READ(RTC_CONTROL);
- WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
-
- spin_unlock_irqrestore(&rtc_lock, flags);
-
- if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
- sec = bcd2bin(sec);
- min = bcd2bin(min);
- hour = bcd2bin(hour);
- day = bcd2bin(day);
- mon = bcd2bin(mon);
- year = bcd2bin(year);
+ if (mc146818_get_time(&tm)) {
+ pr_err("Unable to read current time from RTC\n");
+ now->tv_sec = now->tv_nsec = 0;
+ return;
}
- if (century) {
- century = bcd2bin(century);
- year += century * 100;
- } else
- year += CMOS_YEARS_OFFS;
-
- now->tv_sec = mktime64(year, mon, day, hour, min, sec);
+ now->tv_sec = rtc_tm_to_time64(&tm);
now->tv_nsec = 0;
}
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d62b2cb85cea..178015a820f0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -63,6 +63,7 @@
#include <asm/insn-eval.h>
#include <asm/vdso.h>
#include <asm/tdx.h>
+#include <asm/cfi.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -313,7 +314,8 @@ static noinstr bool handle_bug(struct pt_regs *regs)
*/
if (regs->flags & X86_EFLAGS_IF)
raw_local_irq_enable();
- if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
+ if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN ||
+ handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) {
regs->ip += LEN_UD2;
handled = true;
}
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index e84ee5cdbd8c..57353519bc11 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -138,7 +138,7 @@ struct x86_platform_ops x86_platform __ro_after_init = {
.calibrate_cpu = native_calibrate_cpu_early,
.calibrate_tsc = native_calibrate_tsc,
.get_wallclock = mach_get_cmos_time,
- .set_wallclock = mach_set_rtc_mmss,
+ .set_wallclock = mach_set_cmos_time,
.iommu_shutdown = iommu_shutdown_noop,
.is_untracked_pat_range = is_ISA_range,
.nmi_init = default_nmi_init,
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 4c1c2c06e96b..7065462378e2 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -311,6 +311,15 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_update_cpuid_runtime);
+static bool kvm_cpuid_has_hyperv(struct kvm_cpuid_entry2 *entries, int nent)
+{
+ struct kvm_cpuid_entry2 *entry;
+
+ entry = cpuid_entry2_find(entries, nent, HYPERV_CPUID_INTERFACE,
+ KVM_CPUID_INDEX_NOT_SIGNIFICANT);
+ return entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX;
+}
+
static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct kvm_lapic *apic = vcpu->arch.apic;
@@ -346,7 +355,8 @@ static void kvm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
vcpu->arch.cr4_guest_rsvd_bits =
__cr4_reserved_bits(guest_cpuid_has, vcpu);
- kvm_hv_set_cpuid(vcpu);
+ kvm_hv_set_cpuid(vcpu, kvm_cpuid_has_hyperv(vcpu->arch.cpuid_entries,
+ vcpu->arch.cpuid_nent));
/* Invoke the vendor callback only after the above state is updated. */
static_call(kvm_x86_vcpu_after_set_cpuid)(vcpu);
@@ -409,6 +419,12 @@ static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2,
return 0;
}
+ if (kvm_cpuid_has_hyperv(e2, nent)) {
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
+ }
+
r = kvm_check_cpuid(vcpu, e2, nent);
if (r)
return r;
@@ -902,8 +918,6 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
entry->edx = 0;
}
break;
- case 9:
- break;
case 0xa: { /* Architectural Performance Monitoring */
union cpuid10_eax eax;
union cpuid10_edx edx;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index aacb28c83e43..3b27622d4642 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -1137,9 +1137,11 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)
static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
struct operand *op)
{
- unsigned reg = ctxt->modrm_reg;
+ unsigned int reg;
- if (!(ctxt->d & ModRM))
+ if (ctxt->d & ModRM)
+ reg = ctxt->modrm_reg;
+ else
reg = (ctxt->b & 7) | ((ctxt->rex_prefix & 1) << 3);
if (ctxt->d & Sse) {
@@ -1953,7 +1955,7 @@ static int em_pop_sreg(struct x86_emulate_ctxt *ctxt)
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->modrm_reg == VCPU_SREG_SS)
+ if (seg == VCPU_SREG_SS)
ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
if (ctxt->op_bytes > 2)
rsp_increment(ctxt, ctxt->op_bytes - 2);
@@ -3645,13 +3647,10 @@ static int em_wrmsr(struct x86_emulate_ctxt *ctxt)
| ((u64)reg_read(ctxt, VCPU_REGS_RDX) << 32);
r = ctxt->ops->set_msr_with_filter(ctxt, msr_index, msr_data);
- if (r == X86EMUL_IO_NEEDED)
- return r;
-
- if (r > 0)
+ if (r == X86EMUL_PROPAGATE_FAULT)
return emulate_gp(ctxt, 0);
- return r < 0 ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+ return r;
}
static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
@@ -3662,15 +3661,14 @@ static int em_rdmsr(struct x86_emulate_ctxt *ctxt)
r = ctxt->ops->get_msr_with_filter(ctxt, msr_index, &msr_data);
- if (r == X86EMUL_IO_NEEDED)
- return r;
-
- if (r)
+ if (r == X86EMUL_PROPAGATE_FAULT)
return emulate_gp(ctxt, 0);
- *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
- *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
- return X86EMUL_CONTINUE;
+ if (r == X86EMUL_CONTINUE) {
+ *reg_write(ctxt, VCPU_REGS_RAX) = (u32)msr_data;
+ *reg_write(ctxt, VCPU_REGS_RDX) = msr_data >> 32;
+ }
+ return r;
}
static int em_store_sreg(struct x86_emulate_ctxt *ctxt, int segment)
@@ -4171,8 +4169,7 @@ static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
ctxt->ops->get_dr(ctxt, 7, &dr7);
- /* Check if DR7.Global_Enable is set */
- return dr7 & (1 << 13);
+ return dr7 & DR7_GD;
}
static int check_dr_read(struct x86_emulate_ctxt *ctxt)
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index ed804447589c..0adf4a437e85 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -38,9 +38,6 @@
#include "irq.h"
#include "fpu.h"
-/* "Hv#1" signature */
-#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
-
#define KVM_HV_MAX_SPARSE_VCPU_SET_BITS DIV_ROUND_UP(KVM_MAX_VCPUS, 64)
static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
@@ -934,11 +931,14 @@ static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
stimer_prepare_msg(stimer);
}
-static int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
{
- struct kvm_vcpu_hv *hv_vcpu;
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
int i;
+ if (hv_vcpu)
+ return 0;
+
hv_vcpu = kzalloc(sizeof(struct kvm_vcpu_hv), GFP_KERNEL_ACCOUNT);
if (!hv_vcpu)
return -ENOMEM;
@@ -962,11 +962,9 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages)
struct kvm_vcpu_hv_synic *synic;
int r;
- if (!to_hv_vcpu(vcpu)) {
- r = kvm_hv_vcpu_init(vcpu);
- if (r)
- return r;
- }
+ r = kvm_hv_vcpu_init(vcpu);
+ if (r)
+ return r;
synic = to_hv_synic(vcpu);
@@ -1660,10 +1658,8 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
if (!host && !vcpu->arch.hyperv_enabled)
return 1;
- if (!to_hv_vcpu(vcpu)) {
- if (kvm_hv_vcpu_init(vcpu))
- return 1;
- }
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1683,10 +1679,8 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
if (!host && !vcpu->arch.hyperv_enabled)
return 1;
- if (!to_hv_vcpu(vcpu)) {
- if (kvm_hv_vcpu_init(vcpu))
- return 1;
- }
+ if (kvm_hv_vcpu_init(vcpu))
+ return 1;
if (kvm_hv_msr_partition_wide(msr)) {
int r;
@@ -1987,49 +1981,49 @@ ret_success:
return HV_STATUS_SUCCESS;
}
-void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu)
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled)
{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
struct kvm_cpuid_entry2 *entry;
- struct kvm_vcpu_hv *hv_vcpu;
- entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_INTERFACE);
- if (entry && entry->eax == HYPERV_CPUID_SIGNATURE_EAX) {
- vcpu->arch.hyperv_enabled = true;
- } else {
- vcpu->arch.hyperv_enabled = false;
+ vcpu->arch.hyperv_enabled = hyperv_enabled;
+
+ if (!hv_vcpu) {
+ /*
+ * KVM should have already allocated kvm_vcpu_hv if Hyper-V is
+ * enabled in CPUID.
+ */
+ WARN_ON_ONCE(vcpu->arch.hyperv_enabled);
return;
}
- if (!to_hv_vcpu(vcpu) && kvm_hv_vcpu_init(vcpu))
- return;
+ memset(&hv_vcpu->cpuid_cache, 0, sizeof(hv_vcpu->cpuid_cache));
- hv_vcpu = to_hv_vcpu(vcpu);
+ if (!vcpu->arch.hyperv_enabled)
+ return;
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_FEATURES);
if (entry) {
hv_vcpu->cpuid_cache.features_eax = entry->eax;
hv_vcpu->cpuid_cache.features_ebx = entry->ebx;
hv_vcpu->cpuid_cache.features_edx = entry->edx;
- } else {
- hv_vcpu->cpuid_cache.features_eax = 0;
- hv_vcpu->cpuid_cache.features_ebx = 0;
- hv_vcpu->cpuid_cache.features_edx = 0;
}
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_ENLIGHTMENT_INFO);
if (entry) {
hv_vcpu->cpuid_cache.enlightenments_eax = entry->eax;
hv_vcpu->cpuid_cache.enlightenments_ebx = entry->ebx;
- } else {
- hv_vcpu->cpuid_cache.enlightenments_eax = 0;
- hv_vcpu->cpuid_cache.enlightenments_ebx = 0;
}
entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES);
if (entry)
hv_vcpu->cpuid_cache.syndbg_cap_eax = entry->eax;
- else
- hv_vcpu->cpuid_cache.syndbg_cap_eax = 0;
+
+ entry = kvm_find_cpuid_entry(vcpu, HYPERV_CPUID_NESTED_FEATURES);
+ if (entry) {
+ hv_vcpu->cpuid_cache.nested_eax = entry->eax;
+ hv_vcpu->cpuid_cache.nested_ebx = entry->ebx;
+ }
}
int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce)
@@ -2552,7 +2546,7 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
case HYPERV_CPUID_NESTED_FEATURES:
ent->eax = evmcs_ver;
ent->eax |= HV_X64_NESTED_MSR_BITMAP;
-
+ ent->ebx |= HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
break;
case HYPERV_CPUID_SYNDBG_VENDOR_AND_MAX_FUNCTIONS:
diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
index da2737f2a956..1030b1b50552 100644
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -23,6 +23,9 @@
#include <linux/kvm_host.h>
+/* "Hv#1" signature */
+#define HYPERV_CPUID_SIGNATURE_EAX 0x31237648
+
/*
* The #defines related to the synthetic debugger are required by KDNet, but
* they are not documented in the Hyper-V TLFS because the synthetic debugger
@@ -141,7 +144,8 @@ void kvm_hv_request_tsc_page_update(struct kvm *kvm);
void kvm_hv_init_vm(struct kvm *kvm);
void kvm_hv_destroy_vm(struct kvm *kvm);
-void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu);
+int kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_set_cpuid(struct kvm_vcpu *vcpu, bool hyperv_enabled);
int kvm_hv_set_enforce_cpuid(struct kvm_vcpu *vcpu, bool enforce);
int kvm_vm_ioctl_hv_eventfd(struct kvm *kvm, struct kvm_hyperv_eventfd *args);
int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid,
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9dda989a1cf0..d7639d126e6c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -3025,17 +3025,8 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
struct kvm_lapic *apic = vcpu->arch.apic;
u8 sipi_vector;
int r;
- unsigned long pe;
- if (!lapic_in_kernel(vcpu))
- return 0;
-
- /*
- * Read pending events before calling the check_events
- * callback.
- */
- pe = smp_load_acquire(&apic->pending_events);
- if (!pe)
+ if (!kvm_apic_has_pending_init_or_sipi(vcpu))
return 0;
if (is_guest_mode(vcpu)) {
@@ -3043,38 +3034,31 @@ int kvm_apic_accept_events(struct kvm_vcpu *vcpu)
if (r < 0)
return r == -EBUSY ? 0 : r;
/*
- * If an event has happened and caused a vmexit,
- * we know INITs are latched and therefore
- * we will not incorrectly deliver an APIC
- * event instead of a vmexit.
+ * Continue processing INIT/SIPI even if a nested VM-Exit
+ * occurred, e.g. pending SIPIs should be dropped if INIT+SIPI
+ * are blocked as a result of transitioning to VMX root mode.
*/
}
/*
- * INITs are latched while CPU is in specific states
- * (SMM, VMX root mode, SVM with GIF=0).
- * Because a CPU cannot be in these states immediately
- * after it has processed an INIT signal (and thus in
- * KVM_MP_STATE_INIT_RECEIVED state), just eat SIPIs
- * and leave the INIT pending.
+ * INITs are blocked while CPU is in specific states (SMM, VMX root
+ * mode, SVM with GIF=0), while SIPIs are dropped if the CPU isn't in
+ * wait-for-SIPI (WFS).
*/
- if (kvm_vcpu_latch_init(vcpu)) {
+ if (!kvm_apic_init_sipi_allowed(vcpu)) {
WARN_ON_ONCE(vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED);
- if (test_bit(KVM_APIC_SIPI, &pe))
- clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ clear_bit(KVM_APIC_SIPI, &apic->pending_events);
return 0;
}
- if (test_bit(KVM_APIC_INIT, &pe)) {
- clear_bit(KVM_APIC_INIT, &apic->pending_events);
+ if (test_and_clear_bit(KVM_APIC_INIT, &apic->pending_events)) {
kvm_vcpu_reset(vcpu, true);
if (kvm_vcpu_is_bsp(apic->vcpu))
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
else
vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
}
- if (test_bit(KVM_APIC_SIPI, &pe)) {
- clear_bit(KVM_APIC_SIPI, &apic->pending_events);
+ if (test_and_clear_bit(KVM_APIC_SIPI, &apic->pending_events)) {
if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
/* evaluate pending_events before reading the vector */
smp_rmb();
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 117a46df5cc1..a5ac4a5a5179 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -7,6 +7,7 @@
#include <linux/kvm_host.h>
#include "hyperv.h"
+#include "kvm_cache_regs.h"
#define KVM_APIC_INIT 0
#define KVM_APIC_SIPI 1
@@ -223,11 +224,17 @@ static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && vcpu->arch.apic->apicv_active;
}
-static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
+static inline bool kvm_apic_has_pending_init_or_sipi(struct kvm_vcpu *vcpu)
{
return lapic_in_kernel(vcpu) && vcpu->arch.apic->pending_events;
}
+static inline bool kvm_apic_init_sipi_allowed(struct kvm_vcpu *vcpu)
+{
+ return !is_smm(vcpu) &&
+ !static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
+}
+
static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
{
return (irq->delivery_mode == APIC_DM_LOWEST ||
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3552e6af3684..6f81539061d6 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -1667,6 +1667,18 @@ static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
}
+static void kvm_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_mod_used_mmu_pages(kvm, +1);
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+}
+
+static void kvm_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_mod_used_mmu_pages(kvm, -1);
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+}
+
static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
{
MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
@@ -2124,7 +2136,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_shadow_page(struct kvm *kvm,
*/
sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
list_add(&sp->link, &kvm->arch.active_mmu_pages);
- kvm_mod_used_mmu_pages(kvm, +1);
+ kvm_account_mmu_page(kvm, sp);
sp->gfn = gfn;
sp->role = role;
@@ -2458,7 +2470,7 @@ static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
list_add(&sp->link, invalid_list);
else
list_move(&sp->link, invalid_list);
- kvm_mod_used_mmu_pages(kvm, -1);
+ kvm_unaccount_mmu_page(kvm, sp);
} else {
/*
* Remove the active root from the active page list, the root
@@ -4292,7 +4304,7 @@ int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
vcpu->arch.l1tf_flush_l1d = true;
if (!flags) {
- trace_kvm_page_fault(fault_address, error_code);
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
if (kvm_event_needs_reinjection(vcpu))
kvm_mmu_unprotect_page_virt(vcpu, fault_address);
@@ -6704,10 +6716,12 @@ int kvm_mmu_vendor_module_init(void)
ret = register_shrinker(&mmu_shrinker, "x86-mmu");
if (ret)
- goto out;
+ goto out_shrinker;
return 0;
+out_shrinker:
+ percpu_counter_destroy(&kvm_total_used_mmu_pages);
out:
mmu_destroy_caches();
return ret;
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 39e0205e7300..5ab5f94dcb6f 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -472,7 +472,7 @@ error:
#if PTTYPE == PTTYPE_EPT
/*
- * Use PFERR_RSVD_MASK in error_code to to tell if EPT
+ * Use PFERR_RSVD_MASK in error_code to tell if EPT
* misconfiguration requires to be injected. The detection is
* done by is_rsvd_bits_set() above.
*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index bf2ccf9debca..672f0432d777 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -372,6 +372,16 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
}
}
+static void tdp_account_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, +1);
+}
+
+static void tdp_unaccount_mmu_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+{
+ kvm_account_pgtable_pages((void *)sp->spt, -1);
+}
+
/**
* tdp_mmu_unlink_sp() - Remove a shadow page from the list of used pages
*
@@ -384,6 +394,7 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn,
static void tdp_mmu_unlink_sp(struct kvm *kvm, struct kvm_mmu_page *sp,
bool shared)
{
+ tdp_unaccount_mmu_page(kvm, sp);
if (shared)
spin_lock(&kvm->arch.tdp_mmu_pages_lock);
else
@@ -1132,6 +1143,7 @@ static int tdp_mmu_link_sp(struct kvm *kvm, struct tdp_iter *iter,
if (account_nx)
account_huge_nx_page(kvm, sp);
spin_unlock(&kvm->arch.tdp_mmu_pages_lock);
+ tdp_account_mmu_page(kvm, sp);
return 0;
}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 76dcc8a3e849..4c620999d230 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -55,28 +55,6 @@ static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
nested_svm_vmexit(svm);
}
-static bool nested_svm_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- struct vcpu_svm *svm = to_svm(vcpu);
- struct vmcb *vmcb = svm->vmcb;
-
- WARN_ON(!is_guest_mode(vcpu));
-
- if (vmcb12_is_intercept(&svm->nested.ctl,
- INTERCEPT_EXCEPTION_OFFSET + PF_VECTOR) &&
- !WARN_ON_ONCE(svm->nested.nested_run_pending)) {
- vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + PF_VECTOR;
- vmcb->control.exit_code_hi = 0;
- vmcb->control.exit_info_1 = fault->error_code;
- vmcb->control.exit_info_2 = fault->address;
- nested_svm_vmexit(svm);
- return true;
- }
-
- return false;
-}
-
static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -468,7 +446,7 @@ static void nested_save_pending_event_to_vmcb12(struct vcpu_svm *svm,
unsigned int nr;
if (vcpu->arch.exception.injected) {
- nr = vcpu->arch.exception.nr;
+ nr = vcpu->arch.exception.vector;
exit_int_info = nr | SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_EXEPT;
if (vcpu->arch.exception.has_error_code) {
@@ -781,11 +759,15 @@ int enter_svm_guest_mode(struct kvm_vcpu *vcpu, u64 vmcb12_gpa,
struct vcpu_svm *svm = to_svm(vcpu);
int ret;
- trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb12_gpa,
- vmcb12->save.rip,
- vmcb12->control.int_ctl,
- vmcb12->control.event_inj,
- vmcb12->control.nested_ctl);
+ trace_kvm_nested_vmenter(svm->vmcb->save.rip,
+ vmcb12_gpa,
+ vmcb12->save.rip,
+ vmcb12->control.int_ctl,
+ vmcb12->control.event_inj,
+ vmcb12->control.nested_ctl,
+ vmcb12->control.nested_cr3,
+ vmcb12->save.cr3,
+ KVM_ISA_SVM);
trace_kvm_nested_intercepts(vmcb12->control.intercepts[INTERCEPT_CR] & 0xffff,
vmcb12->control.intercepts[INTERCEPT_CR] >> 16,
@@ -1304,44 +1286,46 @@ int nested_svm_check_permissions(struct kvm_vcpu *vcpu)
return 0;
}
-static bool nested_exit_on_exception(struct vcpu_svm *svm)
+static bool nested_svm_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
{
- unsigned int nr = svm->vcpu.arch.exception.nr;
+ struct vcpu_svm *svm = to_svm(vcpu);
- return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(nr));
+ return (svm->nested.ctl.intercepts[INTERCEPT_EXCEPTION] & BIT(vector));
}
-static void nested_svm_inject_exception_vmexit(struct vcpu_svm *svm)
+static void nested_svm_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{
- unsigned int nr = svm->vcpu.arch.exception.nr;
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
+ vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + ex->vector;
vmcb->control.exit_code_hi = 0;
- if (svm->vcpu.arch.exception.has_error_code)
- vmcb->control.exit_info_1 = svm->vcpu.arch.exception.error_code;
+ if (ex->has_error_code)
+ vmcb->control.exit_info_1 = ex->error_code;
/*
* EXITINFO2 is undefined for all exception intercepts other
* than #PF.
*/
- if (nr == PF_VECTOR) {
- if (svm->vcpu.arch.exception.nested_apf)
- vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
- else if (svm->vcpu.arch.exception.has_payload)
- vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
+ if (ex->vector == PF_VECTOR) {
+ if (ex->has_payload)
+ vmcb->control.exit_info_2 = ex->payload;
else
- vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
- } else if (nr == DB_VECTOR) {
- /* See inject_pending_event. */
- kvm_deliver_exception_payload(&svm->vcpu);
- if (svm->vcpu.arch.dr7 & DR7_GD) {
- svm->vcpu.arch.dr7 &= ~DR7_GD;
- kvm_update_dr7(&svm->vcpu);
+ vmcb->control.exit_info_2 = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ /* See kvm_check_and_inject_events(). */
+ kvm_deliver_exception_payload(vcpu, ex);
+
+ if (vcpu->arch.dr7 & DR7_GD) {
+ vcpu->arch.dr7 &= ~DR7_GD;
+ kvm_update_dr7(vcpu);
}
- } else
- WARN_ON(svm->vcpu.arch.exception.has_payload);
+ } else {
+ WARN_ON(ex->has_payload);
+ }
nested_svm_vmexit(svm);
}
@@ -1353,10 +1337,22 @@ static inline bool nested_exit_on_init(struct vcpu_svm *svm)
static int svm_check_nested_events(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
- bool block_nested_events =
- kvm_event_needs_reinjection(vcpu) || svm->nested.nested_run_pending;
struct kvm_lapic *apic = vcpu->arch.apic;
+ struct vcpu_svm *svm = to_svm(vcpu);
+ /*
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
+ */
+ bool block_nested_exceptions = svm->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
@@ -1368,18 +1364,16 @@ static int svm_check_nested_events(struct kvm_vcpu *vcpu)
return 0;
}
- if (vcpu->arch.exception.pending) {
- /*
- * Only a pending nested run can block a pending exception.
- * Otherwise an injected NMI/interrupt should either be
- * lost or delivered to the nested hypervisor in the EXITINTINFO
- * vmcb field, while delivering the pending exception.
- */
- if (svm->nested.nested_run_pending)
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
return -EBUSY;
- if (!nested_exit_on_exception(svm))
- return 0;
- nested_svm_inject_exception_vmexit(svm);
+ nested_svm_inject_exception_vmexit(vcpu);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
return 0;
}
@@ -1720,8 +1714,8 @@ static bool svm_get_nested_state_pages(struct kvm_vcpu *vcpu)
struct kvm_x86_nested_ops svm_nested_ops = {
.leave_nested = svm_leave_nested,
+ .is_exception_vmexit = nested_svm_is_exception_vmexit,
.check_events = svm_check_nested_events,
- .handle_page_fault_workaround = nested_svm_handle_page_fault_workaround,
.triple_fault = nested_svm_triple_fault,
.get_nested_state_pages = svm_get_nested_state_pages,
.get_state = svm_get_nested_state,
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index f3813dbacb9f..58f0077d9357 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -461,24 +461,22 @@ static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
return 0;
}
-static void svm_queue_exception(struct kvm_vcpu *vcpu)
+static void svm_inject_exception(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct vcpu_svm *svm = to_svm(vcpu);
- unsigned nr = vcpu->arch.exception.nr;
- bool has_error_code = vcpu->arch.exception.has_error_code;
- u32 error_code = vcpu->arch.exception.error_code;
- kvm_deliver_exception_payload(vcpu);
+ kvm_deliver_exception_payload(vcpu, ex);
- if (kvm_exception_is_soft(nr) &&
+ if (kvm_exception_is_soft(ex->vector) &&
svm_update_soft_interrupt_rip(vcpu))
return;
- svm->vmcb->control.event_inj = nr
+ svm->vmcb->control.event_inj = ex->vector
| SVM_EVTINJ_VALID
- | (has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
+ | (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
| SVM_EVTINJ_TYPE_EXEPT;
- svm->vmcb->control.event_inj_err = error_code;
+ svm->vmcb->control.event_inj_err = ex->error_code;
}
static void svm_init_erratum_383(void)
@@ -1975,7 +1973,7 @@ static int npf_interception(struct kvm_vcpu *vcpu)
u64 fault_address = svm->vmcb->control.exit_info_2;
u64 error_code = svm->vmcb->control.exit_info_1;
- trace_kvm_page_fault(fault_address, error_code);
+ trace_kvm_page_fault(vcpu, fault_address, error_code);
return kvm_mmu_page_fault(vcpu, fault_address, error_code,
static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
svm->vmcb->control.insn_bytes : NULL,
@@ -2341,7 +2339,8 @@ void svm_set_gif(struct vcpu_svm *svm, bool value)
enable_gif(svm);
if (svm->vcpu.arch.smi_pending ||
svm->vcpu.arch.nmi_pending ||
- kvm_cpu_has_injectable_intr(&svm->vcpu))
+ kvm_cpu_has_injectable_intr(&svm->vcpu) ||
+ kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
} else {
disable_gif(svm);
@@ -3522,7 +3521,7 @@ void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode,
/* Note, this is called iff the local APIC is in-kernel. */
if (!READ_ONCE(vcpu->arch.apic->apicv_active)) {
- /* Process the interrupt via inject_pending_event */
+ /* Process the interrupt via kvm_check_and_inject_events(). */
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
return;
@@ -4697,15 +4696,7 @@ static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- /*
- * TODO: Last condition latch INIT signals on vCPU when
- * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
- * To properly emulate the INIT intercept,
- * svm_check_nested_events() should call nested_svm_vmexit()
- * if an INIT signal is pending.
- */
- return !gif_set(svm) ||
- (vmcb_is_intercept(&svm->vmcb->control, INTERCEPT_INIT));
+ return !gif_set(svm);
}
static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -4798,7 +4789,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.patch_hypercall = svm_patch_hypercall,
.inject_irq = svm_inject_irq,
.inject_nmi = svm_inject_nmi,
- .queue_exception = svm_queue_exception,
+ .inject_exception = svm_inject_exception,
.cancel_injection = svm_cancel_injection,
.interrupt_allowed = svm_interrupt_allowed,
.nmi_allowed = svm_nmi_allowed,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 2120d7c060a9..bc25589ad588 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -394,20 +394,25 @@ TRACE_EVENT(kvm_inj_exception,
* Tracepoint for page fault.
*/
TRACE_EVENT(kvm_page_fault,
- TP_PROTO(unsigned long fault_address, unsigned int error_code),
- TP_ARGS(fault_address, error_code),
+ TP_PROTO(struct kvm_vcpu *vcpu, u64 fault_address, u64 error_code),
+ TP_ARGS(vcpu, fault_address, error_code),
TP_STRUCT__entry(
- __field( unsigned long, fault_address )
- __field( unsigned int, error_code )
+ __field( unsigned int, vcpu_id )
+ __field( unsigned long, guest_rip )
+ __field( u64, fault_address )
+ __field( u64, error_code )
),
TP_fast_assign(
+ __entry->vcpu_id = vcpu->vcpu_id;
+ __entry->guest_rip = kvm_rip_read(vcpu);
__entry->fault_address = fault_address;
__entry->error_code = error_code;
),
- TP_printk("address %lx error_code %x",
+ TP_printk("vcpu %u rip 0x%lx address 0x%016llx error_code 0x%llx",
+ __entry->vcpu_id, __entry->guest_rip,
__entry->fault_address, __entry->error_code)
);
@@ -589,10 +594,12 @@ TRACE_EVENT(kvm_pv_eoi,
/*
* Tracepoint for nested VMRUN
*/
-TRACE_EVENT(kvm_nested_vmrun,
+TRACE_EVENT(kvm_nested_vmenter,
TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
- __u32 event_inj, bool npt),
- TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
+ __u32 event_inj, bool tdp_enabled, __u64 guest_tdp_pgd,
+ __u64 guest_cr3, __u32 isa),
+ TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, tdp_enabled,
+ guest_tdp_pgd, guest_cr3, isa),
TP_STRUCT__entry(
__field( __u64, rip )
@@ -600,7 +607,9 @@ TRACE_EVENT(kvm_nested_vmrun,
__field( __u64, nested_rip )
__field( __u32, int_ctl )
__field( __u32, event_inj )
- __field( bool, npt )
+ __field( bool, tdp_enabled )
+ __field( __u64, guest_pgd )
+ __field( __u32, isa )
),
TP_fast_assign(
@@ -609,14 +618,24 @@ TRACE_EVENT(kvm_nested_vmrun,
__entry->nested_rip = nested_rip;
__entry->int_ctl = int_ctl;
__entry->event_inj = event_inj;
- __entry->npt = npt;
- ),
-
- TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
- "event_inj: 0x%08x npt: %s",
- __entry->rip, __entry->vmcb, __entry->nested_rip,
- __entry->int_ctl, __entry->event_inj,
- __entry->npt ? "on" : "off")
+ __entry->tdp_enabled = tdp_enabled;
+ __entry->guest_pgd = tdp_enabled ? guest_tdp_pgd : guest_cr3;
+ __entry->isa = isa;
+ ),
+
+ TP_printk("rip: 0x%016llx %s: 0x%016llx nested_rip: 0x%016llx "
+ "int_ctl: 0x%08x event_inj: 0x%08x nested_%s=%s %s: 0x%016llx",
+ __entry->rip,
+ __entry->isa == KVM_ISA_VMX ? "vmcs" : "vmcb",
+ __entry->vmcb,
+ __entry->nested_rip,
+ __entry->int_ctl,
+ __entry->event_inj,
+ __entry->isa == KVM_ISA_VMX ? "ept" : "npt",
+ __entry->tdp_enabled ? "y" : "n",
+ !__entry->tdp_enabled ? "guest_cr3" :
+ __entry->isa == KVM_ISA_VMX ? "nested_eptp" : "nested_cr3",
+ __entry->guest_pgd)
);
TRACE_EVENT(kvm_nested_intercepts,
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index c5e5dfef69c7..87c4e46daf37 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -65,6 +65,7 @@ struct vmcs_config {
u64 cpu_based_3rd_exec_ctrl;
u32 vmexit_ctrl;
u32 vmentry_ctrl;
+ u64 misc;
struct nested_vmx_msrs nested;
};
extern struct vmcs_config vmcs_config;
@@ -82,7 +83,8 @@ static inline bool cpu_has_vmx_basic_inout(void)
static inline bool cpu_has_virtual_nmis(void)
{
- return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
+ return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS &&
+ vmcs_config.cpu_based_exec_ctrl & CPU_BASED_NMI_WINDOW_EXITING;
}
static inline bool cpu_has_vmx_preemption_timer(void)
@@ -224,11 +226,8 @@ static inline bool cpu_has_vmx_vmfunc(void)
static inline bool cpu_has_vmx_shadow_vmcs(void)
{
- u64 vmx_msr;
-
/* check if the cpu supports writing r/o exit information fields */
- rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
- if (!(vmx_msr & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
+ if (!(vmcs_config.misc & MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS))
return false;
return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -370,10 +369,7 @@ static inline bool cpu_has_vmx_invvpid_global(void)
static inline bool cpu_has_vmx_intel_pt(void)
{
- u64 vmx_msr;
-
- rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
- return (vmx_msr & MSR_IA32_VMX_MISC_INTEL_PT) &&
+ return (vmcs_config.misc & MSR_IA32_VMX_MISC_INTEL_PT) &&
(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_PT_USE_GPA) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 6a61b1ae7942..d8b23c96d627 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -10,6 +10,8 @@
#include "vmx.h"
#include "trace.h"
+#define CC KVM_NESTED_VMENTER_CONSISTENCY_CHECK
+
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
#define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x)
@@ -28,6 +30,8 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
EVMCS1_FIELD(HOST_IA32_EFER, host_ia32_efer,
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ EVMCS1_FIELD(HOST_IA32_PERF_GLOBAL_CTRL, host_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
EVMCS1_FIELD(HOST_CR0, host_cr0,
HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
EVMCS1_FIELD(HOST_CR3, host_cr3,
@@ -78,6 +82,8 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
EVMCS1_FIELD(GUEST_IA32_EFER, guest_ia32_efer,
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ EVMCS1_FIELD(GUEST_IA32_PERF_GLOBAL_CTRL, guest_ia32_perf_global_ctrl,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
EVMCS1_FIELD(GUEST_PDPTR0, guest_pdptr0,
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
EVMCS1_FIELD(GUEST_PDPTR1, guest_pdptr1,
@@ -126,6 +132,28 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
EVMCS1_FIELD(XSS_EXIT_BITMAP, xss_exit_bitmap,
HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(ENCLS_EXITING_BITMAP, encls_exiting_bitmap,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ EVMCS1_FIELD(TSC_MULTIPLIER, tsc_multiplier,
+ HV_VMX_ENLIGHTENED_CLEAN_FIELD_CONTROL_GRP2),
+ /*
+ * Not used by KVM:
+ *
+ * EVMCS1_FIELD(0x00006828, guest_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x0000682A, guest_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_BASIC),
+ * EVMCS1_FIELD(0x0000682C, guest_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00002816, guest_ia32_lbr_ctl,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_GUEST_GRP1),
+ * EVMCS1_FIELD(0x00006C18, host_ia32_s_cet,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1A, host_ssp,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ * EVMCS1_FIELD(0x00006C1C, host_ia32_int_ssp_table_addr,
+ * HV_VMX_ENLIGHTENED_CLEAN_FIELD_HOST_GRP1),
+ */
/* 64 bit read only */
EVMCS1_FIELD(GUEST_PHYSICAL_ADDRESS, guest_physical_address,
@@ -294,19 +322,6 @@ const struct evmcs_field vmcs_field_to_evmcs_1[] = {
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-#if IS_ENABLED(CONFIG_HYPERV)
-__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
-{
- vmcs_conf->cpu_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_EXEC_CTRL;
- vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
- vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
- vmcs_conf->cpu_based_3rd_exec_ctrl = 0;
-
- vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
- vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
-}
-#endif
-
bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa)
{
struct hv_vp_assist_page assist_page;
@@ -334,6 +349,9 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
* versions: lower 8 bits is the minimal version, higher 8 bits is the
* maximum supported version. KVM supports versions from 1 to
* KVM_EVMCS_VERSION.
+ *
+ * Note, do not check the Hyper-V is fully enabled in guest CPUID, this
+ * helper is used to _get_ the vCPU's supported CPUID.
*/
if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
(!vcpu || to_vmx(vcpu)->nested.enlightened_vmcs_enabled))
@@ -342,10 +360,67 @@ uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
return 0;
}
-void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
+enum evmcs_revision {
+ EVMCSv1_LEGACY,
+ NR_EVMCS_REVISIONS,
+};
+
+enum evmcs_ctrl_type {
+ EVMCS_EXIT_CTRLS,
+ EVMCS_ENTRY_CTRLS,
+ EVMCS_2NDEXEC,
+ EVMCS_PINCTRL,
+ EVMCS_VMFUNC,
+ NR_EVMCS_CTRLS,
+};
+
+static const u32 evmcs_unsupported_ctrls[NR_EVMCS_CTRLS][NR_EVMCS_REVISIONS] = {
+ [EVMCS_EXIT_CTRLS] = {
+ [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMEXIT_CTRL,
+ },
+ [EVMCS_ENTRY_CTRLS] = {
+ [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMENTRY_CTRL,
+ },
+ [EVMCS_2NDEXEC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_2NDEXEC,
+ },
+ [EVMCS_PINCTRL] = {
+ [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_PINCTRL,
+ },
+ [EVMCS_VMFUNC] = {
+ [EVMCSv1_LEGACY] = EVMCS1_UNSUPPORTED_VMFUNC,
+ },
+};
+
+static u32 evmcs_get_unsupported_ctls(enum evmcs_ctrl_type ctrl_type)
+{
+ enum evmcs_revision evmcs_rev = EVMCSv1_LEGACY;
+
+ return evmcs_unsupported_ctrls[ctrl_type][evmcs_rev];
+}
+
+static bool evmcs_has_perf_global_ctrl(struct kvm_vcpu *vcpu)
+{
+ struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu);
+
+ /*
+ * PERF_GLOBAL_CTRL has a quirk where some Windows guests may fail to
+ * boot if a PV CPUID feature flag is not also set. Treat the fields
+ * as unsupported if the flag is not set in guest CPUID. This should
+ * be called only for guest accesses, and all guest accesses should be
+ * gated on Hyper-V being enabled and initialized.
+ */
+ if (WARN_ON_ONCE(!hv_vcpu))
+ return false;
+
+ return hv_vcpu->cpuid_cache.nested_ebx & HV_X64_NESTED_EVMCS1_PERF_GLOBAL_CTRL;
+}
+
+void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
{
u32 ctl_low = (u32)*pdata;
u32 ctl_high = (u32)(*pdata >> 32);
+ u32 unsupported_ctrls;
/*
* Hyper-V 2016 and 2019 try using these features even when eVMCS
@@ -354,77 +429,70 @@ void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
switch (msr_index) {
case MSR_IA32_VMX_EXIT_CTLS:
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+ unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_EXIT_CTRLS);
+ if (!evmcs_has_perf_global_ctrl(vcpu))
+ unsupported_ctrls |= VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= ~unsupported_ctrls;
break;
case MSR_IA32_VMX_ENTRY_CTLS:
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+ unsupported_ctrls = evmcs_get_unsupported_ctls(EVMCS_ENTRY_CTRLS);
+ if (!evmcs_has_perf_global_ctrl(vcpu))
+ unsupported_ctrls |= VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ ctl_high &= ~unsupported_ctrls;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+ ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_2NDEXEC);
break;
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
case MSR_IA32_VMX_PINBASED_CTLS:
- ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+ ctl_high &= ~evmcs_get_unsupported_ctls(EVMCS_PINCTRL);
break;
case MSR_IA32_VMX_VMFUNC:
- ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC;
+ ctl_low &= ~evmcs_get_unsupported_ctls(EVMCS_VMFUNC);
break;
}
*pdata = ctl_low | ((u64)ctl_high << 32);
}
+static bool nested_evmcs_is_valid_controls(enum evmcs_ctrl_type ctrl_type,
+ u32 val)
+{
+ return !(val & evmcs_get_unsupported_ctls(ctrl_type));
+}
+
int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
{
- int ret = 0;
- u32 unsupp_ctl;
-
- unsupp_ctl = vmcs12->pin_based_vm_exec_control &
- EVMCS1_UNSUPPORTED_PINCTRL;
- if (unsupp_ctl) {
- trace_kvm_nested_vmenter_failed(
- "eVMCS: unsupported pin-based VM-execution controls",
- unsupp_ctl);
- ret = -EINVAL;
- }
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_PINCTRL,
+ vmcs12->pin_based_vm_exec_control)))
+ return -EINVAL;
- unsupp_ctl = vmcs12->secondary_vm_exec_control &
- EVMCS1_UNSUPPORTED_2NDEXEC;
- if (unsupp_ctl) {
- trace_kvm_nested_vmenter_failed(
- "eVMCS: unsupported secondary VM-execution controls",
- unsupp_ctl);
- ret = -EINVAL;
- }
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_2NDEXEC,
+ vmcs12->secondary_vm_exec_control)))
+ return -EINVAL;
- unsupp_ctl = vmcs12->vm_exit_controls &
- EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
- if (unsupp_ctl) {
- trace_kvm_nested_vmenter_failed(
- "eVMCS: unsupported VM-exit controls",
- unsupp_ctl);
- ret = -EINVAL;
- }
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_EXIT_CTRLS,
+ vmcs12->vm_exit_controls)))
+ return -EINVAL;
- unsupp_ctl = vmcs12->vm_entry_controls &
- EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
- if (unsupp_ctl) {
- trace_kvm_nested_vmenter_failed(
- "eVMCS: unsupported VM-entry controls",
- unsupp_ctl);
- ret = -EINVAL;
- }
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_ENTRY_CTRLS,
+ vmcs12->vm_entry_controls)))
+ return -EINVAL;
- unsupp_ctl = vmcs12->vm_function_control & EVMCS1_UNSUPPORTED_VMFUNC;
- if (unsupp_ctl) {
- trace_kvm_nested_vmenter_failed(
- "eVMCS: unsupported VM-function controls",
- unsupp_ctl);
- ret = -EINVAL;
- }
+ /*
+ * VM-Func controls are 64-bit, but KVM currently doesn't support any
+ * controls in bits 63:32, i.e. dropping those bits on the consistency
+ * check is intentional.
+ */
+ if (WARN_ON_ONCE(vmcs12->vm_function_control >> 32))
+ return -EINVAL;
- return ret;
+ if (CC(!nested_evmcs_is_valid_controls(EVMCS_VMFUNC,
+ vmcs12->vm_function_control)))
+ return -EINVAL;
+
+ return 0;
}
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index f886a8ff0342..6f746ef3c038 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -42,8 +42,6 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
* PLE_GAP = 0x00004020,
* PLE_WINDOW = 0x00004022,
* VMX_PREEMPTION_TIMER_VALUE = 0x0000482E,
- * GUEST_IA32_PERF_GLOBAL_CTRL = 0x00002808,
- * HOST_IA32_PERF_GLOBAL_CTRL = 0x00002c04,
*
* Currently unsupported in KVM:
* GUEST_IA32_RTIT_CTL = 0x00002814,
@@ -61,9 +59,8 @@ DECLARE_STATIC_KEY_FALSE(enable_evmcs);
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
- (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
- VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
-#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
+ (VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
+#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (0)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
struct evmcs_field {
@@ -212,7 +209,6 @@ static inline void evmcs_load(u64 phys_addr)
vp_ap->enlighten_vmentry = 1;
}
-__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static __always_inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
@@ -243,7 +239,7 @@ bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
-void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata);
+void nested_evmcs_filter_control_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index ddd4367d4826..0c62352dda6a 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -439,61 +439,22 @@ static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
return inequality ^ bit;
}
-
-/*
- * KVM wants to inject page-faults which it got to the guest. This function
- * checks whether in a nested guest, we need to inject them to L1 or L2.
- */
-static int nested_vmx_check_exception(struct kvm_vcpu *vcpu, unsigned long *exit_qual)
+static bool nested_vmx_is_exception_vmexit(struct kvm_vcpu *vcpu, u8 vector,
+ u32 error_code)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- unsigned int nr = vcpu->arch.exception.nr;
- bool has_payload = vcpu->arch.exception.has_payload;
- unsigned long payload = vcpu->arch.exception.payload;
-
- if (nr == PF_VECTOR) {
- if (vcpu->arch.exception.nested_apf) {
- *exit_qual = vcpu->arch.apf.nested_apf_token;
- return 1;
- }
- if (nested_vmx_is_page_fault_vmexit(vmcs12,
- vcpu->arch.exception.error_code)) {
- *exit_qual = has_payload ? payload : vcpu->arch.cr2;
- return 1;
- }
- } else if (vmcs12->exception_bitmap & (1u << nr)) {
- if (nr == DB_VECTOR) {
- if (!has_payload) {
- payload = vcpu->arch.dr6;
- payload &= ~DR6_BT;
- payload ^= DR6_ACTIVE_LOW;
- }
- *exit_qual = payload;
- } else
- *exit_qual = 0;
- return 1;
- }
- return 0;
-}
-
-static bool nested_vmx_handle_page_fault_workaround(struct kvm_vcpu *vcpu,
- struct x86_exception *fault)
-{
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- WARN_ON(!is_guest_mode(vcpu));
+ /*
+ * Drop bits 31:16 of the error code when performing the #PF mask+match
+ * check. All VMCS fields involved are 32 bits, but Intel CPUs never
+ * set bits 31:16 and VMX disallows setting bits 31:16 in the injected
+ * error code. Including the to-be-dropped bits in the check might
+ * result in an "impossible" or missed exit from L1's perspective.
+ */
+ if (vector == PF_VECTOR)
+ return nested_vmx_is_page_fault_vmexit(vmcs12, (u16)error_code);
- if (nested_vmx_is_page_fault_vmexit(vmcs12, fault->error_code) &&
- !WARN_ON_ONCE(to_vmx(vcpu)->nested.nested_run_pending)) {
- vmcs12->vm_exit_intr_error_code = fault->error_code;
- nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
- PF_VECTOR | INTR_TYPE_HARD_EXCEPTION |
- INTR_INFO_DELIVER_CODE_MASK | INTR_INFO_VALID_MASK,
- fault->address);
- return true;
- }
- return false;
+ return (vmcs12->exception_bitmap & (1u << vector));
}
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
@@ -1607,6 +1568,10 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
vmcs12->guest_rflags = evmcs->guest_rflags;
vmcs12->guest_interruptibility_info =
evmcs->guest_interruptibility_info;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ssp = evmcs->guest_ssp;
+ */
}
if (unlikely(!(hv_clean_fields &
@@ -1653,6 +1618,13 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
vmcs12->host_fs_selector = evmcs->host_fs_selector;
vmcs12->host_gs_selector = evmcs->host_gs_selector;
vmcs12->host_tr_selector = evmcs->host_tr_selector;
+ vmcs12->host_ia32_perf_global_ctrl = evmcs->host_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->host_ia32_s_cet = evmcs->host_ia32_s_cet;
+ * vmcs12->host_ssp = evmcs->host_ssp;
+ * vmcs12->host_ia32_int_ssp_table_addr = evmcs->host_ia32_int_ssp_table_addr;
+ */
}
if (unlikely(!(hv_clean_fields &
@@ -1720,6 +1692,8 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
vmcs12->tsc_offset = evmcs->tsc_offset;
vmcs12->virtual_apic_page_addr = evmcs->virtual_apic_page_addr;
vmcs12->xss_exit_bitmap = evmcs->xss_exit_bitmap;
+ vmcs12->encls_exiting_bitmap = evmcs->encls_exiting_bitmap;
+ vmcs12->tsc_multiplier = evmcs->tsc_multiplier;
}
if (unlikely(!(hv_clean_fields &
@@ -1767,6 +1741,13 @@ static void copy_enlightened_to_vmcs12(struct vcpu_vmx *vmx, u32 hv_clean_fields
vmcs12->guest_bndcfgs = evmcs->guest_bndcfgs;
vmcs12->guest_activity_state = evmcs->guest_activity_state;
vmcs12->guest_sysenter_cs = evmcs->guest_sysenter_cs;
+ vmcs12->guest_ia32_perf_global_ctrl = evmcs->guest_ia32_perf_global_ctrl;
+ /*
+ * Not present in struct vmcs12:
+ * vmcs12->guest_ia32_s_cet = evmcs->guest_ia32_s_cet;
+ * vmcs12->guest_ia32_lbr_ctl = evmcs->guest_ia32_lbr_ctl;
+ * vmcs12->guest_ia32_int_ssp_table_addr = evmcs->guest_ia32_int_ssp_table_addr;
+ */
}
/*
@@ -1869,12 +1850,23 @@ static void copy_vmcs12_to_enlightened(struct vcpu_vmx *vmx)
* evmcs->vm_exit_msr_store_count = vmcs12->vm_exit_msr_store_count;
* evmcs->vm_exit_msr_load_count = vmcs12->vm_exit_msr_load_count;
* evmcs->vm_entry_msr_load_count = vmcs12->vm_entry_msr_load_count;
+ * evmcs->guest_ia32_perf_global_ctrl = vmcs12->guest_ia32_perf_global_ctrl;
+ * evmcs->host_ia32_perf_global_ctrl = vmcs12->host_ia32_perf_global_ctrl;
+ * evmcs->encls_exiting_bitmap = vmcs12->encls_exiting_bitmap;
+ * evmcs->tsc_multiplier = vmcs12->tsc_multiplier;
*
* Not present in struct vmcs12:
* evmcs->exit_io_instruction_ecx = vmcs12->exit_io_instruction_ecx;
* evmcs->exit_io_instruction_esi = vmcs12->exit_io_instruction_esi;
* evmcs->exit_io_instruction_edi = vmcs12->exit_io_instruction_edi;
* evmcs->exit_io_instruction_eip = vmcs12->exit_io_instruction_eip;
+ * evmcs->host_ia32_s_cet = vmcs12->host_ia32_s_cet;
+ * evmcs->host_ssp = vmcs12->host_ssp;
+ * evmcs->host_ia32_int_ssp_table_addr = vmcs12->host_ia32_int_ssp_table_addr;
+ * evmcs->guest_ia32_s_cet = vmcs12->guest_ia32_s_cet;
+ * evmcs->guest_ia32_lbr_ctl = vmcs12->guest_ia32_lbr_ctl;
+ * evmcs->guest_ia32_int_ssp_table_addr = vmcs12->guest_ia32_int_ssp_table_addr;
+ * evmcs->guest_ssp = vmcs12->guest_ssp;
*/
evmcs->guest_es_selector = vmcs12->guest_es_selector;
@@ -1982,7 +1974,7 @@ static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
- if (likely(!vmx->nested.enlightened_vmcs_enabled))
+ if (likely(!guest_cpuid_has_evmcs(vcpu)))
return EVMPTRLD_DISABLED;
if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa)) {
@@ -2328,9 +2320,14 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs0
* are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
* on the related bits (if supported by the CPU) in the hope that
* we can avoid VMWrites during vmx_set_efer().
+ *
+ * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
+ * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
+ * do the same for L2.
*/
exec_control = __vm_entry_controls_get(vmcs01);
- exec_control |= vmcs12->vm_entry_controls;
+ exec_control |= (vmcs12->vm_entry_controls &
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
@@ -2570,7 +2567,7 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
* bits which we consider mandatory enabled.
* The CR0_READ_SHADOW is what L2 should have expected to read given
* the specifications by L1; It's not enough to take
- * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
+ * vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we
* have more bits than L1 expected.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
@@ -2863,7 +2860,7 @@ static int nested_vmx_check_controls(struct kvm_vcpu *vcpu,
nested_check_vm_entry_controls(vcpu, vmcs12))
return -EINVAL;
- if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
+ if (guest_cpuid_has_evmcs(vcpu))
return nested_evmcs_check_controls(vmcs12);
return 0;
@@ -3145,7 +3142,7 @@ static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
* L2 was running), map it here to make sure vmcs12 changes are
* properly reflected.
*/
- if (vmx->nested.enlightened_vmcs_enabled &&
+ if (guest_cpuid_has_evmcs(vcpu) &&
vmx->nested.hv_evmcs_vmptr == EVMPTR_MAP_PENDING) {
enum nested_evmptrld_status evmptrld_status =
nested_vmx_handle_enlightened_vmptrld(vcpu, false);
@@ -3364,12 +3361,24 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
+ trace_kvm_nested_vmenter(kvm_rip_read(vcpu),
+ vmx->nested.current_vmptr,
+ vmcs12->guest_rip,
+ vmcs12->guest_intr_status,
+ vmcs12->vm_entry_intr_info_field,
+ vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_ENABLE_EPT,
+ vmcs12->ept_pointer,
+ vmcs12->guest_cr3,
+ KVM_ISA_VMX);
+
kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
+ if (!evaluate_pending_interrupts)
+ evaluate_pending_interrupts |= kvm_apic_has_pending_init_or_sipi(vcpu);
if (!vmx->nested.nested_run_pending ||
!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
@@ -3450,18 +3459,10 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
}
/*
- * If L1 had a pending IRQ/NMI until it executed
- * VMLAUNCH/VMRESUME which wasn't delivered because it was
- * disallowed (e.g. interrupts disabled), L0 needs to
- * evaluate if this pending event should cause an exit from L2
- * to L1 or delivered directly to L2 (e.g. In case L1 don't
- * intercept EXTERNAL_INTERRUPT).
- *
- * Usually this would be handled by the processor noticing an
- * IRQ/NMI window request, or checking RVI during evaluation of
- * pending virtual interrupts. However, this setting was done
- * on VMCS01 and now VMCS02 is active instead. Thus, we force L0
- * to perform pending event evaluation by requesting a KVM_REQ_EVENT.
+ * Re-evaluate pending events if L1 had a pending IRQ/NMI/INIT/SIPI
+ * when it executed VMLAUNCH/VMRESUME, as entering non-root mode can
+ * effectively unblock various events, e.g. INIT/SIPI cause VM-Exit
+ * unconditionally.
*/
if (unlikely(evaluate_pending_interrupts))
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -3718,7 +3719,7 @@ static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
is_double_fault(exit_intr_info))) {
vmcs12->idt_vectoring_info_field = 0;
} else if (vcpu->arch.exception.injected) {
- nr = vcpu->arch.exception.nr;
+ nr = vcpu->arch.exception.vector;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
if (kvm_exception_is_soft(nr)) {
@@ -3819,19 +3820,40 @@ mmio_needed:
return -ENXIO;
}
-static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
- unsigned long exit_qual)
+static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- unsigned int nr = vcpu->arch.exception.nr;
- u32 intr_info = nr | INTR_INFO_VALID_MASK;
+ unsigned long exit_qual;
+
+ if (ex->has_payload) {
+ exit_qual = ex->payload;
+ } else if (ex->vector == PF_VECTOR) {
+ exit_qual = vcpu->arch.cr2;
+ } else if (ex->vector == DB_VECTOR) {
+ exit_qual = vcpu->arch.dr6;
+ exit_qual &= ~DR6_BT;
+ exit_qual ^= DR6_ACTIVE_LOW;
+ } else {
+ exit_qual = 0;
+ }
- if (vcpu->arch.exception.has_error_code) {
- vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ if (ex->has_error_code) {
+ /*
+ * Intel CPUs do not generate error codes with bits 31:16 set,
+ * and more importantly VMX disallows setting bits 31:16 in the
+ * injected error code for VM-Entry. Drop the bits to mimic
+ * hardware and avoid inducing failure on nested VM-Entry if L1
+ * chooses to inject the exception back to L2. AMD CPUs _do_
+ * generate "full" 32-bit error codes, so KVM allows userspace
+ * to inject exception error codes with bits 31:16 set.
+ */
+ vmcs12->vm_exit_intr_error_code = (u16)ex->error_code;
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
- if (kvm_exception_is_soft(nr))
+ if (kvm_exception_is_soft(ex->vector))
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
else
intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -3844,16 +3866,39 @@ static void nested_vmx_inject_exception_vmexit(struct kvm_vcpu *vcpu,
}
/*
- * Returns true if a debug trap is pending delivery.
+ * Returns true if a debug trap is (likely) pending delivery. Infer the class
+ * of a #DB (trap-like vs. fault-like) from the exception payload (to-be-DR6).
+ * Using the payload is flawed because code breakpoints (fault-like) and data
+ * breakpoints (trap-like) set the same bits in DR6 (breakpoint detected), i.e.
+ * this will return false positives if a to-be-injected code breakpoint #DB is
+ * pending (from KVM's perspective, but not "pending" across an instruction
+ * boundary). ICEBP, a.k.a. INT1, is also not reflected here even though it
+ * too is trap-like.
*
- * In KVM, debug traps bear an exception payload. As such, the class of a #DB
- * exception may be inferred from the presence of an exception payload.
+ * KVM "works" despite these flaws as ICEBP isn't currently supported by the
+ * emulator, Monitor Trap Flag is not marked pending on intercepted #DBs (the
+ * #DB has already happened), and MTF isn't marked pending on code breakpoints
+ * from the emulator (because such #DBs are fault-like and thus don't trigger
+ * actions that fire on instruction retire).
+ */
+static unsigned long vmx_get_pending_dbg_trap(struct kvm_queued_exception *ex)
+{
+ if (!ex->pending || ex->vector != DB_VECTOR)
+ return 0;
+
+ /* General Detect #DBs are always fault-like. */
+ return ex->payload & ~DR6_BD;
+}
+
+/*
+ * Returns true if there's a pending #DB exception that is lower priority than
+ * a pending Monitor Trap Flag VM-Exit. TSS T-flag #DBs are not emulated by
+ * KVM, but could theoretically be injected by userspace. Note, this code is
+ * imperfect, see above.
*/
-static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
+static bool vmx_is_low_priority_db_trap(struct kvm_queued_exception *ex)
{
- return vcpu->arch.exception.pending &&
- vcpu->arch.exception.nr == DB_VECTOR &&
- vcpu->arch.exception.payload;
+ return vmx_get_pending_dbg_trap(ex) & ~DR6_BT;
}
/*
@@ -3865,9 +3910,11 @@ static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
*/
static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
{
- if (vmx_pending_dbg_trap(vcpu))
- vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
- vcpu->arch.exception.payload);
+ unsigned long pending_dbg;
+
+ pending_dbg = vmx_get_pending_dbg_trap(&vcpu->arch.exception);
+ if (pending_dbg)
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS, pending_dbg);
}
static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
@@ -3876,21 +3923,113 @@ static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
to_vmx(vcpu)->nested.preemption_timer_expired;
}
+static bool vmx_has_nested_events(struct kvm_vcpu *vcpu)
+{
+ return nested_vmx_preemption_timer_pending(vcpu) ||
+ to_vmx(vcpu)->nested.mtf_pending;
+}
+
+/*
+ * Per the Intel SDM's table "Priority Among Concurrent Events", with minor
+ * edits to fill in missing examples, e.g. #DB due to split-lock accesses,
+ * and less minor edits to splice in the priority of VMX Non-Root specific
+ * events, e.g. MTF and NMI/INTR-window exiting.
+ *
+ * 1 Hardware Reset and Machine Checks
+ * - RESET
+ * - Machine Check
+ *
+ * 2 Trap on Task Switch
+ * - T flag in TSS is set (on task switch)
+ *
+ * 3 External Hardware Interventions
+ * - FLUSH
+ * - STOPCLK
+ * - SMI
+ * - INIT
+ *
+ * 3.5 Monitor Trap Flag (MTF) VM-exit[1]
+ *
+ * 4 Traps on Previous Instruction
+ * - Breakpoints
+ * - Trap-class Debug Exceptions (#DB due to TF flag set, data/I-O
+ * breakpoint, or #DB due to a split-lock access)
+ *
+ * 4.3 VMX-preemption timer expired VM-exit
+ *
+ * 4.6 NMI-window exiting VM-exit[2]
+ *
+ * 5 Nonmaskable Interrupts (NMI)
+ *
+ * 5.5 Interrupt-window exiting VM-exit and Virtual-interrupt delivery
+ *
+ * 6 Maskable Hardware Interrupts
+ *
+ * 7 Code Breakpoint Fault
+ *
+ * 8 Faults from Fetching Next Instruction
+ * - Code-Segment Limit Violation
+ * - Code Page Fault
+ * - Control protection exception (missing ENDBRANCH at target of indirect
+ * call or jump)
+ *
+ * 9 Faults from Decoding Next Instruction
+ * - Instruction length > 15 bytes
+ * - Invalid Opcode
+ * - Coprocessor Not Available
+ *
+ *10 Faults on Executing Instruction
+ * - Overflow
+ * - Bound error
+ * - Invalid TSS
+ * - Segment Not Present
+ * - Stack fault
+ * - General Protection
+ * - Data Page Fault
+ * - Alignment Check
+ * - x86 FPU Floating-point exception
+ * - SIMD floating-point exception
+ * - Virtualization exception
+ * - Control protection exception
+ *
+ * [1] Per the "Monitor Trap Flag" section: System-management interrupts (SMIs),
+ * INIT signals, and higher priority events take priority over MTF VM exits.
+ * MTF VM exits take priority over debug-trap exceptions and lower priority
+ * events.
+ *
+ * [2] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by the VMX-preemption timer. VM exits caused by the VMX-preemption
+ * timer take priority over VM exits caused by the "NMI-window exiting"
+ * VM-execution control and lower priority events.
+ *
+ * [3] Debug-trap exceptions and higher priority events take priority over VM exits
+ * caused by "NMI-window exiting". VM exits caused by this control take
+ * priority over non-maskable interrupts (NMIs) and lower priority events.
+ *
+ * [4] Virtual-interrupt delivery has the same priority as that of VM exits due to
+ * the 1-setting of the "interrupt-window exiting" VM-execution control. Thus,
+ * non-maskable interrupts (NMIs) and higher priority events take priority over
+ * delivery of a virtual interrupt; delivery of a virtual interrupt takes
+ * priority over external interrupts and lower priority events.
+ */
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qual;
- bool block_nested_events =
- vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
- bool mtf_pending = vmx->nested.mtf_pending;
struct kvm_lapic *apic = vcpu->arch.apic;
-
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * Clear the MTF state. If a higher priority VM-exit is delivered first,
- * this state is discarded.
+ * Only a pending nested run blocks a pending exception. If there is a
+ * previously injected event, the pending exception occurred while said
+ * event was being delivered and thus needs to be handled.
*/
- if (!block_nested_events)
- vmx->nested.mtf_pending = false;
+ bool block_nested_exceptions = vmx->nested.nested_run_pending;
+ /*
+ * New events (not exceptions) are only recognized at instruction
+ * boundaries. If an event needs reinjection, then KVM is handling a
+ * VM-Exit that occurred _during_ instruction execution; new events are
+ * blocked until the instruction completes.
+ */
+ bool block_nested_events = block_nested_exceptions ||
+ kvm_event_needs_reinjection(vcpu);
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
@@ -3900,6 +4039,9 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
clear_bit(KVM_APIC_INIT, &apic->pending_events);
if (vcpu->arch.mp_state != KVM_MP_STATE_INIT_RECEIVED)
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
+
+ /* MTF is discarded if the vCPU is in WFS. */
+ vmx->nested.mtf_pending = false;
return 0;
}
@@ -3909,31 +4051,41 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
return -EBUSY;
clear_bit(KVM_APIC_SIPI, &apic->pending_events);
- if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
+ if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
nested_vmx_vmexit(vcpu, EXIT_REASON_SIPI_SIGNAL, 0,
apic->sipi_vector & 0xFFUL);
- return 0;
+ return 0;
+ }
+ /* Fallthrough, the SIPI is completely ignored. */
}
/*
- * Process any exceptions that are not debug traps before MTF.
+ * Process exceptions that are higher priority than Monitor Trap Flag:
+ * fault-like exceptions, TSS T flag #DB (not emulated by KVM, but
+ * could theoretically come in from userspace), and ICEBP (INT1).
*
- * Note that only a pending nested run can block a pending exception.
- * Otherwise an injected NMI/interrupt should either be
- * lost or delivered to the nested hypervisor in the IDT_VECTORING_INFO,
- * while delivering the pending exception.
+ * TODO: SMIs have higher priority than MTF and trap-like #DBs (except
+ * for TSS T flag #DBs). KVM also doesn't save/restore pending MTF
+ * across SMI/RSM as it should; that needs to be addressed in order to
+ * prioritize SMI over MTF and trap-like #DBs.
*/
-
- if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
- if (vmx->nested.nested_run_pending)
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception_vmexit)) {
+ if (block_nested_exceptions)
return -EBUSY;
- if (!nested_vmx_check_exception(vcpu, &exit_qual))
- goto no_vmexit;
- nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+
+ nested_vmx_inject_exception_vmexit(vcpu);
return 0;
}
- if (mtf_pending) {
+ if (vcpu->arch.exception.pending &&
+ !vmx_is_low_priority_db_trap(&vcpu->arch.exception)) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (vmx->nested.mtf_pending) {
if (block_nested_events)
return -EBUSY;
nested_vmx_update_pending_dbg(vcpu);
@@ -3941,15 +4093,20 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
return 0;
}
- if (vcpu->arch.exception.pending) {
- if (vmx->nested.nested_run_pending)
+ if (vcpu->arch.exception_vmexit.pending) {
+ if (block_nested_exceptions)
return -EBUSY;
- if (!nested_vmx_check_exception(vcpu, &exit_qual))
- goto no_vmexit;
- nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+
+ nested_vmx_inject_exception_vmexit(vcpu);
return 0;
}
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_exceptions)
+ return -EBUSY;
+ goto no_vmexit;
+ }
+
if (nested_vmx_preemption_timer_pending(vcpu)) {
if (block_nested_events)
return -EBUSY;
@@ -4255,14 +4412,6 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
nested_vmx_abort(vcpu,
VMX_ABORT_SAVE_GUEST_MSR_FAIL);
}
-
- /*
- * Drop what we picked up for L2 via vmx_complete_interrupts. It is
- * preserved above and would only end up incorrectly in L1.
- */
- vcpu->arch.nmi_injected = false;
- kvm_clear_exception_queue(vcpu);
- kvm_clear_interrupt_queue(vcpu);
}
/*
@@ -4538,6 +4687,9 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ /* Pending MTF traps are discarded on VM-Exit. */
+ vmx->nested.mtf_pending = false;
+
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
@@ -4602,6 +4754,17 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
WARN_ON_ONCE(nested_early_check);
}
+ /*
+ * Drop events/exceptions that were queued for re-injection to L2
+ * (picked up via vmx_complete_interrupts()), as well as exceptions
+ * that were pending for L2. Note, this must NOT be hoisted above
+ * prepare_vmcs12(), events/exceptions queued for re-injection need to
+ * be captured in vmcs12 (see vmcs12_save_pending_event()).
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
/* Update any VMCS fields that might have changed while L2 ran */
@@ -5030,8 +5193,8 @@ static int handle_vmxoff(struct kvm_vcpu *vcpu)
free_nested(vcpu);
- /* Process a latched INIT during time CPU was in VMX operation */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
+ if (kvm_apic_has_pending_init_or_sipi(vcpu))
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
return nested_vmx_succeed(vcpu);
}
@@ -5067,7 +5230,7 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
* state. It is possible that the area will stay mapped as
* vmx->nested.hv_evmcs but this shouldn't be a problem.
*/
- if (likely(!vmx->nested.enlightened_vmcs_enabled ||
+ if (likely(!guest_cpuid_has_evmcs(vcpu) ||
!nested_enlightened_vmentry(vcpu, &evmcs_gpa))) {
if (vmptr == vmx->nested.current_vmptr)
nested_release_vmcs12(vcpu);
@@ -6463,6 +6626,9 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
if (ret)
goto error_guest_mode;
+ if (vmx->nested.mtf_pending)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+
return 0;
error_guest_mode:
@@ -6522,8 +6688,10 @@ static u64 nested_vmx_calc_vmcs_enum_msr(void)
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
*/
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps)
{
+ struct nested_vmx_msrs *msrs = &vmcs_conf->nested;
+
/*
* Note that as a general rule, the high half of the MSRs (bits in
* the control fields which may be 1) should be initialized by the
@@ -6540,11 +6708,10 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
*/
/* pin-based controls */
- rdmsr(MSR_IA32_VMX_PINBASED_CTLS,
- msrs->pinbased_ctls_low,
- msrs->pinbased_ctls_high);
- msrs->pinbased_ctls_low |=
+ msrs->pinbased_ctls_low =
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->pinbased_ctls_high = vmcs_conf->pin_based_exec_ctrl;
msrs->pinbased_ctls_high &=
PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING |
@@ -6555,50 +6722,47 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
PIN_BASED_VMX_PREEMPTION_TIMER;
/* exit controls */
- rdmsr(MSR_IA32_VMX_EXIT_CTLS,
- msrs->exit_ctls_low,
- msrs->exit_ctls_high);
msrs->exit_ctls_low =
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR;
+ msrs->exit_ctls_high = vmcs_conf->vmexit_ctrl;
msrs->exit_ctls_high &=
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
- VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ VM_EXIT_CLEAR_BNDCFGS;
msrs->exit_ctls_high |=
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
- VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT;
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | VM_EXIT_ACK_INTR_ON_EXIT |
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
/* We support free control of debug control saving. */
msrs->exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS;
/* entry controls */
- rdmsr(MSR_IA32_VMX_ENTRY_CTLS,
- msrs->entry_ctls_low,
- msrs->entry_ctls_high);
msrs->entry_ctls_low =
VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->entry_ctls_high = vmcs_conf->vmentry_ctrl;
msrs->entry_ctls_high &=
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
- VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
- VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS;
msrs->entry_ctls_high |=
- (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
+ (VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
/* We support free control of debug control loading. */
msrs->entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS;
/* cpu-based controls */
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS,
- msrs->procbased_ctls_low,
- msrs->procbased_ctls_high);
msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
+
+ msrs->procbased_ctls_high = vmcs_conf->cpu_based_exec_ctrl;
msrs->procbased_ctls_high &=
CPU_BASED_INTR_WINDOW_EXITING |
CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
@@ -6632,12 +6796,9 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
* depend on CPUID bits, they are added later by
* vmx_vcpu_after_set_cpuid.
*/
- if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
- rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
- msrs->secondary_ctls_low,
- msrs->secondary_ctls_high);
-
msrs->secondary_ctls_low = 0;
+
+ msrs->secondary_ctls_high = vmcs_conf->cpu_based_2nd_exec_ctrl;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
SECONDARY_EXEC_ENABLE_RDTSCP |
@@ -6717,10 +6878,7 @@ void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
msrs->secondary_ctls_high |= SECONDARY_EXEC_ENCLS_EXITING;
/* miscellaneous data */
- rdmsr(MSR_IA32_VMX_MISC,
- msrs->misc_low,
- msrs->misc_high);
- msrs->misc_low &= VMX_MISC_SAVE_EFER_LMA;
+ msrs->misc_low = (u32)vmcs_conf->misc & VMX_MISC_SAVE_EFER_LMA;
msrs->misc_low |=
MSR_IA32_VMX_MISC_VMWRITE_SHADOW_RO_FIELDS |
VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE |
@@ -6814,9 +6972,9 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
struct kvm_x86_nested_ops vmx_nested_ops = {
.leave_nested = vmx_leave_nested,
+ .is_exception_vmexit = nested_vmx_is_exception_vmexit,
.check_events = vmx_check_nested_events,
- .handle_page_fault_workaround = nested_vmx_handle_page_fault_workaround,
- .hv_timer_pending = nested_vmx_preemption_timer_pending,
+ .has_events = vmx_has_nested_events,
.triple_fault = nested_vmx_triple_fault,
.get_state = vmx_get_nested_state,
.set_state = vmx_set_nested_state,
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 88b00a7359e4..6312c9541c3c 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -17,7 +17,7 @@ enum nvmx_vmentry_status {
};
void vmx_leave_nested(struct kvm_vcpu *vcpu);
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
+void nested_vmx_setup_ctls_msrs(struct vmcs_config *vmcs_conf, u32 ept_caps);
void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
void nested_vmx_set_vmcs_shadowing_bitmap(void);
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index aba8cebdc587..8f95c7c01433 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -129,7 +129,7 @@ static int sgx_inject_fault(struct kvm_vcpu *vcpu, gva_t gva, int trapnr)
ex.address = gva;
ex.error_code_valid = true;
ex.nested_page_fault = false;
- kvm_inject_page_fault(vcpu, &ex);
+ kvm_inject_emulated_page_fault(vcpu, &ex);
} else {
kvm_inject_gp(vcpu, 0);
}
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 6de96b943804..8477d8bdd69c 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -189,13 +189,16 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
xor %ebx, %ebx
.Lclear_regs:
+ /* Discard @regs. The register is irrelevant, it just can't be RBX. */
+ pop %_ASM_AX
+
/*
* Clear all general purpose registers except RSP and RBX to prevent
* speculative use of the guest's values, even those that are reloaded
* via the stack. In theory, an L1 cache miss when restoring registers
* could lead to speculative execution with the guest's values.
* Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
- * free. RSP and RAX are exempt as RSP is restored by hardware during
+ * free. RSP and RBX are exempt as RSP is restored by hardware during
* VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return
* value.
*/
@@ -216,9 +219,6 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
xor %r15d, %r15d
#endif
- /* "POP" @regs. */
- add $WORD_SIZE, %_ASM_SP
-
/*
* IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
* the first unbalanced RET after vmexit!
@@ -234,7 +234,6 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
X86_FEATURE_RSB_VMEXIT_LITE
-
pop %_ASM_ARG2 /* @flags */
pop %_ASM_ARG1 /* @vmx */
@@ -293,22 +292,13 @@ SYM_FUNC_START(vmread_error_trampoline)
push %r10
push %r11
#endif
-#ifdef CONFIG_X86_64
+
/* Load @field and @fault to arg1 and arg2 respectively. */
- mov 3*WORD_SIZE(%rbp), %_ASM_ARG2
- mov 2*WORD_SIZE(%rbp), %_ASM_ARG1
-#else
- /* Parameters are passed on the stack for 32-bit (see asmlinkage). */
- push 3*WORD_SIZE(%ebp)
- push 2*WORD_SIZE(%ebp)
-#endif
+ mov 3*WORD_SIZE(%_ASM_BP), %_ASM_ARG2
+ mov 2*WORD_SIZE(%_ASM_BP), %_ASM_ARG1
call vmread_error
-#ifndef CONFIG_X86_64
- add $8, %esp
-#endif
-
/* Zero out @fault, which will be popped into the result register. */
_ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index c9b49a09e6b5..9dba04b6b019 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -439,7 +439,7 @@ do { \
pr_warn_ratelimited(fmt); \
} while (0)
-asmlinkage void vmread_error(unsigned long field, bool fault)
+void vmread_error(unsigned long field, bool fault)
{
if (fault)
kvm_spurious_fault();
@@ -864,7 +864,7 @@ unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
return flags;
}
-static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
vm_entry_controls_clearbit(vmx, entry);
@@ -922,7 +922,7 @@ skip_guest:
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, m->host.nr);
}
-static void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
+static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit,
unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
u64 guest_val, u64 host_val)
@@ -1652,17 +1652,25 @@ static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
/*
* Per the SDM, MTF takes priority over debug-trap exceptions besides
- * T-bit traps. As instruction emulation is completed (i.e. at the
- * instruction boundary), any #DB exception pending delivery must be a
- * debug-trap. Record the pending MTF state to be delivered in
+ * TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
+ * or ICEBP (in the emulator proper), and skipping of ICEBP after an
+ * intercepted #DB deliberately avoids single-step #DB and MTF updates
+ * as ICEBP is higher priority than both. As instruction emulation is
+ * completed at this point (i.e. KVM is at the instruction boundary),
+ * any #DB exception pending delivery must be a debug-trap of lower
+ * priority than MTF. Record the pending MTF state to be delivered in
* vmx_check_nested_events().
*/
if (nested_cpu_has_mtf(vmcs12) &&
(!vcpu->arch.exception.pending ||
- vcpu->arch.exception.nr == DB_VECTOR))
+ vcpu->arch.exception.vector == DB_VECTOR) &&
+ (!vcpu->arch.exception_vmexit.pending ||
+ vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
vmx->nested.mtf_pending = true;
- else
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ } else {
vmx->nested.mtf_pending = false;
+ }
}
static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
@@ -1684,32 +1692,40 @@ static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
}
-static void vmx_queue_exception(struct kvm_vcpu *vcpu)
+static void vmx_inject_exception(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
+ u32 intr_info = ex->vector | INTR_INFO_VALID_MASK;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned nr = vcpu->arch.exception.nr;
- bool has_error_code = vcpu->arch.exception.has_error_code;
- u32 error_code = vcpu->arch.exception.error_code;
- u32 intr_info = nr | INTR_INFO_VALID_MASK;
- kvm_deliver_exception_payload(vcpu);
+ kvm_deliver_exception_payload(vcpu, ex);
- if (has_error_code) {
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ if (ex->has_error_code) {
+ /*
+ * Despite the error code being architecturally defined as 32
+ * bits, and the VMCS field being 32 bits, Intel CPUs and thus
+ * VMX don't actually supporting setting bits 31:16. Hardware
+ * will (should) never provide a bogus error code, but AMD CPUs
+ * do generate error codes with bits 31:16 set, and so KVM's
+ * ABI lets userspace shove in arbitrary 32-bit values. Drop
+ * the upper bits to avoid VM-Fail, losing information that
+ * does't really exist is preferable to killing the VM.
+ */
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)ex->error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
if (vmx->rmode.vm86_active) {
int inc_eip = 0;
- if (kvm_exception_is_soft(nr))
+ if (kvm_exception_is_soft(ex->vector))
inc_eip = vcpu->arch.event_exit_inst_len;
- kvm_inject_realmode_interrupt(vcpu, nr, inc_eip);
+ kvm_inject_realmode_interrupt(vcpu, ex->vector, inc_eip);
return;
}
WARN_ON_ONCE(vmx->emulation_required);
- if (kvm_exception_is_soft(nr)) {
+ if (kvm_exception_is_soft(ex->vector)) {
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmx->vcpu.arch.event_exit_inst_len);
intr_info |= INTR_TYPE_SOFT_EXCEPTION;
@@ -1930,9 +1946,8 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
* sanity checking and refuse to boot. Filter all unsupported
* features out.
*/
- if (!msr_info->host_initiated &&
- vmx->nested.enlightened_vmcs_enabled)
- nested_evmcs_filter_control_msr(msr_info->index,
+ if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
+ nested_evmcs_filter_control_msr(vcpu, msr_info->index,
&msr_info->data);
break;
case MSR_IA32_RTIT_CTL:
@@ -2494,6 +2509,30 @@ static bool cpu_has_sgx(void)
return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
}
+/*
+ * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
+ * can't be used due to errata where VM Exit may incorrectly clear
+ * IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
+ * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
+ */
+static bool cpu_has_perf_global_ctrl_bug(void)
+{
+ if (boot_cpu_data.x86 == 0x6) {
+ switch (boot_cpu_data.x86_model) {
+ case INTEL_FAM6_NEHALEM_EP: /* AAK155 */
+ case INTEL_FAM6_NEHALEM: /* AAP115 */
+ case INTEL_FAM6_WESTMERE: /* AAT100 */
+ case INTEL_FAM6_WESTMERE_EP: /* BC86,AAY89,BD102 */
+ case INTEL_FAM6_NEHALEM_EX: /* BA97 */
+ return true;
+ default:
+ break;
+ }
+ }
+
+ return false;
+}
+
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
@@ -2526,13 +2565,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
struct vmx_capability *vmx_cap)
{
u32 vmx_msr_low, vmx_msr_high;
- u32 min, opt, min2, opt2;
u32 _pin_based_exec_control = 0;
u32 _cpu_based_exec_control = 0;
u32 _cpu_based_2nd_exec_control = 0;
u64 _cpu_based_3rd_exec_control = 0;
u32 _vmexit_control = 0;
u32 _vmentry_control = 0;
+ u64 misc_msr;
int i;
/*
@@ -2552,64 +2591,17 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
};
memset(vmcs_conf, 0, sizeof(*vmcs_conf));
- min = CPU_BASED_HLT_EXITING |
-#ifdef CONFIG_X86_64
- CPU_BASED_CR8_LOAD_EXITING |
- CPU_BASED_CR8_STORE_EXITING |
-#endif
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_UNCOND_IO_EXITING |
- CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETTING |
- CPU_BASED_MWAIT_EXITING |
- CPU_BASED_MONITOR_EXITING |
- CPU_BASED_INVLPG_EXITING |
- CPU_BASED_RDPMC_EXITING;
-
- opt = CPU_BASED_TPR_SHADOW |
- CPU_BASED_USE_MSR_BITMAPS |
- CPU_BASED_ACTIVATE_SECONDARY_CONTROLS |
- CPU_BASED_ACTIVATE_TERTIARY_CONTROLS;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
- &_cpu_based_exec_control) < 0)
+
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PROCBASED_CTLS,
+ &_cpu_based_exec_control))
return -EIO;
-#ifdef CONFIG_X86_64
- if (_cpu_based_exec_control & CPU_BASED_TPR_SHADOW)
- _cpu_based_exec_control &= ~CPU_BASED_CR8_LOAD_EXITING &
- ~CPU_BASED_CR8_STORE_EXITING;
-#endif
if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
- min2 = 0;
- opt2 = SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
- SECONDARY_EXEC_WBINVD_EXITING |
- SECONDARY_EXEC_ENABLE_VPID |
- SECONDARY_EXEC_ENABLE_EPT |
- SECONDARY_EXEC_UNRESTRICTED_GUEST |
- SECONDARY_EXEC_PAUSE_LOOP_EXITING |
- SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_ENABLE_RDTSCP |
- SECONDARY_EXEC_ENABLE_INVPCID |
- SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
- SECONDARY_EXEC_SHADOW_VMCS |
- SECONDARY_EXEC_XSAVES |
- SECONDARY_EXEC_RDSEED_EXITING |
- SECONDARY_EXEC_RDRAND_EXITING |
- SECONDARY_EXEC_ENABLE_PML |
- SECONDARY_EXEC_TSC_SCALING |
- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
- SECONDARY_EXEC_PT_USE_GPA |
- SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_BUS_LOCK_DETECTION |
- SECONDARY_EXEC_NOTIFY_VM_EXITING;
- if (cpu_has_sgx())
- opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
- if (adjust_vmx_controls(min2, opt2,
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
MSR_IA32_VMX_PROCBASED_CTLS2,
- &_cpu_based_2nd_exec_control) < 0)
+ &_cpu_based_2nd_exec_control))
return -EIO;
}
#ifndef CONFIG_X86_64
@@ -2627,13 +2619,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
&vmx_cap->ept, &vmx_cap->vpid);
- if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
- /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
- enabled */
- _cpu_based_exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_INVLPG_EXITING);
- } else if (vmx_cap->ept) {
+ if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
+ vmx_cap->ept) {
pr_warn_once("EPT CAP should not exist if not support "
"1-setting enable EPT VM-execution control\n");
@@ -2653,32 +2640,24 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmx_cap->vpid = 0;
}
- if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS) {
- u64 opt3 = TERTIARY_EXEC_IPI_VIRT;
+ if (!cpu_has_sgx())
+ _cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
- _cpu_based_3rd_exec_control = adjust_vmx_controls64(opt3,
+ if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+ _cpu_based_3rd_exec_control =
+ adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
MSR_IA32_VMX_PROCBASED_CTLS3);
- }
- min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
-#ifdef CONFIG_X86_64
- min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-#endif
- opt = VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL |
- VM_EXIT_LOAD_IA32_PAT |
- VM_EXIT_LOAD_IA32_EFER |
- VM_EXIT_CLEAR_BNDCFGS |
- VM_EXIT_PT_CONCEAL_PIP |
- VM_EXIT_CLEAR_IA32_RTIT_CTL;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
- &_vmexit_control) < 0)
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
+ MSR_IA32_VMX_EXIT_CTLS,
+ &_vmexit_control))
return -EIO;
- min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
- PIN_BASED_VMX_PREEMPTION_TIMER;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
- &_pin_based_exec_control) < 0)
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
+ MSR_IA32_VMX_PINBASED_CTLS,
+ &_pin_based_exec_control))
return -EIO;
if (cpu_has_broken_vmx_preemption_timer())
@@ -2687,15 +2666,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
- min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
- opt = VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
- VM_ENTRY_LOAD_IA32_PAT |
- VM_ENTRY_LOAD_IA32_EFER |
- VM_ENTRY_LOAD_BNDCFGS |
- VM_ENTRY_PT_CONCEAL_PIP |
- VM_ENTRY_LOAD_IA32_RTIT_CTL;
- if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_ENTRY_CTLS,
- &_vmentry_control) < 0)
+ if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
+ KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
+ MSR_IA32_VMX_ENTRY_CTLS,
+ &_vmentry_control))
return -EIO;
for (i = 0; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
@@ -2715,30 +2689,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
_vmexit_control &= ~x_ctrl;
}
- /*
- * Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
- * can't be used due to an errata where VM Exit may incorrectly clear
- * IA32_PERF_GLOBAL_CTRL[34:32]. Workaround the errata by using the
- * MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
- */
- if (boot_cpu_data.x86 == 0x6) {
- switch (boot_cpu_data.x86_model) {
- case 26: /* AAK155 */
- case 30: /* AAP115 */
- case 37: /* AAT100 */
- case 44: /* BC86,AAY89,BD102 */
- case 46: /* BA97 */
- _vmentry_control &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
- _vmexit_control &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
- pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
- "does not work properly. Using workaround\n");
- break;
- default:
- break;
- }
- }
-
-
rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
/* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
@@ -2755,6 +2705,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
if (((vmx_msr_high >> 18) & 15) != 6)
return -EIO;
+ rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
+
vmcs_conf->size = vmx_msr_high & 0x1fff;
vmcs_conf->basic_cap = vmx_msr_high & ~0x1fff;
@@ -2766,11 +2718,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf,
vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
-
-#if IS_ENABLED(CONFIG_HYPERV)
- if (enlightened_vmcs)
- evmcs_sanitize_exec_ctrls(vmcs_conf);
-#endif
+ vmcs_conf->misc = misc_msr;
return 0;
}
@@ -3037,10 +2985,15 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
return 0;
vcpu->arch.efer = efer;
+#ifdef CONFIG_X86_64
if (efer & EFER_LMA)
vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
else
vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
+#else
+ if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
+ return 1;
+#endif
vmx_setup_uret_msrs(vmx);
return 0;
@@ -4327,18 +4280,37 @@ static u32 vmx_vmentry_ctrl(void)
if (vmx_pt_mode_is_system())
vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
VM_ENTRY_LOAD_IA32_RTIT_CTL);
- /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
- return vmentry_ctrl &
- ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER);
+ /*
+ * IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
+ */
+ vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL |
+ VM_ENTRY_LOAD_IA32_EFER |
+ VM_ENTRY_IA32E_MODE);
+
+ if (cpu_has_perf_global_ctrl_bug())
+ vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+
+ return vmentry_ctrl;
}
static u32 vmx_vmexit_ctrl(void)
{
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
+ /*
+ * Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
+ * nested virtualization and thus allowed to be set in vmcs12.
+ */
+ vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER |
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
+
if (vmx_pt_mode_is_system())
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL);
+
+ if (cpu_has_perf_global_ctrl_bug())
+ vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
return vmexit_ctrl &
~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER);
@@ -4376,20 +4348,38 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
{
u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
+ /*
+ * Not used by KVM, but fully supported for nesting, i.e. are allowed in
+ * vmcs12 and propagated to vmcs02 when set in vmcs12.
+ */
+ exec_control &= ~(CPU_BASED_RDTSC_EXITING |
+ CPU_BASED_USE_IO_BITMAPS |
+ CPU_BASED_MONITOR_TRAP_FLAG |
+ CPU_BASED_PAUSE_EXITING);
+
+ /* INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically */
+ exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING);
+
if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
exec_control &= ~CPU_BASED_MOV_DR_EXITING;
- if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
+ if (!cpu_need_tpr_shadow(&vmx->vcpu))
exec_control &= ~CPU_BASED_TPR_SHADOW;
+
#ifdef CONFIG_X86_64
+ if (exec_control & CPU_BASED_TPR_SHADOW)
+ exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING |
+ CPU_BASED_CR8_STORE_EXITING);
+ else
exec_control |= CPU_BASED_CR8_STORE_EXITING |
CPU_BASED_CR8_LOAD_EXITING;
#endif
- }
- if (!enable_ept)
- exec_control |= CPU_BASED_CR3_STORE_EXITING |
- CPU_BASED_CR3_LOAD_EXITING |
- CPU_BASED_INVLPG_EXITING;
+ /* No need to intercept CR3 access or INVPLG when using EPT. */
+ if (enable_ept)
+ exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING |
+ CPU_BASED_CR3_STORE_EXITING |
+ CPU_BASED_INVLPG_EXITING);
if (kvm_mwait_in_guest(vmx->vcpu.kvm))
exec_control &= ~(CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING);
@@ -5155,8 +5145,10 @@ static int handle_exception_nmi(struct kvm_vcpu *vcpu)
* instruction. ICEBP generates a trap-like #DB, but
* despite its interception control being tied to #DB,
* is an instruction intercept, i.e. the VM-Exit occurs
- * on the ICEBP itself. Note, skipping ICEBP also
- * clears STI and MOVSS blocking.
+ * on the ICEBP itself. Use the inner "skip" helper to
+ * avoid single-step #DB and MTF updates, as ICEBP is
+ * higher priority. Note, skipping ICEBP still clears
+ * STI and MOVSS blocking.
*
* For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
* if single-step is enabled in RFLAGS and STI or MOVSS
@@ -5638,7 +5630,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
- trace_kvm_page_fault(gpa, exit_qualification);
+ trace_kvm_page_fault(vcpu, gpa, exit_qualification);
/* Is it a read fault? */
error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
@@ -5710,7 +5702,7 @@ static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
struct vcpu_vmx *vmx = to_vmx(vcpu);
return vmx->emulation_required && !vmx->rmode.vm86_active &&
- (vcpu->arch.exception.pending || vcpu->arch.exception.injected);
+ (kvm_is_exception_pending(vcpu) || vcpu->arch.exception.injected);
}
static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
@@ -7430,7 +7422,7 @@ static int __init vmx_check_processor_compat(void)
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
return -EIO;
if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
+ nested_vmx_setup_ctls_msrs(&vmcs_conf, vmx_cap.ept);
if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
smp_processor_id());
@@ -8070,7 +8062,7 @@ static struct kvm_x86_ops vmx_x86_ops __initdata = {
.patch_hypercall = vmx_patch_hypercall,
.inject_irq = vmx_inject_irq,
.inject_nmi = vmx_inject_nmi,
- .queue_exception = vmx_queue_exception,
+ .inject_exception = vmx_inject_exception,
.cancel_injection = vmx_cancel_injection,
.interrupt_allowed = vmx_interrupt_allowed,
.nmi_allowed = vmx_nmi_allowed,
@@ -8227,6 +8219,10 @@ static __init int hardware_setup(void)
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
+ if (cpu_has_perf_global_ctrl_bug())
+ pr_warn_once("kvm: VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
+ "does not work properly. Using workaround\n");
+
if (boot_cpu_has(X86_FEATURE_NX))
kvm_enable_efer_bits(EFER_NX);
@@ -8341,11 +8337,9 @@ static __init int hardware_setup(void)
if (enable_preemption_timer) {
u64 use_timer_freq = 5000ULL * 1000 * 1000;
- u64 vmx_msr;
- rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
cpu_preemption_timer_multi =
- vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
if (tsc_khz)
use_timer_freq = (u64)tsc_khz * 1000;
@@ -8381,8 +8375,7 @@ static __init int hardware_setup(void)
setup_default_sgx_lepubkeyhash();
if (nested) {
- nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
- vmx_capability.ept);
+ nested_vmx_setup_ctls_msrs(&vmcs_config, vmx_capability.ept);
r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
if (r)
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 24d58c2ffaa3..a3da84f4ea45 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -477,29 +477,145 @@ static inline u8 vmx_get_rvi(void)
return vmcs_read16(GUEST_INTR_STATUS) & 0xff;
}
-#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \
-static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \
-{ \
- if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
- vmcs_write##bits(uname, val); \
- vmx->loaded_vmcs->controls_shadow.lname = val; \
- } \
-} \
-static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \
-{ \
- return vmcs->controls_shadow.lname; \
-} \
-static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \
-{ \
- return __##lname##_controls_get(vmx->loaded_vmcs); \
-} \
-static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \
-{ \
- lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
-} \
-static inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \
-{ \
- lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
+#define __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS)
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ (__KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS | \
+ VM_ENTRY_IA32E_MODE)
+#else
+ #define KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS \
+ __KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS
+#endif
+#define KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS \
+ (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_ENTRY_LOAD_IA32_PAT | \
+ VM_ENTRY_LOAD_IA32_EFER | \
+ VM_ENTRY_LOAD_BNDCFGS | \
+ VM_ENTRY_PT_CONCEAL_PIP | \
+ VM_ENTRY_LOAD_IA32_RTIT_CTL)
+
+#define __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ (VM_EXIT_SAVE_DEBUG_CONTROLS | \
+ VM_EXIT_ACK_INTR_ON_EXIT)
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ (__KVM_REQUIRED_VMX_VM_EXIT_CONTROLS | \
+ VM_EXIT_HOST_ADDR_SPACE_SIZE)
+#else
+ #define KVM_REQUIRED_VMX_VM_EXIT_CONTROLS \
+ __KVM_REQUIRED_VMX_VM_EXIT_CONTROLS
+#endif
+#define KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_IA32_PAT | \
+ VM_EXIT_LOAD_IA32_PAT | \
+ VM_EXIT_SAVE_IA32_EFER | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER | \
+ VM_EXIT_LOAD_IA32_EFER | \
+ VM_EXIT_CLEAR_BNDCFGS | \
+ VM_EXIT_PT_CONCEAL_PIP | \
+ VM_EXIT_CLEAR_IA32_RTIT_CTL)
+
+#define KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL \
+ (PIN_BASED_EXT_INTR_MASK | \
+ PIN_BASED_NMI_EXITING)
+#define KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL \
+ (PIN_BASED_VIRTUAL_NMIS | \
+ PIN_BASED_POSTED_INTR | \
+ PIN_BASED_VMX_PREEMPTION_TIMER)
+
+#define __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (CPU_BASED_HLT_EXITING | \
+ CPU_BASED_CR3_LOAD_EXITING | \
+ CPU_BASED_CR3_STORE_EXITING | \
+ CPU_BASED_UNCOND_IO_EXITING | \
+ CPU_BASED_MOV_DR_EXITING | \
+ CPU_BASED_USE_TSC_OFFSETTING | \
+ CPU_BASED_MWAIT_EXITING | \
+ CPU_BASED_MONITOR_EXITING | \
+ CPU_BASED_INVLPG_EXITING | \
+ CPU_BASED_RDPMC_EXITING | \
+ CPU_BASED_INTR_WINDOW_EXITING)
+
+#ifdef CONFIG_X86_64
+ #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (__KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL | \
+ CPU_BASED_CR8_LOAD_EXITING | \
+ CPU_BASED_CR8_STORE_EXITING)
+#else
+ #define KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ __KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL
+#endif
+
+#define KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL \
+ (CPU_BASED_RDTSC_EXITING | \
+ CPU_BASED_TPR_SHADOW | \
+ CPU_BASED_USE_IO_BITMAPS | \
+ CPU_BASED_MONITOR_TRAP_FLAG | \
+ CPU_BASED_USE_MSR_BITMAPS | \
+ CPU_BASED_NMI_WINDOW_EXITING | \
+ CPU_BASED_PAUSE_EXITING | \
+ CPU_BASED_ACTIVATE_SECONDARY_CONTROLS | \
+ CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
+
+#define KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL 0
+#define KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL \
+ (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | \
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | \
+ SECONDARY_EXEC_WBINVD_EXITING | \
+ SECONDARY_EXEC_ENABLE_VPID | \
+ SECONDARY_EXEC_ENABLE_EPT | \
+ SECONDARY_EXEC_UNRESTRICTED_GUEST | \
+ SECONDARY_EXEC_PAUSE_LOOP_EXITING | \
+ SECONDARY_EXEC_DESC | \
+ SECONDARY_EXEC_ENABLE_RDTSCP | \
+ SECONDARY_EXEC_ENABLE_INVPCID | \
+ SECONDARY_EXEC_APIC_REGISTER_VIRT | \
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | \
+ SECONDARY_EXEC_SHADOW_VMCS | \
+ SECONDARY_EXEC_XSAVES | \
+ SECONDARY_EXEC_RDSEED_EXITING | \
+ SECONDARY_EXEC_RDRAND_EXITING | \
+ SECONDARY_EXEC_ENABLE_PML | \
+ SECONDARY_EXEC_TSC_SCALING | \
+ SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE | \
+ SECONDARY_EXEC_PT_USE_GPA | \
+ SECONDARY_EXEC_PT_CONCEAL_VMX | \
+ SECONDARY_EXEC_ENABLE_VMFUNC | \
+ SECONDARY_EXEC_BUS_LOCK_DETECTION | \
+ SECONDARY_EXEC_NOTIFY_VM_EXITING | \
+ SECONDARY_EXEC_ENCLS_EXITING)
+
+#define KVM_REQUIRED_VMX_TERTIARY_VM_EXEC_CONTROL 0
+#define KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL \
+ (TERTIARY_EXEC_IPI_VIRT)
+
+#define BUILD_CONTROLS_SHADOW(lname, uname, bits) \
+static inline void lname##_controls_set(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ if (vmx->loaded_vmcs->controls_shadow.lname != val) { \
+ vmcs_write##bits(uname, val); \
+ vmx->loaded_vmcs->controls_shadow.lname = val; \
+ } \
+} \
+static inline u##bits __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
+static inline u##bits lname##_controls_get(struct vcpu_vmx *vmx) \
+{ \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
+} \
+static __always_inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) | val); \
+} \
+static __always_inline void lname##_controls_clearbit(struct vcpu_vmx *vmx, u##bits val) \
+{ \
+ BUILD_BUG_ON(!(val & (KVM_REQUIRED_VMX_##uname | KVM_OPTIONAL_VMX_##uname))); \
+ lname##_controls_set(vmx, lname##_controls_get(vmx) & ~val); \
}
BUILD_CONTROLS_SHADOW(vm_entry, VM_ENTRY_CONTROLS, 32)
BUILD_CONTROLS_SHADOW(vm_exit, VM_EXIT_CONTROLS, 32)
@@ -626,4 +742,14 @@ static inline bool vmx_can_use_ipiv(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && enable_ipiv;
}
+static inline bool guest_cpuid_has_evmcs(struct kvm_vcpu *vcpu)
+{
+ /*
+ * eVMCS is exposed to the guest if Hyper-V is enabled in CPUID and
+ * eVMCS has been explicitly enabled by userspace.
+ */
+ return vcpu->arch.hyperv_enabled &&
+ to_vmx(vcpu)->nested.enlightened_vmcs_enabled;
+}
+
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h
index 5cfc49ddb1b4..ec268df83ed6 100644
--- a/arch/x86/kvm/vmx/vmx_ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -10,7 +10,7 @@
#include "vmcs.h"
#include "../x86.h"
-asmlinkage void vmread_error(unsigned long field, bool fault);
+void vmread_error(unsigned long field, bool fault);
__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b0c47b41c264..4bd5f8a751de 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -173,8 +173,13 @@ bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, S_IRUGO);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
-static bool __read_mostly force_emulation_prefix = false;
-module_param(force_emulation_prefix, bool, S_IRUGO);
+/*
+ * Flags to manipulate forced emulation behavior (any non-zero value will
+ * enable forced emulation).
+ */
+#define KVM_FEP_CLEAR_RFLAGS_RF BIT(1)
+static int __read_mostly force_emulation_prefix;
+module_param(force_emulation_prefix, int, 0644);
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
@@ -528,6 +533,7 @@ static int exception_class(int vector)
#define EXCPT_TRAP 1
#define EXCPT_ABORT 2
#define EXCPT_INTERRUPT 3
+#define EXCPT_DB 4
static int exception_type(int vector)
{
@@ -538,8 +544,14 @@ static int exception_type(int vector)
mask = 1 << vector;
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ /*
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+ */
+ if (mask & (1 << DB_VECTOR))
+ return EXCPT_DB;
+
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
return EXCPT_TRAP;
if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
@@ -549,16 +561,13 @@ static int exception_type(int vector)
return EXCPT_FAULT;
}
-void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+ struct kvm_queued_exception *ex)
{
- unsigned nr = vcpu->arch.exception.nr;
- bool has_payload = vcpu->arch.exception.has_payload;
- unsigned long payload = vcpu->arch.exception.payload;
-
- if (!has_payload)
+ if (!ex->has_payload)
return;
- switch (nr) {
+ switch (ex->vector) {
case DB_VECTOR:
/*
* "Certain debug exceptions may clear bit 0-3. The
@@ -583,8 +592,8 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
* So they need to be flipped for DR6.
*/
vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
- vcpu->arch.dr6 |= payload;
- vcpu->arch.dr6 ^= payload & DR6_ACTIVE_LOW;
+ vcpu->arch.dr6 |= ex->payload;
+ vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
/*
* The #DB payload is defined as compatible with the 'pending
@@ -595,15 +604,30 @@ void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
vcpu->arch.dr6 &= ~BIT(12);
break;
case PF_VECTOR:
- vcpu->arch.cr2 = payload;
+ vcpu->arch.cr2 = ex->payload;
break;
}
- vcpu->arch.exception.has_payload = false;
- vcpu->arch.exception.payload = 0;
+ ex->has_payload = false;
+ ex->payload = 0;
}
EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
+static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
+ bool has_error_code, u32 error_code,
+ bool has_payload, unsigned long payload)
+{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;
+
+ ex->vector = vector;
+ ex->injected = false;
+ ex->pending = true;
+ ex->has_error_code = has_error_code;
+ ex->error_code = error_code;
+ ex->has_payload = has_payload;
+ ex->payload = payload;
+}
+
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
unsigned nr, bool has_error, u32 error_code,
bool has_payload, unsigned long payload, bool reinject)
@@ -613,18 +637,31 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
kvm_make_request(KVM_REQ_EVENT, vcpu);
+ /*
+ * If the exception is destined for L2 and isn't being reinjected,
+ * morph it to a VM-Exit if L1 wants to intercept the exception. A
+ * previously injected exception is not checked because it was checked
+ * when it was original queued, and re-checking is incorrect if _L1_
+ * injected the exception, in which case it's exempt from interception.
+ */
+ if (!reinject && is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
+ kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
+ has_payload, payload);
+ return;
+ }
+
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
if (reinject) {
/*
- * On vmentry, vcpu->arch.exception.pending is only
- * true if an event injection was blocked by
- * nested_run_pending. In that case, however,
- * vcpu_enter_guest requests an immediate exit,
- * and the guest shouldn't proceed far enough to
- * need reinjection.
+ * On VM-Entry, an exception can be pending if and only
+ * if event injection was blocked by nested_run_pending.
+ * In that case, however, vcpu_enter_guest() requests an
+ * immediate exit, and the guest shouldn't proceed far
+ * enough to need reinjection.
*/
- WARN_ON_ONCE(vcpu->arch.exception.pending);
+ WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
vcpu->arch.exception.injected = true;
if (WARN_ON_ONCE(has_payload)) {
/*
@@ -639,17 +676,18 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
vcpu->arch.exception.injected = false;
}
vcpu->arch.exception.has_error_code = has_error;
- vcpu->arch.exception.nr = nr;
+ vcpu->arch.exception.vector = nr;
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = has_payload;
vcpu->arch.exception.payload = payload;
if (!is_guest_mode(vcpu))
- kvm_deliver_exception_payload(vcpu);
+ kvm_deliver_exception_payload(vcpu,
+ &vcpu->arch.exception);
return;
}
/* to check exception */
- prev_nr = vcpu->arch.exception.nr;
+ prev_nr = vcpu->arch.exception.vector;
if (prev_nr == DF_VECTOR) {
/* triple fault -> shutdown */
kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
@@ -657,25 +695,22 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
}
class1 = exception_class(prev_nr);
class2 = exception_class(nr);
- if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
- || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
+ if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
+ (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
/*
- * Generate double fault per SDM Table 5-5. Set
- * exception.pending = true so that the double fault
- * can trigger a nested vmexit.
+ * Synthesize #DF. Clear the previously injected or pending
+ * exception so as not to incorrectly trigger shutdown.
*/
- vcpu->arch.exception.pending = true;
vcpu->arch.exception.injected = false;
- vcpu->arch.exception.has_error_code = true;
- vcpu->arch.exception.nr = DF_VECTOR;
- vcpu->arch.exception.error_code = 0;
- vcpu->arch.exception.has_payload = false;
- vcpu->arch.exception.payload = 0;
- } else
+ vcpu->arch.exception.pending = false;
+
+ kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
+ } else {
/* replace previous exception with a new one in a hope
that instruction re-execution will regenerate lost
exception */
goto queue;
+ }
}
void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
@@ -729,20 +764,22 @@ static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
++vcpu->stat.pf_guest;
- vcpu->arch.exception.nested_apf =
- is_guest_mode(vcpu) && fault->async_page_fault;
- if (vcpu->arch.exception.nested_apf) {
- vcpu->arch.apf.nested_apf_token = fault->address;
- kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
- } else {
+
+ /*
+ * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
+ * whether or not L1 wants to intercept "regular" #PF.
+ */
+ if (is_guest_mode(vcpu) && fault->async_page_fault)
+ kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
+ true, fault->error_code,
+ true, fault->address);
+ else
kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
fault->address);
- }
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
-/* Returns true if the page fault was immediately morphed into a VM-Exit. */
-bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
struct x86_exception *fault)
{
struct kvm_mmu *fault_mmu;
@@ -760,26 +797,7 @@ bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
fault_mmu->root.hpa);
- /*
- * A workaround for KVM's bad exception handling. If KVM injected an
- * exception into L2, and L2 encountered a #PF while vectoring the
- * injected exception, manually check to see if L1 wants to intercept
- * #PF, otherwise queuing the #PF will lead to #DF or a lost exception.
- * In all other cases, defer the check to nested_ops->check_events(),
- * which will correctly handle priority (this does not). Note, other
- * exceptions, e.g. #GP, are theoretically affected, #PF is simply the
- * most problematic, e.g. when L0 and L1 are both intercepting #PF for
- * shadow paging.
- *
- * TODO: Rewrite exception handling to track injected and pending
- * (VM-Exit) exceptions separately.
- */
- if (unlikely(vcpu->arch.exception.injected && is_guest_mode(vcpu)) &&
- kvm_x86_ops.nested_ops->handle_page_fault_workaround(vcpu, fault))
- return true;
-
fault_mmu->inject_page_fault(vcpu, fault);
- return false;
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
@@ -4841,7 +4859,7 @@ static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
return (kvm_arch_interrupt_allowed(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu) &&
!kvm_event_needs_reinjection(vcpu) &&
- !vcpu->arch.exception.pending);
+ !kvm_is_exception_pending(vcpu));
}
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@ -5016,25 +5034,38 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
struct kvm_vcpu_events *events)
{
+ struct kvm_queued_exception *ex;
+
process_nmi(vcpu);
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
/*
- * In guest mode, payload delivery should be deferred,
- * so that the L1 hypervisor can intercept #PF before
- * CR2 is modified (or intercept #DB before DR6 is
- * modified under nVMX). Unless the per-VM capability,
- * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
- * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
- * opportunistically defer the exception payload, deliver it if the
- * capability hasn't been requested before processing a
- * KVM_GET_VCPU_EVENTS.
+ * KVM's ABI only allows for one exception to be migrated. Luckily,
+ * the only time there can be two queued exceptions is if there's a
+ * non-exiting _injected_ exception, and a pending exiting exception.
+ * In that case, ignore the VM-Exiting exception as it's an extension
+ * of the injected exception.
+ */
+ if (vcpu->arch.exception_vmexit.pending &&
+ !vcpu->arch.exception.pending &&
+ !vcpu->arch.exception.injected)
+ ex = &vcpu->arch.exception_vmexit;
+ else
+ ex = &vcpu->arch.exception;
+
+ /*
+ * In guest mode, payload delivery should be deferred if the exception
+ * will be intercepted by L1, e.g. KVM should not modifying CR2 if L1
+ * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not
+ * propagate the payload and so it cannot be safely deferred. Deliver
+ * the payload if the capability hasn't been requested.
*/
if (!vcpu->kvm->arch.exception_payload_enabled &&
- vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
- kvm_deliver_exception_payload(vcpu);
+ ex->pending && ex->has_payload)
+ kvm_deliver_exception_payload(vcpu, ex);
/*
* The API doesn't provide the instruction length for software
@@ -5042,26 +5073,25 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
* isn't advanced, we should expect to encounter the exception
* again.
*/
- if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
+ if (kvm_exception_is_soft(ex->vector)) {
events->exception.injected = 0;
events->exception.pending = 0;
} else {
- events->exception.injected = vcpu->arch.exception.injected;
- events->exception.pending = vcpu->arch.exception.pending;
+ events->exception.injected = ex->injected;
+ events->exception.pending = ex->pending;
/*
* For ABI compatibility, deliberately conflate
* pending and injected exceptions when
* KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
*/
if (!vcpu->kvm->arch.exception_payload_enabled)
- events->exception.injected |=
- vcpu->arch.exception.pending;
+ events->exception.injected |= ex->pending;
}
- events->exception.nr = vcpu->arch.exception.nr;
- events->exception.has_error_code = vcpu->arch.exception.has_error_code;
- events->exception.error_code = vcpu->arch.exception.error_code;
- events->exception_has_payload = vcpu->arch.exception.has_payload;
- events->exception_payload = vcpu->arch.exception.payload;
+ events->exception.nr = ex->vector;
+ events->exception.has_error_code = ex->has_error_code;
+ events->exception.error_code = ex->error_code;
+ events->exception_has_payload = ex->has_payload;
+ events->exception_payload = ex->payload;
events->interrupt.injected =
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
@@ -5131,9 +5161,22 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
return -EINVAL;
process_nmi(vcpu);
+
+ /*
+ * Flag that userspace is stuffing an exception, the next KVM_RUN will
+ * morph the exception to a VM-Exit if appropriate. Do this only for
+ * pending exceptions, already-injected exceptions are not subject to
+ * intercpetion. Note, userspace that conflates pending and injected
+ * is hosed, and will incorrectly convert an injected exception into a
+ * pending exception, which in turn may cause a spurious VM-Exit.
+ */
+ vcpu->arch.exception_from_userspace = events->exception.pending;
+
+ vcpu->arch.exception_vmexit.pending = false;
+
vcpu->arch.exception.injected = events->exception.injected;
vcpu->arch.exception.pending = events->exception.pending;
- vcpu->arch.exception.nr = events->exception.nr;
+ vcpu->arch.exception.vector = events->exception.nr;
vcpu->arch.exception.has_error_code = events->exception.has_error_code;
vcpu->arch.exception.error_code = events->exception.error_code;
vcpu->arch.exception.has_payload = events->exception_has_payload;
@@ -7257,6 +7300,7 @@ static int kvm_can_emulate_insn(struct kvm_vcpu *vcpu, int emul_type,
int handle_ud(struct kvm_vcpu *vcpu)
{
static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
+ int fep_flags = READ_ONCE(force_emulation_prefix);
int emul_type = EMULTYPE_TRAP_UD;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
@@ -7264,10 +7308,12 @@ int handle_ud(struct kvm_vcpu *vcpu)
if (unlikely(!kvm_can_emulate_insn(vcpu, emul_type, NULL, 0)))
return 1;
- if (force_emulation_prefix &&
+ if (fep_flags &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
+ if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
+ kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
emul_type = EMULTYPE_TRAP_UD_FORCED;
}
@@ -7933,14 +7979,20 @@ static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt,
int r;
r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
- if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
- complete_emulated_rdmsr, r)) {
- /* Bounce to user space */
- return X86EMUL_IO_NEEDED;
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_read_ex(msr_index);
+ return X86EMUL_PROPAGATE_FAULT;
}
- return r;
+ trace_kvm_msr_read(msr_index, *pdata);
+ return X86EMUL_CONTINUE;
}
static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
@@ -7950,14 +8002,20 @@ static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt,
int r;
r = kvm_set_msr_with_filter(vcpu, msr_index, data);
+ if (r < 0)
+ return X86EMUL_UNHANDLEABLE;
- if (r && kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
- complete_emulated_msr_access, r)) {
- /* Bounce to user space */
- return X86EMUL_IO_NEEDED;
+ if (r) {
+ if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_msr_access, r))
+ return X86EMUL_IO_NEEDED;
+
+ trace_kvm_msr_write_ex(msr_index, data);
+ return X86EMUL_PROPAGATE_FAULT;
}
- return r;
+ trace_kvm_msr_write(msr_index, data);
+ return X86EMUL_CONTINUE;
}
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
@@ -8161,18 +8219,17 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
}
}
-static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
{
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
- if (ctxt->exception.vector == PF_VECTOR)
- return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
- if (ctxt->exception.error_code_valid)
+ if (ctxt->exception.vector == PF_VECTOR)
+ kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
+ else if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
ctxt->exception.error_code);
else
kvm_queue_exception(vcpu, ctxt->exception.vector);
- return false;
}
static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
@@ -8548,8 +8605,46 @@ int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
-static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
+static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu)
{
+ u32 shadow;
+
+ if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF)
+ return true;
+
+ /*
+ * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active,
+ * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first
+ * to avoid the relatively expensive CPUID lookup.
+ */
+ shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu);
+ return (shadow & KVM_X86_SHADOW_INT_MOV_SS) &&
+ guest_cpuid_is_intel(vcpu);
+}
+
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu,
+ int emulation_type, int *r)
+{
+ WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE);
+
+ /*
+ * Do not check for code breakpoints if hardware has already done the
+ * checks, as inferred from the emulation type. On NO_DECODE and SKIP,
+ * the instruction has passed all exception checks, and all intercepted
+ * exceptions that trigger emulation have lower priority than code
+ * breakpoints, i.e. the fact that the intercepted exception occurred
+ * means any code breakpoints have already been serviced.
+ *
+ * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as
+ * hardware has checked the RIP of the magic prefix, but not the RIP of
+ * the instruction being emulated. The intent of forced emulation is
+ * to behave as if KVM intercepted the instruction without an exception
+ * and without a prefix.
+ */
+ if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
+ EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF))
+ return false;
+
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
struct kvm_run *kvm_run = vcpu->run;
@@ -8569,7 +8664,7 @@ static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
}
if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
- !(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
+ !kvm_is_code_breakpoint_inhibited(vcpu)) {
unsigned long eip = kvm_get_linear_rip(vcpu);
u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
vcpu->arch.dr7,
@@ -8671,8 +8766,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
* are fault-like and are higher priority than any faults on
* the code fetch itself.
*/
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_code_breakpoint(vcpu, &r))
+ if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
return r;
r = x86_decode_emulated_instruction(vcpu, emulation_type,
@@ -8770,8 +8864,7 @@ restart:
if (ctxt->have_exception) {
r = 1;
- if (inject_emulated_exception(vcpu))
- return r;
+ inject_emulated_exception(vcpu);
} else if (vcpu->arch.pio.count) {
if (!vcpu->arch.pio.in) {
/* FIXME: return into emulator if single-stepping. */
@@ -8801,6 +8894,12 @@ writeback:
unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ /*
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
+ * only supports code breakpoints and general detect #DB, both
+ * of which are fault-like.
+ */
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_pmu_trigger_event(vcpu, PERF_COUNT_HW_INSTRUCTIONS);
@@ -9662,74 +9761,155 @@ int kvm_check_nested_events(struct kvm_vcpu *vcpu)
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ trace_kvm_inj_exception(vcpu->arch.exception.vector,
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code,
vcpu->arch.exception.injected);
if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
vcpu->arch.exception.error_code = false;
- static_call(kvm_x86_queue_exception)(vcpu);
+ static_call(kvm_x86_inject_exception)(vcpu);
}
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
+/*
+ * Check for any event (interrupt or exception) that is ready to be injected,
+ * and if there is at least one event, inject the event with the highest
+ * priority. This handles both "pending" events, i.e. events that have never
+ * been injected into the guest, and "injected" events, i.e. events that were
+ * injected as part of a previous VM-Enter, but weren't successfully delivered
+ * and need to be re-injected.
+ *
+ * Note, this is not guaranteed to be invoked on a guest instruction boundary,
+ * i.e. doesn't guarantee that there's an event window in the guest. KVM must
+ * be able to inject exceptions in the "middle" of an instruction, and so must
+ * also be able to re-inject NMIs and IRQs in the middle of an instruction.
+ * I.e. for exceptions and re-injected events, NOT invoking this on instruction
+ * boundaries is necessary and correct.
+ *
+ * For simplicity, KVM uses a single path to inject all events (except events
+ * that are injected directly from L1 to L2) and doesn't explicitly track
+ * instruction boundaries for asynchronous events. However, because VM-Exits
+ * that can occur during instruction execution typically result in KVM skipping
+ * the instruction or injecting an exception, e.g. instruction and exception
+ * intercepts, and because pending exceptions have higher priority than pending
+ * interrupts, KVM still honors instruction boundaries in most scenarios.
+ *
+ * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip
+ * the instruction or inject an exception, then KVM can incorrecty inject a new
+ * asynchrounous event if the event became pending after the CPU fetched the
+ * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation)
+ * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be
+ * injected on the restarted instruction instead of being deferred until the
+ * instruction completes.
+ *
+ * In practice, this virtualization hole is unlikely to be observed by the
+ * guest, and even less likely to cause functional problems. To detect the
+ * hole, the guest would have to trigger an event on a side effect of an early
+ * phase of instruction execution, e.g. on the instruction fetch from memory.
+ * And for it to be a functional problem, the guest would need to depend on the
+ * ordering between that side effect, the instruction completing, _and_ the
+ * delivery of the asynchronous event.
+ */
+static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu,
+ bool *req_immediate_exit)
{
+ bool can_inject;
int r;
- bool can_inject = true;
- /* try to reinject previous events if any */
+ /*
+ * Process nested events first, as nested VM-Exit supercedes event
+ * re-injection. If there's an event queued for re-injection, it will
+ * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit.
+ */
+ if (is_guest_mode(vcpu))
+ r = kvm_check_nested_events(vcpu);
+ else
+ r = 0;
- if (vcpu->arch.exception.injected) {
- kvm_inject_exception(vcpu);
- can_inject = false;
- }
/*
- * Do not inject an NMI or interrupt if there is a pending
- * exception. Exceptions and interrupts are recognized at
- * instruction boundaries, i.e. the start of an instruction.
- * Trap-like exceptions, e.g. #DB, have higher priority than
- * NMIs and interrupts, i.e. traps are recognized before an
- * NMI/interrupt that's pending on the same instruction.
- * Fault-like exceptions, e.g. #GP and #PF, are the lowest
- * priority, but are only generated (pended) during instruction
- * execution, i.e. a pending fault-like exception means the
- * fault occurred on the *previous* instruction and must be
- * serviced prior to recognizing any new events in order to
- * fully complete the previous instruction.
+ * Re-inject exceptions and events *especially* if immediate entry+exit
+ * to/from L2 is needed, as any event that has already been injected
+ * into L2 needs to complete its lifecycle before injecting a new event.
+ *
+ * Don't re-inject an NMI or interrupt if there is a pending exception.
+ * This collision arises if an exception occurred while vectoring the
+ * injected event, KVM intercepted said exception, and KVM ultimately
+ * determined the fault belongs to the guest and queues the exception
+ * for injection back into the guest.
+ *
+ * "Injected" interrupts can also collide with pending exceptions if
+ * userspace ignores the "ready for injection" flag and blindly queues
+ * an interrupt. In that case, prioritizing the exception is correct,
+ * as the exception "occurred" before the exit to userspace. Trap-like
+ * exceptions, e.g. most #DBs, have higher priority than interrupts.
+ * And while fault-like exceptions, e.g. #GP and #PF, are the lowest
+ * priority, they're only generated (pended) during instruction
+ * execution, and interrupts are recognized at instruction boundaries.
+ * Thus a pending fault-like exception means the fault occurred on the
+ * *previous* instruction and must be serviced prior to recognizing any
+ * new events in order to fully complete the previous instruction.
*/
- else if (!vcpu->arch.exception.pending) {
- if (vcpu->arch.nmi_injected) {
- static_call(kvm_x86_inject_nmi)(vcpu);
- can_inject = false;
- } else if (vcpu->arch.interrupt.injected) {
- static_call(kvm_x86_inject_irq)(vcpu, true);
- can_inject = false;
- }
- }
+ if (vcpu->arch.exception.injected)
+ kvm_inject_exception(vcpu);
+ else if (kvm_is_exception_pending(vcpu))
+ ; /* see above */
+ else if (vcpu->arch.nmi_injected)
+ static_call(kvm_x86_inject_nmi)(vcpu);
+ else if (vcpu->arch.interrupt.injected)
+ static_call(kvm_x86_inject_irq)(vcpu, true);
+ /*
+ * Exceptions that morph to VM-Exits are handled above, and pending
+ * exceptions on top of injected exceptions that do not VM-Exit should
+ * either morph to #DF or, sadly, override the injected exception.
+ */
WARN_ON_ONCE(vcpu->arch.exception.injected &&
vcpu->arch.exception.pending);
/*
- * Call check_nested_events() even if we reinjected a previous event
- * in order for caller to determine if it should require immediate-exit
- * from L2 to L1 due to pending L1 events which require exit
- * from L2 to L1.
+ * Bail if immediate entry+exit to/from the guest is needed to complete
+ * nested VM-Enter or event re-injection so that a different pending
+ * event can be serviced (or if KVM needs to exit to userspace).
+ *
+ * Otherwise, continue processing events even if VM-Exit occurred. The
+ * VM-Exit will have cleared exceptions that were meant for L2, but
+ * there may now be events that can be injected into L1.
*/
- if (is_guest_mode(vcpu)) {
- r = kvm_check_nested_events(vcpu);
- if (r < 0)
- goto out;
- }
+ if (r < 0)
+ goto out;
+
+ /*
+ * A pending exception VM-Exit should either result in nested VM-Exit
+ * or force an immediate re-entry and exit to/from L2, and exception
+ * VM-Exits cannot be injected (flag should _never_ be set).
+ */
+ WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected ||
+ vcpu->arch.exception_vmexit.pending);
+
+ /*
+ * New events, other than exceptions, cannot be injected if KVM needs
+ * to re-inject a previous event. See above comments on re-injecting
+ * for why pending exceptions get priority.
+ */
+ can_inject = !kvm_event_needs_reinjection(vcpu);
- /* try to inject new event if pending */
if (vcpu->arch.exception.pending) {
- if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
+ /*
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+ * value pushed on the stack. Trap-like exception and all #DBs
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
+ * AMD states that code breakpoint #DBs excplitly clear RF=0).
+ *
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
+ * describe the behavior of General Detect #DBs, which are
+ * fault-like. They do _not_ set RF, a la code breakpoints.
+ */
+ if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
- if (vcpu->arch.exception.nr == DB_VECTOR) {
- kvm_deliver_exception_payload(vcpu);
+ if (vcpu->arch.exception.vector == DB_VECTOR) {
+ kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception);
if (vcpu->arch.dr7 & DR7_GD) {
vcpu->arch.dr7 &= ~DR7_GD;
kvm_update_dr7(vcpu);
@@ -9801,11 +9981,11 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
}
if (is_guest_mode(vcpu) &&
- kvm_x86_ops.nested_ops->hv_timer_pending &&
- kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu))
*req_immediate_exit = true;
- WARN_ON(vcpu->arch.exception.pending);
+ WARN_ON(kvm_is_exception_pending(vcpu));
return 0;
out:
@@ -10110,7 +10290,7 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
* When APICv gets disabled, we may still have injected interrupts
* pending. At the same time, KVM_REQ_EVENT may not be set as APICv was
* still active when the interrupt got accepted. Make sure
- * inject_pending_event() is called to check for that.
+ * kvm_check_and_inject_events() is called to check for that.
*/
if (!apic->apicv_active)
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -10407,7 +10587,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
goto out;
}
- r = inject_pending_event(vcpu, &req_immediate_exit);
+ r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
if (r < 0) {
r = 0;
goto out;
@@ -10646,10 +10826,26 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
if (hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
- if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+ /*
+ * If the vCPU is not runnable, a signal or another host event
+ * of some kind is pending; service it without changing the
+ * vCPU's activity state.
+ */
+ if (!kvm_arch_vcpu_runnable(vcpu))
return 1;
}
+ /*
+ * Evaluate nested events before exiting the halted state. This allows
+ * the halt state to be recorded properly in the VMCS12's activity
+ * state field (AMD does not have a similar field and a VM-Exit always
+ * causes a spurious wakeup from HLT).
+ */
+ if (is_guest_mode(vcpu)) {
+ if (kvm_check_nested_events(vcpu) < 0)
+ return 0;
+ }
+
if (kvm_apic_accept_events(vcpu) < 0)
return 0;
switch(vcpu->arch.mp_state) {
@@ -10673,9 +10869,6 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu)
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu))
- kvm_check_nested_events(vcpu);
-
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
}
@@ -10824,6 +11017,7 @@ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_queued_exception *ex = &vcpu->arch.exception;
struct kvm_run *kvm_run = vcpu->run;
int r;
@@ -10852,7 +11046,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
r = 0;
goto out;
}
- kvm_clear_request(KVM_REQ_UNHALT, vcpu);
r = -EAGAIN;
if (signal_pending(current)) {
r = -EINTR;
@@ -10882,6 +11075,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
}
}
+ /*
+ * If userspace set a pending exception and L2 is active, convert it to
+ * a pending VM-Exit if L1 wants to intercept the exception.
+ */
+ if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector,
+ ex->error_code)) {
+ kvm_queue_exception_vmexit(vcpu, ex->vector,
+ ex->has_error_code, ex->error_code,
+ ex->has_payload, ex->payload);
+ ex->injected = false;
+ ex->pending = false;
+ }
+ vcpu->arch.exception_from_userspace = false;
+
if (unlikely(vcpu->arch.complete_userspace_io)) {
int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io;
vcpu->arch.complete_userspace_io = NULL;
@@ -10988,6 +11196,7 @@ static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
vcpu->arch.exception.pending = false;
+ vcpu->arch.exception_vmexit.pending = false;
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -11125,11 +11334,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
}
/*
- * KVM_MP_STATE_INIT_RECEIVED means the processor is in
- * INIT state; latched init should be reported using
- * KVM_SET_VCPU_EVENTS, so reject it here.
+ * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow
+ * forcing the guest into INIT/SIPI if those events are supposed to be
+ * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state
+ * if an SMI is pending as well.
*/
- if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
+ if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) &&
(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
goto out;
@@ -11368,7 +11578,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
r = -EBUSY;
- if (vcpu->arch.exception.pending)
+ if (kvm_is_exception_pending(vcpu))
goto out;
if (dbg->control & KVM_GUESTDBG_INJECT_DB)
kvm_queue_exception(vcpu, DB_VECTOR);
@@ -11750,8 +11960,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
/*
- * To avoid have the INIT path from kvm_apic_has_events() that be
- * called with loaded FPU and does not let userspace fix the state.
+ * All paths that lead to INIT are required to load the guest's
+ * FPU state (because most paths are buried in KVM_RUN).
*/
if (init_event)
kvm_put_guest_fpu(vcpu);
@@ -12080,6 +12290,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out_page_track;
+ ret = static_call(kvm_x86_vm_init)(kvm);
+ if (ret)
+ goto out_uninit_mmu;
+
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
atomic_set(&kvm->arch.noncoherent_dma_count, 0);
@@ -12115,8 +12329,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm_hv_init_vm(kvm);
kvm_xen_init_vm(kvm);
- return static_call(kvm_x86_vm_init)(kvm);
+ return 0;
+out_uninit_mmu:
+ kvm_mmu_uninit_vm(kvm);
out_page_track:
kvm_page_track_cleanup(kvm);
out:
@@ -12589,13 +12805,14 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
if (!list_empty_careful(&vcpu->async_pf.done))
return true;
- if (kvm_apic_has_events(vcpu))
+ if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
+ kvm_apic_init_sipi_allowed(vcpu))
return true;
if (vcpu->arch.pv.pv_unhalted)
return true;
- if (vcpu->arch.exception.pending)
+ if (kvm_is_exception_pending(vcpu))
return true;
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
@@ -12617,16 +12834,13 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
return true;
if (is_guest_mode(vcpu) &&
- kvm_x86_ops.nested_ops->hv_timer_pending &&
- kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+ kvm_x86_ops.nested_ops->has_events &&
+ kvm_x86_ops.nested_ops->has_events(vcpu))
return true;
if (kvm_xen_has_pending_events(vcpu))
return true;
- if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu))
- return true;
-
return false;
}
@@ -12850,7 +13064,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
{
if (unlikely(!lapic_in_kernel(vcpu) ||
kvm_event_needs_reinjection(vcpu) ||
- vcpu->arch.exception.pending))
+ kvm_is_exception_pending(vcpu)))
return false;
if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
@@ -13401,7 +13615,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
-EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 1926d2cb8e79..829d3134c1eb 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -82,10 +82,18 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
+static inline bool kvm_is_exception_pending(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending ||
+ vcpu->arch.exception_vmexit.pending ||
+ kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+}
+
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
{
vcpu->arch.exception.pending = false;
vcpu->arch.exception.injected = false;
+ vcpu->arch.exception_vmexit.pending = false;
}
static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
@@ -267,11 +275,6 @@ static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
return !(kvm->arch.disabled_quirks & quirk);
}
-static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu)
-{
- return is_smm(vcpu) || static_call(kvm_x86_apic_init_signal_blocked)(vcpu);
-}
-
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
u64 get_kvmclock_ns(struct kvm *kvm);
@@ -286,7 +289,8 @@ int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu,
int handle_ud(struct kvm_vcpu *vcpu);
-void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu);
+void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
+ struct kvm_queued_exception *ex);
void kvm_vcpu_mtrr_init(struct kvm_vcpu *vcpu);
u8 kvm_mtrr_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
index 280cb5dc7341..93c628d3e3a9 100644
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@@ -1065,7 +1065,6 @@ static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
del_timer(&vcpu->arch.xen.poll_timer);
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- kvm_clear_request(KVM_REQ_UNHALT, vcpu);
}
vcpu->arch.xen.poll_evtchn = 0;
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index fe59b8ac4fcc..ecbfb4dd3b01 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/linkage.h>
+#include <asm/asm.h>
#include <asm/export.h>
/*
@@ -50,3 +51,140 @@ SYM_FUNC_START(clear_page_erms)
RET
SYM_FUNC_END(clear_page_erms)
EXPORT_SYMBOL_GPL(clear_page_erms)
+
+/*
+ * Default clear user-space.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ */
+SYM_FUNC_START(clear_user_original)
+ /*
+ * Copy only the lower 32 bits of size as that is enough to handle the rest bytes,
+ * i.e., no need for a 'q' suffix and thus a REX prefix.
+ */
+ mov %ecx,%eax
+ shr $3,%rcx
+ jz .Lrest_bytes
+
+ # do the qwords first
+ .p2align 4
+.Lqwords:
+ movq $0,(%rdi)
+ lea 8(%rdi),%rdi
+ dec %rcx
+ jnz .Lqwords
+
+.Lrest_bytes:
+ and $7, %eax
+ jz .Lexit
+
+ # now do the rest bytes
+.Lbytes:
+ movb $0,(%rdi)
+ inc %rdi
+ dec %eax
+ jnz .Lbytes
+
+.Lexit:
+ /*
+ * %rax still needs to be cleared in the exception case because this function is called
+ * from inline asm and the compiler expects %rax to be zero when exiting the inline asm,
+ * in case it might reuse it somewhere.
+ */
+ xor %eax,%eax
+ RET
+
+.Lqwords_exception:
+ # convert remaining qwords back into bytes to return to caller
+ shl $3, %rcx
+ and $7, %eax
+ add %rax,%rcx
+ jmp .Lexit
+
+.Lbytes_exception:
+ mov %eax,%ecx
+ jmp .Lexit
+
+ _ASM_EXTABLE_UA(.Lqwords, .Lqwords_exception)
+ _ASM_EXTABLE_UA(.Lbytes, .Lbytes_exception)
+SYM_FUNC_END(clear_user_original)
+EXPORT_SYMBOL(clear_user_original)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_REP_GOOD is
+ * present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ */
+SYM_FUNC_START(clear_user_rep_good)
+ # call the original thing for less than a cacheline
+ cmp $64, %rcx
+ jb clear_user_original
+
+.Lprep:
+ # copy lower 32-bits for rest bytes
+ mov %ecx, %edx
+ shr $3, %rcx
+ jz .Lrep_good_rest_bytes
+
+.Lrep_good_qwords:
+ rep stosq
+
+.Lrep_good_rest_bytes:
+ and $7, %edx
+ jz .Lrep_good_exit
+
+.Lrep_good_bytes:
+ mov %edx, %ecx
+ rep stosb
+
+.Lrep_good_exit:
+ # see .Lexit comment above
+ xor %eax, %eax
+ RET
+
+.Lrep_good_qwords_exception:
+ # convert remaining qwords back into bytes to return to caller
+ shl $3, %rcx
+ and $7, %edx
+ add %rdx, %rcx
+ jmp .Lrep_good_exit
+
+ _ASM_EXTABLE_UA(.Lrep_good_qwords, .Lrep_good_qwords_exception)
+ _ASM_EXTABLE_UA(.Lrep_good_bytes, .Lrep_good_exit)
+SYM_FUNC_END(clear_user_rep_good)
+EXPORT_SYMBOL(clear_user_rep_good)
+
+/*
+ * Alternative clear user-space when CPU feature X86_FEATURE_ERMS is present.
+ * Input:
+ * rdi destination
+ * rcx count
+ *
+ * Output:
+ * rcx: uncleared bytes or 0 if successful.
+ *
+ */
+SYM_FUNC_START(clear_user_erms)
+ # call the original thing for less than a cacheline
+ cmp $64, %rcx
+ jb clear_user_original
+
+.Lerms_bytes:
+ rep stosb
+
+.Lerms_exit:
+ xorl %eax,%eax
+ RET
+
+ _ASM_EXTABLE_UA(.Lerms_bytes, .Lerms_exit)
+SYM_FUNC_END(clear_user_erms)
+EXPORT_SYMBOL(clear_user_erms)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index d0d7b9bc6cad..dd8cd8831251 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -2,6 +2,7 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
+#include <linux/cfi_types.h>
#include <asm/errno.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
@@ -27,7 +28,7 @@
* Output:
* rax original destination
*/
-SYM_FUNC_START(__memcpy)
+SYM_TYPED_FUNC_START(__memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memcpy_erms", X86_FEATURE_ERMS
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index ad0139d25401..f1bb18617156 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -44,7 +44,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
* called from other contexts.
*/
pagefault_disable();
- ret = __copy_from_user_inatomic(to, from, n);
+ ret = raw_copy_from_user(to, from, n);
pagefault_enable();
return ret;
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index 0ae6cf804197..6c1f8ac5e721 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -14,46 +14,6 @@
* Zero Userspace
*/
-unsigned long __clear_user(void __user *addr, unsigned long size)
-{
- long __d0;
- might_fault();
- /* no memory constraint because it doesn't change any memory gcc knows
- about */
- stac();
- asm volatile(
- " testq %[size8],%[size8]\n"
- " jz 4f\n"
- " .align 16\n"
- "0: movq $0,(%[dst])\n"
- " addq $8,%[dst]\n"
- " decl %%ecx ; jnz 0b\n"
- "4: movq %[size1],%%rcx\n"
- " testl %%ecx,%%ecx\n"
- " jz 2f\n"
- "1: movb $0,(%[dst])\n"
- " incq %[dst]\n"
- " decl %%ecx ; jnz 1b\n"
- "2:\n"
-
- _ASM_EXTABLE_TYPE_REG(0b, 2b, EX_TYPE_UCOPY_LEN8, %[size1])
- _ASM_EXTABLE_UA(1b, 2b)
-
- : [size8] "=&c"(size), [dst] "=&D" (__d0)
- : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr));
- clac();
- return size;
-}
-EXPORT_SYMBOL(__clear_user);
-
-unsigned long clear_user(void __user *to, unsigned long n)
-{
- if (access_ok(to, n))
- return __clear_user(to, n);
- return n;
-}
-EXPORT_SYMBOL(clear_user);
-
#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE
/**
* clean_cache_range - write back a cache range with CLWB
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f8220fd2c169..829c1409ffbd 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -4,10 +4,12 @@ KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
KCOV_INSTRUMENT_mem_encrypt_amd.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
+KCOV_INSTRUMENT_pgprot.o := n
KASAN_SANITIZE_mem_encrypt.o := n
KASAN_SANITIZE_mem_encrypt_amd.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
+KASAN_SANITIZE_pgprot.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
# reference __initdata sections.
@@ -17,6 +19,7 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
+CFLAGS_REMOVE_pgprot.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fa71a5d12e87..a498ae1fbe66 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -769,6 +769,8 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
unsigned long address, struct task_struct *tsk)
{
const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
+ /* This is a racy snapshot, but it's better than nothing. */
+ int cpu = raw_smp_processor_id();
if (!unhandled_signal(tsk, SIGSEGV))
return;
@@ -782,6 +784,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
print_vma_addr(KERN_CONT " in ", regs->ip);
+ /*
+ * Dump the likely CPU where the fatal segfault happened.
+ * This can help identify faulty hardware.
+ */
+ printk(KERN_CONT " likely on CPU %d (core %d, socket %d)", cpu,
+ topology_core_id(cpu), topology_physical_package_id(cpu));
+
+
printk(KERN_CONT "\n");
show_opcodes(regs, loglvl);
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index c1f6c1c51d99..99620428ad78 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -419,7 +419,9 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip)
OPTIMIZER_HIDE_VAR(reg);
emit_jump(&prog, &__x86_indirect_thunk_array[reg], ip);
} else {
- EMIT2(0xFF, 0xE0 + reg);
+ EMIT2(0xFF, 0xE0 + reg); /* jmp *%\reg */
+ if (IS_ENABLED(CONFIG_RETPOLINE) || IS_ENABLED(CONFIG_SLS))
+ EMIT1(0xCC); /* int3 */
}
*pprog = prog;
@@ -662,7 +664,7 @@ static void emit_mov_imm64(u8 **pprog, u32 dst_reg,
*/
emit_mov_imm32(&prog, false, dst_reg, imm32_lo);
} else {
- /* movabsq %rax, imm64 */
+ /* movabsq rax, imm64 */
EMIT2(add_1mod(0x48, dst_reg), add_1reg(0xB8, dst_reg));
EMIT(imm32_lo, 4);
EMIT(imm32_hi, 4);
@@ -1751,34 +1753,60 @@ emit_jmp:
static void save_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
int stack_size)
{
- int i;
+ int i, j, arg_size, nr_regs;
/* Store function arguments to stack.
* For a function that accepts two pointers the sequence will be:
* mov QWORD PTR [rbp-0x10],rdi
* mov QWORD PTR [rbp-0x8],rsi
*/
- for (i = 0; i < min(nr_args, 6); i++)
- emit_stx(prog, bytes_to_bpf_size(m->arg_size[i]),
- BPF_REG_FP,
- i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
- -(stack_size - i * 8));
+ for (i = 0, j = 0; i < min(nr_args, 6); i++) {
+ if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
+ nr_regs = (m->arg_size[i] + 7) / 8;
+ arg_size = 8;
+ } else {
+ nr_regs = 1;
+ arg_size = m->arg_size[i];
+ }
+
+ while (nr_regs) {
+ emit_stx(prog, bytes_to_bpf_size(arg_size),
+ BPF_REG_FP,
+ j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
+ -(stack_size - j * 8));
+ nr_regs--;
+ j++;
+ }
+ }
}
static void restore_regs(const struct btf_func_model *m, u8 **prog, int nr_args,
int stack_size)
{
- int i;
+ int i, j, arg_size, nr_regs;
/* Restore function arguments from stack.
* For a function that accepts two pointers the sequence will be:
* EMIT4(0x48, 0x8B, 0x7D, 0xF0); mov rdi,QWORD PTR [rbp-0x10]
* EMIT4(0x48, 0x8B, 0x75, 0xF8); mov rsi,QWORD PTR [rbp-0x8]
*/
- for (i = 0; i < min(nr_args, 6); i++)
- emit_ldx(prog, bytes_to_bpf_size(m->arg_size[i]),
- i == 5 ? X86_REG_R9 : BPF_REG_1 + i,
- BPF_REG_FP,
- -(stack_size - i * 8));
+ for (i = 0, j = 0; i < min(nr_args, 6); i++) {
+ if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG) {
+ nr_regs = (m->arg_size[i] + 7) / 8;
+ arg_size = 8;
+ } else {
+ nr_regs = 1;
+ arg_size = m->arg_size[i];
+ }
+
+ while (nr_regs) {
+ emit_ldx(prog, bytes_to_bpf_size(arg_size),
+ j == 5 ? X86_REG_R9 : BPF_REG_1 + j,
+ BPF_REG_FP,
+ -(stack_size - j * 8));
+ nr_regs--;
+ j++;
+ }
+ }
}
static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
@@ -1810,6 +1838,9 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog,
if (p->aux->sleepable) {
enter = __bpf_prog_enter_sleepable;
exit = __bpf_prog_exit_sleepable;
+ } else if (p->type == BPF_PROG_TYPE_STRUCT_OPS) {
+ enter = __bpf_prog_enter_struct_ops;
+ exit = __bpf_prog_exit_struct_ops;
} else if (p->expected_attach_type == BPF_LSM_CGROUP) {
enter = __bpf_prog_enter_lsm_cgroup;
exit = __bpf_prog_exit_lsm_cgroup;
@@ -2013,13 +2044,14 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog,
int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end,
const struct btf_func_model *m, u32 flags,
struct bpf_tramp_links *tlinks,
- void *orig_call)
+ void *func_addr)
{
- int ret, i, nr_args = m->nr_args;
+ int ret, i, nr_args = m->nr_args, extra_nregs = 0;
int regs_off, ip_off, args_off, stack_size = nr_args * 8, run_ctx_off;
struct bpf_tramp_links *fentry = &tlinks[BPF_TRAMP_FENTRY];
struct bpf_tramp_links *fexit = &tlinks[BPF_TRAMP_FEXIT];
struct bpf_tramp_links *fmod_ret = &tlinks[BPF_TRAMP_MODIFY_RETURN];
+ void *orig_call = func_addr;
u8 **branches = NULL;
u8 *prog;
bool save_ret;
@@ -2028,6 +2060,14 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
if (nr_args > 6)
return -ENOTSUPP;
+ for (i = 0; i < MAX_BPF_FUNC_ARGS; i++) {
+ if (m->arg_flags[i] & BTF_FMODEL_STRUCT_ARG)
+ extra_nregs += (m->arg_size[i] + 7) / 8 - 1;
+ }
+ if (nr_args + extra_nregs > 6)
+ return -ENOTSUPP;
+ stack_size += extra_nregs * 8;
+
/* Generated trampoline stack layout:
*
* RBP + 8 [ return address ]
@@ -2040,7 +2080,7 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
* [ ... ]
* RBP - regs_off [ reg_arg1 ] program's ctx pointer
*
- * RBP - args_off [ args count ] always
+ * RBP - args_off [ arg regs count ] always
*
* RBP - ip_off [ traced function ] BPF_TRAMP_F_IP_ARG flag
*
@@ -2083,21 +2123,19 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i
EMIT4(0x48, 0x83, 0xEC, stack_size); /* sub rsp, stack_size */
EMIT1(0x53); /* push rbx */
- /* Store number of arguments of the traced function:
- * mov rax, nr_args
+ /* Store number of argument registers of the traced function:
+ * mov rax, nr_args + extra_nregs
* mov QWORD PTR [rbp - args_off], rax
*/
- emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args);
+ emit_mov_imm64(&prog, BPF_REG_0, 0, (u32) nr_args + extra_nregs);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -args_off);
if (flags & BPF_TRAMP_F_IP_ARG) {
/* Store IP address of the traced function:
- * mov rax, QWORD PTR [rbp + 8]
- * sub rax, X86_PATCH_SIZE
+ * movabsq rax, func_addr
* mov QWORD PTR [rbp - ip_off], rax
*/
- emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, 8);
- EMIT4(0x48, 0x83, 0xe8, X86_PATCH_SIZE);
+ emit_mov_imm64(&prog, BPF_REG_0, (long) func_addr >> 32, (u32) (long) func_addr);
emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -ip_off);
}
@@ -2209,7 +2247,7 @@ cleanup:
return ret;
}
-static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
+static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs, u8 *image, u8 *buf)
{
u8 *jg_reloc, *prog = *pprog;
int pivot, err, jg_bytes = 1;
@@ -2225,12 +2263,12 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
EMIT2_off32(0x81, add_1reg(0xF8, BPF_REG_3),
progs[a]);
err = emit_cond_near_jump(&prog, /* je func */
- (void *)progs[a], prog,
+ (void *)progs[a], image + (prog - buf),
X86_JE);
if (err)
return err;
- emit_indirect_jump(&prog, 2 /* rdx */, prog);
+ emit_indirect_jump(&prog, 2 /* rdx */, image + (prog - buf));
*pprog = prog;
return 0;
@@ -2255,7 +2293,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
jg_reloc = prog;
err = emit_bpf_dispatcher(&prog, a, a + pivot, /* emit lower_part */
- progs);
+ progs, image, buf);
if (err)
return err;
@@ -2269,7 +2307,7 @@ static int emit_bpf_dispatcher(u8 **pprog, int a, int b, s64 *progs)
emit_code(jg_reloc - jg_bytes, jg_offset, jg_bytes);
err = emit_bpf_dispatcher(&prog, a + pivot + 1, /* emit upper_part */
- b, progs);
+ b, progs, image, buf);
if (err)
return err;
@@ -2289,12 +2327,12 @@ static int cmp_ips(const void *a, const void *b)
return 0;
}
-int arch_prepare_bpf_dispatcher(void *image, s64 *funcs, int num_funcs)
+int arch_prepare_bpf_dispatcher(void *image, void *buf, s64 *funcs, int num_funcs)
{
- u8 *prog = image;
+ u8 *prog = buf;
sort(funcs, num_funcs, sizeof(funcs[0]), cmp_ips, NULL);
- return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs);
+ return emit_bpf_dispatcher(&prog, 0, num_funcs - 1, funcs, image, buf);
}
struct x64_jit_data {
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 6e598bd78eef..ebc98a68c400 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -169,7 +169,7 @@ static void __init do_add_efi_memmap(void)
}
/*
- * Given add_efi_memmap defaults to 0 and there there is no alternative
+ * Given add_efi_memmap defaults to 0 and there is no alternative
* e820 mechanism for soft-reserved memory, import the full EFI memory
* map if soft reservations are present and enabled. Otherwise, the
* mechanism to disable the kernel's consideration of EFI_MEMORY_SP is
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 1f3675453a57..b36596bf0fc3 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -176,7 +176,8 @@ virt_to_phys_or_null_size(void *va, unsigned long size)
int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
{
- unsigned long pfn, text, pf, rodata;
+ extern const u8 __efi64_thunk_ret_tramp[];
+ unsigned long pfn, text, pf, rodata, tramp;
struct page *page;
unsigned npages;
pgd_t *pgd = efi_mm.pgd;
@@ -238,11 +239,9 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
npages = (_etext - _text) >> PAGE_SHIFT;
text = __pa(_text);
- pfn = text >> PAGE_SHIFT;
- pf = _PAGE_ENC;
- if (kernel_map_pages_in_pgd(pgd, pfn, text, npages, pf)) {
- pr_err("Failed to map kernel text 1:1\n");
+ if (kernel_unmap_pages_in_pgd(pgd, text, npages)) {
+ pr_err("Failed to unmap kernel text 1:1 mapping\n");
return 1;
}
@@ -256,6 +255,15 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
return 1;
}
+ tramp = __pa(__efi64_thunk_ret_tramp);
+ pfn = tramp >> PAGE_SHIFT;
+
+ pf = _PAGE_ENC;
+ if (kernel_map_pages_in_pgd(pgd, pfn, tramp, 1, pf)) {
+ pr_err("Failed to map mixed mode return trampoline\n");
+ return 1;
+ }
+
return 0;
}
diff --git a/arch/x86/platform/efi/efi_thunk_64.S b/arch/x86/platform/efi/efi_thunk_64.S
index 4e5257a4811b..c4b1144f99f6 100644
--- a/arch/x86/platform/efi/efi_thunk_64.S
+++ b/arch/x86/platform/efi/efi_thunk_64.S
@@ -23,7 +23,6 @@
#include <linux/objtool.h>
#include <asm/page_types.h>
#include <asm/segment.h>
-#include <asm/nospec-branch.h>
.text
.code64
@@ -73,10 +72,18 @@ STACK_FRAME_NON_STANDARD __efi64_thunk
pushq %rdi /* EFI runtime service address */
lretq
+ // This return instruction is not needed for correctness, as it will
+ // never be reached. It only exists to make objtool happy, which will
+ // otherwise complain about unreachable instructions in the callers.
+ RET
+SYM_FUNC_END(__efi64_thunk)
+
+ .section ".rodata", "a", @progbits
+ .balign 16
+SYM_DATA_START(__efi64_thunk_ret_tramp)
1: movq 0x20(%rsp), %rsp
pop %rbx
pop %rbp
- ANNOTATE_UNRET_SAFE
ret
int3
@@ -84,7 +91,7 @@ STACK_FRAME_NON_STANDARD __efi64_thunk
2: pushl $__KERNEL_CS
pushl %ebp
lret
-SYM_FUNC_END(__efi64_thunk)
+SYM_DATA_END(__efi64_thunk_ret_tramp)
.bss
.balign 8
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 31c634a22818..58a200dc762d 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -55,6 +55,10 @@ ifdef CONFIG_RETPOLINE
PURGATORY_CFLAGS_REMOVE += $(RETPOLINE_CFLAGS)
endif
+ifdef CONFIG_CFI_CLANG
+PURGATORY_CFLAGS_REMOVE += $(CC_FLAGS_CFI)
+endif
+
CFLAGS_REMOVE_purgatory.o += $(PURGATORY_CFLAGS_REMOVE)
CFLAGS_purgatory.o += $(PURGATORY_CFLAGS)
diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c
index e2c5b296120d..2925074b9a58 100644
--- a/arch/x86/tools/relocs.c
+++ b/arch/x86/tools/relocs.c
@@ -56,6 +56,7 @@ static const char * const sym_regex_kernel[S_NSYMTYPES] = {
"^(xen_irq_disable_direct_reloc$|"
"xen_save_fl_direct_reloc$|"
"VDSO|"
+ "__kcfi_typeid_|"
"__crc_)",
/*
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 0ed2e487a693..9b1a58dda935 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -765,6 +765,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
{
static DEFINE_SPINLOCK(lock);
static struct trap_info traps[257];
+ static const struct trap_info zero = { };
unsigned out;
trace_xen_cpu_load_idt(desc);
@@ -774,7 +775,7 @@ static void xen_load_idt(const struct desc_ptr *desc)
memcpy(this_cpu_ptr(&idt_desc), desc, sizeof(idt_desc));
out = xen_convert_trap_info(desc, traps, false);
- memset(&traps[out], 0, sizeof(traps[0]));
+ traps[out] = zero;
xen_mc_flush();
if (HYPERVISOR_set_trap_table(traps))