summaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/boot/compressed/acpi.c48
-rw-r--r--arch/x86/boot/compressed/eboot.c4
-rw-r--r--arch/x86/boot/compressed/misc.c25
-rw-r--r--arch/x86/events/amd/core.c30
-rw-r--r--arch/x86/events/amd/ibs.c8
-rw-r--r--arch/x86/events/intel/core.c4
-rw-r--r--arch/x86/events/intel/cstate.c44
-rw-r--r--arch/x86/events/intel/pt.c2
-rw-r--r--arch/x86/events/intel/uncore.c44
-rw-r--r--arch/x86/events/intel/uncore.h12
-rw-r--r--arch/x86/events/msr.c7
-rw-r--r--arch/x86/hyperv/hv_apic.c20
-rw-r--r--arch/x86/include/asm/cpu_entry_area.h2
-rw-r--r--arch/x86/include/asm/intel-family.h3
-rw-r--r--arch/x86/include/asm/kvm_host.h9
-rw-r--r--arch/x86/include/asm/mwait.h2
-rw-r--r--arch/x86/include/asm/pti.h2
-rw-r--r--arch/x86/include/asm/uaccess.h23
-rw-r--r--arch/x86/include/asm/vmware.h14
-rw-r--r--arch/x86/kernel/apic/apic.c28
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c3
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c4
-rw-r--r--arch/x86/kernel/cpu/resctrl/ctrlmondata.c4
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c7
-rw-r--r--arch/x86/kernel/head64.c22
-rw-r--r--arch/x86/kernel/process.h2
-rw-r--r--arch/x86/kernel/tsc.c3
-rw-r--r--arch/x86/kvm/cpuid.c104
-rw-r--r--arch/x86/kvm/lapic.c18
-rw-r--r--arch/x86/kvm/lapic.h5
-rw-r--r--arch/x86/kvm/mmu.c65
-rw-r--r--arch/x86/kvm/svm.c16
-rw-r--r--arch/x86/kvm/vmx/nested.c66
-rw-r--r--arch/x86/kvm/vmx/nested.h13
-rw-r--r--arch/x86/kvm/vmx/pmu_intel.c7
-rw-r--r--arch/x86/kvm/vmx/vmx.c41
-rw-r--r--arch/x86/kvm/x86.c91
-rw-r--r--arch/x86/lib/delay.c4
-rw-r--r--arch/x86/platform/efi/efi.c3
-rw-r--r--arch/x86/xen/efi.c14
-rw-r--r--arch/x86/xen/enlighten.c28
-rw-r--r--arch/x86/xen/enlighten_pv.c8
43 files changed, 542 insertions, 319 deletions
diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c
index 149795c369f2..25019d42ae93 100644
--- a/arch/x86/boot/compressed/acpi.c
+++ b/arch/x86/boot/compressed/acpi.c
@@ -21,30 +21,6 @@
struct mem_vector immovable_mem[MAX_NUMNODES*2];
/*
- * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex
- * digits, and '\0' for termination.
- */
-#define MAX_ADDR_LEN 19
-
-static acpi_physical_address get_cmdline_acpi_rsdp(void)
-{
- acpi_physical_address addr = 0;
-
-#ifdef CONFIG_KEXEC
- char val[MAX_ADDR_LEN] = { };
- int ret;
-
- ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN);
- if (ret < 0)
- return 0;
-
- if (kstrtoull(val, 16, &addr))
- return 0;
-#endif
- return addr;
-}
-
-/*
* Search EFI system tables for RSDP. If both ACPI_20_TABLE_GUID and
* ACPI_TABLE_GUID are found, take the former, which has more features.
*/
@@ -298,6 +274,30 @@ acpi_physical_address get_rsdp_addr(void)
}
#if defined(CONFIG_RANDOMIZE_BASE) && defined(CONFIG_MEMORY_HOTREMOVE)
+/*
+ * Max length of 64-bit hex address string is 19, prefix "0x" + 16 hex
+ * digits, and '\0' for termination.
+ */
+#define MAX_ADDR_LEN 19
+
+static acpi_physical_address get_cmdline_acpi_rsdp(void)
+{
+ acpi_physical_address addr = 0;
+
+#ifdef CONFIG_KEXEC
+ char val[MAX_ADDR_LEN] = { };
+ int ret;
+
+ ret = cmdline_find_option("acpi_rsdp", val, MAX_ADDR_LEN);
+ if (ret < 0)
+ return 0;
+
+ if (kstrtoull(val, 16, &addr))
+ return 0;
+#endif
+ return addr;
+}
+
/* Compute SRAT address from RSDP. */
static unsigned long get_acpi_srat_table(void)
{
diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c
index d6662fdef300..82bc60c8acb2 100644
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -13,6 +13,7 @@
#include <asm/e820/types.h>
#include <asm/setup.h>
#include <asm/desc.h>
+#include <asm/boot.h>
#include "../string.h"
#include "eboot.h"
@@ -813,7 +814,8 @@ efi_main(struct efi_config *c, struct boot_params *boot_params)
status = efi_relocate_kernel(sys_table, &bzimage_addr,
hdr->init_size, hdr->init_size,
hdr->pref_address,
- hdr->kernel_alignment);
+ hdr->kernel_alignment,
+ LOAD_PHYSICAL_ADDR);
if (status != EFI_SUCCESS) {
efi_printk(sys_table, "efi_relocate_kernel() failed!\n");
goto fail;
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 53ac0cb2396d..9652d5c2afda 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -345,6 +345,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
{
const unsigned long kernel_total_size = VO__end - VO__text;
unsigned long virt_addr = LOAD_PHYSICAL_ADDR;
+ unsigned long needed_size;
/* Retain x86 boot parameters pointer passed from startup_32/64. */
boot_params = rmode;
@@ -379,26 +380,38 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
free_mem_ptr = heap; /* Heap */
free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
+ /*
+ * The memory hole needed for the kernel is the larger of either
+ * the entire decompressed kernel plus relocation table, or the
+ * entire decompressed kernel plus .bss and .brk sections.
+ *
+ * On X86_64, the memory is mapped with PMD pages. Round the
+ * size up so that the full extent of PMD pages mapped is
+ * included in the check against the valid memory table
+ * entries. This ensures the full mapped area is usable RAM
+ * and doesn't include any reserved areas.
+ */
+ needed_size = max(output_len, kernel_total_size);
+#ifdef CONFIG_X86_64
+ needed_size = ALIGN(needed_size, MIN_KERNEL_ALIGN);
+#endif
+
/* Report initial kernel position details. */
debug_putaddr(input_data);
debug_putaddr(input_len);
debug_putaddr(output);
debug_putaddr(output_len);
debug_putaddr(kernel_total_size);
+ debug_putaddr(needed_size);
#ifdef CONFIG_X86_64
/* Report address of 32-bit trampoline */
debug_putaddr(trampoline_32bit);
#endif
- /*
- * The memory hole needed for the kernel is the larger of either
- * the entire decompressed kernel plus relocation table, or the
- * entire decompressed kernel plus .bss and .brk sections.
- */
choose_random_location((unsigned long)input_data, input_len,
(unsigned long *)&output,
- max(output_len, kernel_total_size),
+ needed_size,
&virt_addr);
/* Validate memory location choices. */
diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c
index e7d35f60d53f..64c3e70b0556 100644
--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -5,12 +5,14 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/delay.h>
+#include <linux/jiffies.h>
#include <asm/apicdef.h>
#include <asm/nmi.h>
#include "../perf_event.h"
-static DEFINE_PER_CPU(unsigned int, perf_nmi_counter);
+static DEFINE_PER_CPU(unsigned long, perf_nmi_tstamp);
+static unsigned long perf_nmi_window;
static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
@@ -641,11 +643,12 @@ static void amd_pmu_disable_event(struct perf_event *event)
* handler when multiple PMCs are active or PMC overflow while handling some
* other source of an NMI.
*
- * Attempt to mitigate this by using the number of active PMCs to determine
- * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset
- * any PMCs. The per-CPU perf_nmi_counter variable is set to a minimum of the
- * number of active PMCs or 2. The value of 2 is used in case an NMI does not
- * arrive at the LAPIC in time to be collapsed into an already pending NMI.
+ * Attempt to mitigate this by creating an NMI window in which un-handled NMIs
+ * received during this window will be claimed. This prevents extending the
+ * window past when it is possible that latent NMIs should be received. The
+ * per-CPU perf_nmi_tstamp will be set to the window end time whenever perf has
+ * handled a counter. When an un-handled NMI is received, it will be claimed
+ * only if arriving within that window.
*/
static int amd_pmu_handle_irq(struct pt_regs *regs)
{
@@ -663,21 +666,19 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
handled = x86_pmu_handle_irq(regs);
/*
- * If a counter was handled, record the number of possible remaining
- * NMIs that can occur.
+ * If a counter was handled, record a timestamp such that un-handled
+ * NMIs will be claimed if arriving within that window.
*/
if (handled) {
- this_cpu_write(perf_nmi_counter,
- min_t(unsigned int, 2, active));
+ this_cpu_write(perf_nmi_tstamp,
+ jiffies + perf_nmi_window);
return handled;
}
- if (!this_cpu_read(perf_nmi_counter))
+ if (time_after(jiffies, this_cpu_read(perf_nmi_tstamp)))
return NMI_DONE;
- this_cpu_dec(perf_nmi_counter);
-
return NMI_HANDLED;
}
@@ -909,6 +910,9 @@ static int __init amd_core_pmu_init(void)
if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
return 0;
+ /* Avoid calulating the value each time in the NMI handler */
+ perf_nmi_window = msecs_to_jiffies(100);
+
switch (boot_cpu_data.x86) {
case 0x15:
pr_cont("Fam15h ");
diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c
index 5b35b7ea5d72..26c36357c4c9 100644
--- a/arch/x86/events/amd/ibs.c
+++ b/arch/x86/events/amd/ibs.c
@@ -377,7 +377,8 @@ static inline void perf_ibs_disable_event(struct perf_ibs *perf_ibs,
struct hw_perf_event *hwc, u64 config)
{
config &= ~perf_ibs->cnt_mask;
- wrmsrl(hwc->config_base, config);
+ if (boot_cpu_data.x86 == 0x10)
+ wrmsrl(hwc->config_base, config);
config &= ~perf_ibs->enable_mask;
wrmsrl(hwc->config_base, config);
}
@@ -553,7 +554,8 @@ static struct perf_ibs perf_ibs_op = {
},
.msr = MSR_AMD64_IBSOPCTL,
.config_mask = IBS_OP_CONFIG_MASK,
- .cnt_mask = IBS_OP_MAX_CNT,
+ .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT |
+ IBS_OP_CUR_CNT_RAND,
.enable_mask = IBS_OP_ENABLE,
.valid_mask = IBS_OP_VAL,
.max_period = IBS_OP_MAX_CNT << 4,
@@ -614,7 +616,7 @@ fail:
if (event->attr.sample_type & PERF_SAMPLE_RAW)
offset_max = perf_ibs->offset_max;
else if (check_rip)
- offset_max = 2;
+ offset_max = 3;
else
offset_max = 1;
do {
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 27ee47a7be66..fcef678c3423 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -4983,6 +4983,8 @@ __init int intel_pmu_init(void)
case INTEL_FAM6_SKYLAKE:
case INTEL_FAM6_KABYLAKE_L:
case INTEL_FAM6_KABYLAKE:
+ case INTEL_FAM6_COMETLAKE_L:
+ case INTEL_FAM6_COMETLAKE:
x86_add_quirk(intel_pebs_isolation_quirk);
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
@@ -5031,6 +5033,8 @@ __init int intel_pmu_init(void)
/* fall through */
case INTEL_FAM6_ICELAKE_L:
case INTEL_FAM6_ICELAKE:
+ case INTEL_FAM6_TIGERLAKE_L:
+ case INTEL_FAM6_TIGERLAKE:
x86_pmu.late_ack = true;
memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c
index 9f2f39003d96..e1daf4151e11 100644
--- a/arch/x86/events/intel/cstate.c
+++ b/arch/x86/events/intel/cstate.c
@@ -45,46 +45,49 @@
* MSR_CORE_C3_RESIDENCY: CORE C3 Residency Counter
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,GLM,
- CNL
+ * CNL,KBL,CML
* Scope: Core
* MSR_CORE_C6_RESIDENCY: CORE C6 Residency Counter
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW,
- * SKL,KNL,GLM,CNL
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL
* Scope: Core
* MSR_CORE_C7_RESIDENCY: CORE C7 Residency Counter
* perf code: 0x03
- * Available model: SNB,IVB,HSW,BDW,SKL,CNL
+ * Available model: SNB,IVB,HSW,BDW,SKL,CNL,KBL,CML,
+ * ICL,TGL
* Scope: Core
* MSR_PKG_C2_RESIDENCY: Package C2 Residency Counter.
* perf code: 0x00
- * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL
+ * Available model: SNB,IVB,HSW,BDW,SKL,KNL,GLM,CNL,
+ * KBL,CML,ICL,TGL
* Scope: Package (physical package)
* MSR_PKG_C3_RESIDENCY: Package C3 Residency Counter.
* perf code: 0x01
* Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,KNL,
- * GLM,CNL
+ * GLM,CNL,KBL,CML,ICL,TGL
* Scope: Package (physical package)
* MSR_PKG_C6_RESIDENCY: Package C6 Residency Counter.
* perf code: 0x02
* Available model: SLM,AMT,NHM,WSM,SNB,IVB,HSW,BDW
- * SKL,KNL,GLM,CNL
+ * SKL,KNL,GLM,CNL,KBL,CML,ICL,TGL
* Scope: Package (physical package)
* MSR_PKG_C7_RESIDENCY: Package C7 Residency Counter.
* perf code: 0x03
- * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL
+ * Available model: NHM,WSM,SNB,IVB,HSW,BDW,SKL,CNL,
+ * KBL,CML,ICL,TGL
* Scope: Package (physical package)
* MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter.
* perf code: 0x04
- * Available model: HSW ULT,KBL,CNL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
* Scope: Package (physical package)
* MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter.
* perf code: 0x05
- * Available model: HSW ULT,KBL,CNL
+ * Available model: HSW ULT,KBL,CNL,CML,ICL,TGL
* Scope: Package (physical package)
* MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter.
* perf code: 0x06
- * Available model: HSW ULT,KBL,GLM,CNL
+ * Available model: HSW ULT,KBL,GLM,CNL,CML,ICL,TGL
* Scope: Package (physical package)
*
*/
@@ -544,6 +547,19 @@ static const struct cstate_model cnl_cstates __initconst = {
BIT(PERF_CSTATE_PKG_C10_RES),
};
+static const struct cstate_model icl_cstates __initconst = {
+ .core_events = BIT(PERF_CSTATE_CORE_C6_RES) |
+ BIT(PERF_CSTATE_CORE_C7_RES),
+
+ .pkg_events = BIT(PERF_CSTATE_PKG_C2_RES) |
+ BIT(PERF_CSTATE_PKG_C3_RES) |
+ BIT(PERF_CSTATE_PKG_C6_RES) |
+ BIT(PERF_CSTATE_PKG_C7_RES) |
+ BIT(PERF_CSTATE_PKG_C8_RES) |
+ BIT(PERF_CSTATE_PKG_C9_RES) |
+ BIT(PERF_CSTATE_PKG_C10_RES),
+};
+
static const struct cstate_model slm_cstates __initconst = {
.core_events = BIT(PERF_CSTATE_CORE_C1_RES) |
BIT(PERF_CSTATE_CORE_C6_RES),
@@ -614,6 +630,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_L, hswult_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE, hswult_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE_L, hswult_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_COMETLAKE, hswult_cstates),
X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_L, cnl_cstates),
@@ -625,8 +643,10 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = {
X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, snb_cstates),
- X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, snb_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_L, icl_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE, icl_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE_L, icl_cstates),
+ X86_CSTATES_MODEL(INTEL_FAM6_TIGERLAKE, icl_cstates),
{ },
};
MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match);
diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index 74e80ed9c6c4..05e43d0f430b 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -627,7 +627,7 @@ static struct topa *topa_alloc(int cpu, gfp_t gfp)
* link as the 2nd entry in the table
*/
if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) {
- TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p);
+ TOPA_ENTRY(&tp->topa, 1)->base = page_to_phys(p) >> TOPA_SHIFT;
TOPA_ENTRY(&tp->topa, 1)->end = 1;
}
diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c
index 6fc2e06ab4c6..86467f85c383 100644
--- a/arch/x86/events/intel/uncore.c
+++ b/arch/x86/events/intel/uncore.c
@@ -502,10 +502,8 @@ void uncore_pmu_event_start(struct perf_event *event, int flags)
local64_set(&event->hw.prev_count, uncore_read_counter(box, event));
uncore_enable_event(box, event);
- if (box->n_active == 1) {
- uncore_enable_box(box);
+ if (box->n_active == 1)
uncore_pmu_start_hrtimer(box);
- }
}
void uncore_pmu_event_stop(struct perf_event *event, int flags)
@@ -529,10 +527,8 @@ void uncore_pmu_event_stop(struct perf_event *event, int flags)
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
hwc->state |= PERF_HES_STOPPED;
- if (box->n_active == 0) {
- uncore_disable_box(box);
+ if (box->n_active == 0)
uncore_pmu_cancel_hrtimer(box);
- }
}
if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
@@ -778,6 +774,40 @@ static int uncore_pmu_event_init(struct perf_event *event)
return ret;
}
+static void uncore_pmu_enable(struct pmu *pmu)
+{
+ struct intel_uncore_pmu *uncore_pmu;
+ struct intel_uncore_box *box;
+
+ uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
+ if (!uncore_pmu)
+ return;
+
+ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
+ if (!box)
+ return;
+
+ if (uncore_pmu->type->ops->enable_box)
+ uncore_pmu->type->ops->enable_box(box);
+}
+
+static void uncore_pmu_disable(struct pmu *pmu)
+{
+ struct intel_uncore_pmu *uncore_pmu;
+ struct intel_uncore_box *box;
+
+ uncore_pmu = container_of(pmu, struct intel_uncore_pmu, pmu);
+ if (!uncore_pmu)
+ return;
+
+ box = uncore_pmu_to_box(uncore_pmu, smp_processor_id());
+ if (!box)
+ return;
+
+ if (uncore_pmu->type->ops->disable_box)
+ uncore_pmu->type->ops->disable_box(box);
+}
+
static ssize_t uncore_get_attr_cpumask(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -803,6 +833,8 @@ static int uncore_pmu_register(struct intel_uncore_pmu *pmu)
pmu->pmu = (struct pmu) {
.attr_groups = pmu->type->attr_groups,
.task_ctx_nr = perf_invalid_context,
+ .pmu_enable = uncore_pmu_enable,
+ .pmu_disable = uncore_pmu_disable,
.event_init = uncore_pmu_event_init,
.add = uncore_pmu_event_add,
.del = uncore_pmu_event_del,
diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h
index f36f7bebbc1b..bbfdaa720b45 100644
--- a/arch/x86/events/intel/uncore.h
+++ b/arch/x86/events/intel/uncore.h
@@ -441,18 +441,6 @@ static inline int uncore_freerunning_hw_config(struct intel_uncore_box *box,
return -EINVAL;
}
-static inline void uncore_disable_box(struct intel_uncore_box *box)
-{
- if (box->pmu->type->ops->disable_box)
- box->pmu->type->ops->disable_box(box);
-}
-
-static inline void uncore_enable_box(struct intel_uncore_box *box)
-{
- if (box->pmu->type->ops->enable_box)
- box->pmu->type->ops->enable_box(box);
-}
-
static inline void uncore_disable_event(struct intel_uncore_box *box,
struct perf_event *event)
{
diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c
index b1afc77f0704..6f86650b3f77 100644
--- a/arch/x86/events/msr.c
+++ b/arch/x86/events/msr.c
@@ -89,7 +89,14 @@ static bool test_intel(int idx, void *data)
case INTEL_FAM6_SKYLAKE_X:
case INTEL_FAM6_KABYLAKE_L:
case INTEL_FAM6_KABYLAKE:
+ case INTEL_FAM6_COMETLAKE_L:
+ case INTEL_FAM6_COMETLAKE:
case INTEL_FAM6_ICELAKE_L:
+ case INTEL_FAM6_ICELAKE:
+ case INTEL_FAM6_ICELAKE_X:
+ case INTEL_FAM6_ICELAKE_D:
+ case INTEL_FAM6_TIGERLAKE_L:
+ case INTEL_FAM6_TIGERLAKE:
if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
return true;
break;
diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c
index 5c056b8aebef..e01078e93dd3 100644
--- a/arch/x86/hyperv/hv_apic.c
+++ b/arch/x86/hyperv/hv_apic.c
@@ -260,11 +260,21 @@ void __init hv_apic_init(void)
}
if (ms_hyperv.hints & HV_X64_APIC_ACCESS_RECOMMENDED) {
- pr_info("Hyper-V: Using MSR based APIC access\n");
+ pr_info("Hyper-V: Using enlightened APIC (%s mode)",
+ x2apic_enabled() ? "x2apic" : "xapic");
+ /*
+ * With x2apic, architectural x2apic MSRs are equivalent to the
+ * respective synthetic MSRs, so there's no need to override
+ * the apic accessors. The only exception is
+ * hv_apic_eoi_write, because it benefits from lazy EOI when
+ * available, but it works for both xapic and x2apic modes.
+ */
apic_set_eoi_write(hv_apic_eoi_write);
- apic->read = hv_apic_read;
- apic->write = hv_apic_write;
- apic->icr_write = hv_apic_icr_write;
- apic->icr_read = hv_apic_icr_read;
+ if (!x2apic_enabled()) {
+ apic->read = hv_apic_read;
+ apic->write = hv_apic_write;
+ apic->icr_write = hv_apic_icr_write;
+ apic->icr_read = hv_apic_icr_read;
+ }
}
}
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index cff3f3f3bfe0..8348f7d69fd5 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_CPU_ENTRY_AREA_H
#define _ASM_X86_CPU_ENTRY_AREA_H
diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h
index f04622500da3..c606c0b70738 100644
--- a/arch/x86/include/asm/intel-family.h
+++ b/arch/x86/include/asm/intel-family.h
@@ -83,6 +83,9 @@
#define INTEL_FAM6_TIGERLAKE_L 0x8C
#define INTEL_FAM6_TIGERLAKE 0x8D
+#define INTEL_FAM6_COMETLAKE 0xA5
+#define INTEL_FAM6_COMETLAKE_L 0xA6
+
/* "Small Core" Processors (Atom) */
#define INTEL_FAM6_ATOM_BONNELL 0x1C /* Diamondville, Pineview */
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 23edf56cf577..24d6598dea29 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -219,13 +219,6 @@ enum {
PFERR_WRITE_MASK | \
PFERR_PRESENT_MASK)
-/*
- * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
- * Access Tracking SPTEs. We use bit 62 instead of bit 63 to avoid conflicting
- * with the SVE bit in EPT PTEs.
- */
-#define SPTE_SPECIAL_MASK (1ULL << 62)
-
/* apic attention bits */
#define KVM_APIC_CHECK_VAPIC 0
/*
@@ -1196,7 +1189,7 @@ struct kvm_x86_ops {
int (*set_nested_state)(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
struct kvm_nested_state *kvm_state);
- void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+ bool (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
int (*smi_allowed)(struct kvm_vcpu *vcpu);
int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index e28f8b723b5c..9d5252c9685c 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -21,7 +21,7 @@
#define MWAIT_ECX_INTERRUPT_BREAK 0x1
#define MWAITX_ECX_TIMER_ENABLE BIT(1)
#define MWAITX_MAX_LOOPS ((u32)-1)
-#define MWAITX_DISABLE_CSTATES 0xf
+#define MWAITX_DISABLE_CSTATES 0xf0
static inline void __monitor(const void *eax, unsigned long ecx,
unsigned long edx)
diff --git a/arch/x86/include/asm/pti.h b/arch/x86/include/asm/pti.h
index 5df09a0b80b8..07375b476c4f 100644
--- a/arch/x86/include/asm/pti.h
+++ b/arch/x86/include/asm/pti.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PTI_H
#define _ASM_X86_PTI_H
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 35c225ede0e4..61d93f062a36 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -734,5 +734,28 @@ do { \
if (unlikely(__gu_err)) goto err_label; \
} while (0)
+/*
+ * We want the unsafe accessors to always be inlined and use
+ * the error labels - thus the macro games.
+ */
+#define unsafe_copy_loop(dst, src, len, type, label) \
+ while (len >= sizeof(type)) { \
+ unsafe_put_user(*(type *)src,(type __user *)dst,label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+#define unsafe_copy_to_user(_dst,_src,_len,label) \
+do { \
+ char __user *__ucu_dst = (_dst); \
+ const char *__ucu_src = (_src); \
+ size_t __ucu_len = (_len); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \
+ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \
+} while (0)
+
#endif /* _ASM_X86_UACCESS_H */
diff --git a/arch/x86/include/asm/vmware.h b/arch/x86/include/asm/vmware.h
index e00c9e875933..ac9fc51e2b18 100644
--- a/arch/x86/include/asm/vmware.h
+++ b/arch/x86/include/asm/vmware.h
@@ -4,6 +4,7 @@
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
+#include <linux/stringify.h>
/*
* The hypercall definitions differ in the low word of the %edx argument
@@ -20,8 +21,8 @@
*/
/* Old port-based version */
-#define VMWARE_HYPERVISOR_PORT "0x5658"
-#define VMWARE_HYPERVISOR_PORT_HB "0x5659"
+#define VMWARE_HYPERVISOR_PORT 0x5658
+#define VMWARE_HYPERVISOR_PORT_HB 0x5659
/* Current vmcall / vmmcall version */
#define VMWARE_HYPERVISOR_HB BIT(0)
@@ -29,7 +30,8 @@
/* The low bandwidth call. The low word of edx is presumed clear. */
#define VMWARE_HYPERCALL \
- ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT ", %%dx; inl (%%dx)", \
+ ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT) ", %%dx; " \
+ "inl (%%dx), %%eax", \
"vmcall", X86_FEATURE_VMCALL, \
"vmmcall", X86_FEATURE_VMW_VMMCALL)
@@ -38,7 +40,8 @@
* HB and OUT bits set.
*/
#define VMWARE_HYPERCALL_HB_OUT \
- ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT_HB ", %%dx; rep outsb", \
+ ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \
+ "rep outsb", \
"vmcall", X86_FEATURE_VMCALL, \
"vmmcall", X86_FEATURE_VMW_VMMCALL)
@@ -47,7 +50,8 @@
* HB bit set.
*/
#define VMWARE_HYPERCALL_HB_IN \
- ALTERNATIVE_2("movw $" VMWARE_HYPERVISOR_PORT_HB ", %%dx; rep insb", \
+ ALTERNATIVE_2("movw $" __stringify(VMWARE_HYPERVISOR_PORT_HB) ", %%dx; " \
+ "rep insb", \
"vmcall", X86_FEATURE_VMCALL, \
"vmmcall", X86_FEATURE_VMW_VMMCALL)
#endif
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 9e2dd2b296cd..2b0faf86da1b 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1586,9 +1586,6 @@ static void setup_local_APIC(void)
{
int cpu = smp_processor_id();
unsigned int value;
-#ifdef CONFIG_X86_32
- int logical_apicid, ldr_apicid;
-#endif
if (disable_apic) {
disable_ioapic_support();
@@ -1626,16 +1623,21 @@ static void setup_local_APIC(void)
apic->init_apic_ldr();
#ifdef CONFIG_X86_32
- /*
- * APIC LDR is initialized. If logical_apicid mapping was
- * initialized during get_smp_config(), make sure it matches the
- * actual value.
- */
- logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
- ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
- WARN_ON(logical_apicid != BAD_APICID && logical_apicid != ldr_apicid);
- /* always use the value from LDR */
- early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+ if (apic->dest_logical) {
+ int logical_apicid, ldr_apicid;
+
+ /*
+ * APIC LDR is initialized. If logical_apicid mapping was
+ * initialized during get_smp_config(), make sure it matches
+ * the actual value.
+ */
+ logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
+ ldr_apicid = GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
+ if (logical_apicid != BAD_APICID)
+ WARN_ON(logical_apicid != ldr_apicid);
+ /* Always use the value from LDR. */
+ early_per_cpu(x86_cpu_to_logical_apicid, cpu) = ldr_apicid;
+ }
#endif
/*
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 45e92cba92f5..b0889c48a2ac 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -156,7 +156,8 @@ static int x2apic_dead_cpu(unsigned int dead_cpu)
{
struct cluster_mask *cmsk = per_cpu(cluster_masks, dead_cpu);
- cpumask_clear_cpu(dead_cpu, &cmsk->mask);
+ if (cmsk)
+ cpumask_clear_cpu(dead_cpu, &cmsk->mask);
free_cpumask_var(per_cpu(ipi_mask, dead_cpu));
return 0;
}
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index 267daad8c036..c656d92cd708 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -216,6 +216,10 @@ static void __init ms_hyperv_init_platform(void)
int hv_host_info_ecx;
int hv_host_info_edx;
+#ifdef CONFIG_PARAVIRT
+ pv_info.name = "Hyper-V";
+#endif
+
/*
* Extract the features and hints
*/
diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
index efbd54cc4e69..055c8613b531 100644
--- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
+++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c
@@ -522,6 +522,10 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
int ret = 0;
rdtgrp = rdtgroup_kn_lock_live(of->kn);
+ if (!rdtgrp) {
+ ret = -ENOENT;
+ goto out;
+ }
md.priv = of->kn->priv;
resid = md.u.rid;
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 9735139cfdf8..46d732696c1c 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -49,7 +49,7 @@
#define VMWARE_CMD_VCPU_RESERVED 31
#define VMWARE_PORT(cmd, eax, ebx, ecx, edx) \
- __asm__("inl (%%dx)" : \
+ __asm__("inl (%%dx), %%eax" : \
"=a"(eax), "=c"(ecx), "=d"(edx), "=b"(ebx) : \
"a"(VMWARE_HYPERVISOR_MAGIC), \
"c"(VMWARE_CMD_##cmd), \
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 753b8cfe8b8a..87b97897a881 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -94,6 +94,13 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
+ /*
+ * Handle the case where stack trace is collected _before_
+ * cea_exception_stacks had been initialized.
+ */
+ if (!begin)
+ return false;
+
end = begin + sizeof(struct cea_exception_stacks);
/* Bail if @stack is outside the exception stack area. */
if (stk < begin || stk >= end)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 29ffa495bd1c..206a4b6144c2 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -222,13 +222,31 @@ unsigned long __head __startup_64(unsigned long physaddr,
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
+ *
+ * Only the region occupied by the kernel image has so far
+ * been checked against the table of usable memory regions
+ * provided by the firmware, so invalidate pages outside that
+ * region. A page table entry that maps to a reserved area of
+ * memory would allow processor speculation into that area,
+ * and on some hardware (particularly the UV platform) even
+ * speculative access to some reserved areas is caught as an
+ * error, causing the BIOS to halt the system.
*/
pmd = fixup_pointer(level2_kernel_pgt, physaddr);
- for (i = 0; i < PTRS_PER_PMD; i++) {
+
+ /* invalidate pages before the kernel image */
+ for (i = 0; i < pmd_index((unsigned long)_text); i++)
+ pmd[i] &= ~_PAGE_PRESENT;
+
+ /* fixup pages that are part of the kernel image */
+ for (; i <= pmd_index((unsigned long)_end); i++)
if (pmd[i] & _PAGE_PRESENT)
pmd[i] += load_delta;
- }
+
+ /* invalidate pages after the kernel image */
+ for (; i < PTRS_PER_PMD; i++)
+ pmd[i] &= ~_PAGE_PRESENT;
/*
* Fixup phys_base - remove the memory encryption mask to obtain
diff --git a/arch/x86/kernel/process.h b/arch/x86/kernel/process.h
index 320ab978fb1f..1d0797b2338a 100644
--- a/arch/x86/kernel/process.h
+++ b/arch/x86/kernel/process.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
//
// Code shared between 32 and 64 bit
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index c59454c382fd..7e322e2daaf5 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -1505,6 +1505,9 @@ void __init tsc_init(void)
return;
}
+ if (tsc_clocksource_reliable || no_tsc_watchdog)
+ clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
+
clocksource_register_khz(&clocksource_tsc_early, tsc_khz);
detect_art();
}
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 63316036f85a..f68c0c753c38 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -363,7 +363,7 @@ static inline void do_cpuid_7_mask(struct kvm_cpuid_entry2 *entry, int index)
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
+ F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
@@ -485,6 +485,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
/* cpuid 0x80000008.ebx */
const u32 kvm_cpuid_8000_0008_ebx_x86_features =
+ F(CLZERO) | F(XSAVEERPTR) |
F(WBNOINVD) | F(AMD_IBPB) | F(AMD_IBRS) | F(AMD_SSBD) | F(VIRT_SSBD) |
F(AMD_SSB_NO) | F(AMD_STIBP) | F(AMD_STIBP_ALWAYS_ON);
@@ -618,16 +619,20 @@ static inline int __do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 function,
*/
case 0x1f:
case 0xb: {
- int i, level_type;
+ int i;
- /* read more entries until level_type is zero */
- for (i = 1; ; ++i) {
+ /*
+ * We filled in entry[0] for CPUID(EAX=<function>,
+ * ECX=00H) above. If its level type (ECX[15:8]) is
+ * zero, then the leaf is unimplemented, and we're
+ * done. Otherwise, continue to populate entries
+ * until the level type (ECX[15:8]) of the previously
+ * added entry is zero.
+ */
+ for (i = 1; entry[i - 1].ecx & 0xff00; ++i) {
if (*nent >= maxnent)
goto out;
- level_type = entry[i - 1].ecx & 0xff00;
- if (!level_type)
- break;
do_host_cpuid(&entry[i], function, i);
++*nent;
}
@@ -969,53 +974,66 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
/*
- * If no match is found, check whether we exceed the vCPU's limit
- * and return the content of the highest valid _standard_ leaf instead.
- * This is to satisfy the CPUID specification.
+ * If the basic or extended CPUID leaf requested is higher than the
+ * maximum supported basic or extended leaf, respectively, then it is
+ * out of range.
*/
-static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
- u32 function, u32 index)
+static bool cpuid_function_in_range(struct kvm_vcpu *vcpu, u32 function)
{
- struct kvm_cpuid_entry2 *maxlevel;
-
- maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
- if (!maxlevel || maxlevel->eax >= function)
- return NULL;
- if (function & 0x80000000) {
- maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
- if (!maxlevel)
- return NULL;
- }
- return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
+ struct kvm_cpuid_entry2 *max;
+
+ max = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
+ return max && function <= max->eax;
}
bool kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx,
u32 *ecx, u32 *edx, bool check_limit)
{
u32 function = *eax, index = *ecx;
- struct kvm_cpuid_entry2 *best;
- bool entry_found = true;
-
- best = kvm_find_cpuid_entry(vcpu, function, index);
-
- if (!best) {
- entry_found = false;
- if (!check_limit)
- goto out;
+ struct kvm_cpuid_entry2 *entry;
+ struct kvm_cpuid_entry2 *max;
+ bool found;
- best = check_cpuid_limit(vcpu, function, index);
+ entry = kvm_find_cpuid_entry(vcpu, function, index);
+ found = entry;
+ /*
+ * Intel CPUID semantics treats any query for an out-of-range
+ * leaf as if the highest basic leaf (i.e. CPUID.0H:EAX) were
+ * requested. AMD CPUID semantics returns all zeroes for any
+ * undefined leaf, whether or not the leaf is in range.
+ */
+ if (!entry && check_limit && !guest_cpuid_is_amd(vcpu) &&
+ !cpuid_function_in_range(vcpu, function)) {
+ max = kvm_find_cpuid_entry(vcpu, 0, 0);
+ if (max) {
+ function = max->eax;
+ entry = kvm_find_cpuid_entry(vcpu, function, index);
+ }
}
-
-out:
- if (best) {
- *eax = best->eax;
- *ebx = best->ebx;
- *ecx = best->ecx;
- *edx = best->edx;
- } else
+ if (entry) {
+ *eax = entry->eax;
+ *ebx = entry->ebx;
+ *ecx = entry->ecx;
+ *edx = entry->edx;
+ } else {
*eax = *ebx = *ecx = *edx = 0;
- trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, entry_found);
- return entry_found;
+ /*
+ * When leaf 0BH or 1FH is defined, CL is pass-through
+ * and EDX is always the x2APIC ID, even for undefined
+ * subleaves. Index 1 will exist iff the leaf is
+ * implemented, so we pass through CL iff leaf 1
+ * exists. EDX can be copied from any existing index.
+ */
+ if (function == 0xb || function == 0x1f) {
+ entry = kvm_find_cpuid_entry(vcpu, function, 1);
+ if (entry) {
+ *ecx = index & 0xff;
+ *edx = entry->edx;
+ }
+ }
+ }
+ trace_kvm_cpuid(function, *eax, *ebx, *ecx, *edx, found);
+ return found;
}
EXPORT_SYMBOL_GPL(kvm_cpuid);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 3a3a6854dcca..b29d00b661ff 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -66,9 +66,10 @@
#define X2APIC_BROADCAST 0xFFFFFFFFul
static bool lapic_timer_advance_dynamic __read_mostly;
-#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100
-#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 5000
-#define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
+#define LAPIC_TIMER_ADVANCE_ADJUST_MIN 100 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_ADJUST_MAX 10000 /* clock cycles */
+#define LAPIC_TIMER_ADVANCE_NS_INIT 1000
+#define LAPIC_TIMER_ADVANCE_NS_MAX 5000
/* step-by-step approximation to mitigate fluctuation */
#define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
@@ -110,11 +111,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
-{
- return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
-}
-
static inline u32 kvm_x2apic_id(struct kvm_lapic *apic)
{
return apic->vcpu->vcpu_id;
@@ -1504,8 +1500,8 @@ static inline void adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
}
- if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_ADJUST_MAX))
- timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+ if (unlikely(timer_advance_ns > LAPIC_TIMER_ADVANCE_NS_MAX))
+ timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
}
@@ -2302,7 +2298,7 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns)
HRTIMER_MODE_ABS_HARD);
apic->lapic_timer.timer.function = apic_timer_fn;
if (timer_advance_ns == -1) {
- apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
+ apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_NS_INIT;
lapic_timer_advance_dynamic = true;
} else {
apic->lapic_timer.timer_advance_ns = timer_advance_ns;
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2aad7e226fc0..1f5014852e20 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -242,4 +242,9 @@ static inline enum lapic_mode kvm_apic_mode(u64 apic_base)
return apic_base & (MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE);
}
+static inline u8 kvm_xapic_id(struct kvm_lapic *apic)
+{
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
+}
+
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5269aa057dfa..24c23c66b226 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -83,7 +83,17 @@ module_param(dbg, bool, 0644);
#define PTE_PREFETCH_NUM 8
#define PT_FIRST_AVAIL_BITS_SHIFT 10
-#define PT64_SECOND_AVAIL_BITS_SHIFT 52
+#define PT64_SECOND_AVAIL_BITS_SHIFT 54
+
+/*
+ * The mask used to denote special SPTEs, which can be either MMIO SPTEs or
+ * Access Tracking SPTEs.
+ */
+#define SPTE_SPECIAL_MASK (3ULL << 52)
+#define SPTE_AD_ENABLED_MASK (0ULL << 52)
+#define SPTE_AD_DISABLED_MASK (1ULL << 52)
+#define SPTE_AD_WRPROT_ONLY_MASK (2ULL << 52)
+#define SPTE_MMIO_MASK (3ULL << 52)
#define PT64_LEVEL_BITS 9
@@ -219,12 +229,11 @@ static u64 __read_mostly shadow_present_mask;
static u64 __read_mostly shadow_me_mask;
/*
- * SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
- * Non-present SPTEs with shadow_acc_track_value set are in place for access
- * tracking.
+ * SPTEs used by MMUs without A/D bits are marked with SPTE_AD_DISABLED_MASK;
+ * shadow_acc_track_mask is the set of bits to be cleared in non-accessed
+ * pages.
*/
static u64 __read_mostly shadow_acc_track_mask;
-static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
/*
* The mask/shift to use for saving the original R/X bits when marking the PTE
@@ -304,7 +313,7 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask)
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
BUG_ON((mmio_mask & mmio_value) != mmio_value);
- shadow_mmio_value = mmio_value | SPTE_SPECIAL_MASK;
+ shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
shadow_mmio_access_mask = access_mask;
}
@@ -320,10 +329,27 @@ static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
return sp->role.ad_disabled;
}
+static inline bool kvm_vcpu_ad_need_write_protect(struct kvm_vcpu *vcpu)
+{
+ /*
+ * When using the EPT page-modification log, the GPAs in the log
+ * would come from L2 rather than L1. Therefore, we need to rely
+ * on write protection to record dirty pages. This also bypasses
+ * PML, since writes now result in a vmexit.
+ */
+ return vcpu->arch.mmu == &vcpu->arch.guest_mmu;
+}
+
static inline bool spte_ad_enabled(u64 spte)
{
MMU_WARN_ON(is_mmio_spte(spte));
- return !(spte & shadow_acc_track_value);
+ return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_DISABLED_MASK;
+}
+
+static inline bool spte_ad_need_write_protect(u64 spte)
+{
+ MMU_WARN_ON(is_mmio_spte(spte));
+ return (spte & SPTE_SPECIAL_MASK) != SPTE_AD_ENABLED_MASK;
}
static inline u64 spte_shadow_accessed_mask(u64 spte)
@@ -461,7 +487,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
{
BUG_ON(!dirty_mask != !accessed_mask);
BUG_ON(!accessed_mask && !acc_track_mask);
- BUG_ON(acc_track_mask & shadow_acc_track_value);
+ BUG_ON(acc_track_mask & SPTE_SPECIAL_MASK);
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
@@ -1589,16 +1615,16 @@ static bool spte_clear_dirty(u64 *sptep)
rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
+ MMU_WARN_ON(!spte_ad_enabled(spte));
spte &= ~shadow_dirty_mask;
-
return mmu_spte_update(sptep, spte);
}
-static bool wrprot_ad_disabled_spte(u64 *sptep)
+static bool spte_wrprot_for_clear_dirty(u64 *sptep)
{
bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
(unsigned long *)sptep);
- if (was_writable)
+ if (was_writable && !spte_ad_enabled(*sptep))
kvm_set_pfn_dirty(spte_to_pfn(*sptep));
return was_writable;
@@ -1617,10 +1643,10 @@ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head)
bool flush = false;
for_each_rmap_spte(rmap_head, &iter, sptep)
- if (spte_ad_enabled(*sptep))
- flush |= spte_clear_dirty(sptep);
+ if (spte_ad_need_write_protect(*sptep))
+ flush |= spte_wrprot_for_clear_dirty(sptep);
else
- flush |= wrprot_ad_disabled_spte(sptep);
+ flush |= spte_clear_dirty(sptep);
return flush;
}
@@ -1631,6 +1657,11 @@ static bool spte_set_dirty(u64 *sptep)
rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
+ /*
+ * Similar to the !kvm_x86_ops->slot_disable_log_dirty case,
+ * do not bother adding back write access to pages marked
+ * SPTE_AD_WRPROT_ONLY_MASK.
+ */
spte |= shadow_dirty_mask;
return mmu_spte_update(sptep, spte);
@@ -2622,7 +2653,7 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
shadow_user_mask | shadow_x_mask | shadow_me_mask;
if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
+ spte |= SPTE_AD_DISABLED_MASK;
else
spte |= shadow_accessed_mask;
@@ -2968,7 +2999,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
sp = page_header(__pa(sptep));
if (sp_ad_disabled(sp))
- spte |= shadow_acc_track_value;
+ spte |= SPTE_AD_DISABLED_MASK;
+ else if (kvm_vcpu_ad_need_write_protect(vcpu))
+ spte |= SPTE_AD_WRPROT_ONLY_MASK;
/*
* For the EPT case, shadow_present_mask is 0 if hardware
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f8ecb6df5106..c5673bda4b66 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -734,8 +734,14 @@ static int get_npt_level(struct kvm_vcpu *vcpu)
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -4591,6 +4597,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
if (ldr == svm->ldr_reg)
return 0;
@@ -4598,7 +4605,7 @@ static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
avic_invalidate_logical_id_entry(vcpu);
if (ldr)
- ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr);
+ ret = avic_ldr_write(vcpu, id, ldr);
if (!ret)
svm->ldr_reg = ldr;
@@ -4610,8 +4617,7 @@ static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu)
{
u64 *old, *new;
struct vcpu_svm *svm = to_svm(vcpu);
- u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
- u32 id = (apic_id_reg >> 24) & 0xff;
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
if (vcpu->vcpu_id == id)
return 0;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 41abc62c9a8a..0e7c9301fe86 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -2610,7 +2610,7 @@ static int nested_check_vm_entry_controls(struct kvm_vcpu *vcpu,
/* VM-entry exception error code */
if (CC(has_error_code &&
- vmcs12->vm_entry_exception_error_code & GENMASK(31, 15)))
+ vmcs12->vm_entry_exception_error_code & GENMASK(31, 16)))
return -EINVAL;
/* VM-entry interruption-info field: reserved bits */
@@ -2917,7 +2917,7 @@ static int nested_vmx_check_vmentry_hw(struct kvm_vcpu *vcpu)
static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12);
-static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
+static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2937,19 +2937,18 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
vmx->nested.apic_access_page = NULL;
}
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
- /*
- * If translation failed, no matter: This feature asks
- * to exit when accessing the given address, and if it
- * can never be accessed, this feature won't do
- * anything anyway.
- */
if (!is_error_page(page)) {
vmx->nested.apic_access_page = page;
hpa = page_to_phys(vmx->nested.apic_access_page);
vmcs_write64(APIC_ACCESS_ADDR, hpa);
} else {
- secondary_exec_controls_clearbit(vmx,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+ pr_debug_ratelimited("%s: no backing 'struct page' for APIC-access address in vmcs12\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+ return false;
}
}
@@ -2994,6 +2993,7 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
else
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+ return true;
}
/*
@@ -3032,13 +3032,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
/*
* If from_vmentry is false, this is being called from state restore (either RSM
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
-+ *
-+ * Returns:
-+ * 0 - success, i.e. proceed with actual VMEnter
-+ * 1 - consistency check VMExit
-+ * -1 - consistency check VMFail
+ *
+ * Returns:
+ * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
+ * NVMX_ENTRY_VMFAIL: Consistency check VMFail
+ * NVMX_ENTRY_VMEXIT: Consistency check VMExit
+ * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
*/
-int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -3081,11 +3083,12 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
prepare_vmcs02_early(vmx, vmcs12);
if (from_vmentry) {
- nested_get_vmcs12_pages(vcpu);
+ if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+ return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
- return -1;
+ return NVMX_VMENTRY_VMFAIL;
}
if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
@@ -3149,7 +3152,7 @@ int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
* returned as far as L1 is concerned. It will only return (and set
* the success flag) when L2 exits (see nested_vmx_vmexit()).
*/
- return 0;
+ return NVMX_VMENTRY_SUCCESS;
/*
* A failed consistency check that leads to a VMExit during L1's
@@ -3165,14 +3168,14 @@ vmentry_fail_vmexit:
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
if (!from_vmentry)
- return 1;
+ return NVMX_VMENTRY_VMEXIT;
load_vmcs12_host_state(vcpu, vmcs12);
vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
vmcs12->exit_qualification = exit_qual;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
vmx->nested.need_vmcs12_to_shadow_sync = true;
- return 1;
+ return NVMX_VMENTRY_VMEXIT;
}
/*
@@ -3182,9 +3185,9 @@ vmentry_fail_vmexit:
static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
{
struct vmcs12 *vmcs12;
+ enum nvmx_vmentry_status status;
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
- int ret;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -3244,13 +3247,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
* the nested entry.
*/
vmx->nested.nested_run_pending = 1;
- ret = nested_vmx_enter_non_root_mode(vcpu, true);
- vmx->nested.nested_run_pending = !ret;
- if (ret > 0)
- return 1;
- else if (ret)
- return nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ status = nested_vmx_enter_non_root_mode(vcpu, true);
+ if (unlikely(status != NVMX_VMENTRY_SUCCESS))
+ goto vmentry_failed;
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -3281,6 +3280,15 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
return kvm_vcpu_halt(vcpu);
}
return 1;
+
+vmentry_failed:
+ vmx->nested.nested_run_pending = 0;
+ if (status == NVMX_VMENTRY_KVM_INTERNAL_ERROR)
+ return 0;
+ if (status == NVMX_VMENTRY_VMEXIT)
+ return 1;
+ WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
+ return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}
/*
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 187d39bf0bf1..6280f33e5fa6 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -6,6 +6,16 @@
#include "vmcs12.h"
#include "vmx.h"
+/*
+ * Status returned by nested_vmx_enter_non_root_mode():
+ */
+enum nvmx_vmentry_status {
+ NVMX_VMENTRY_SUCCESS, /* Entered VMX non-root mode */
+ NVMX_VMENTRY_VMFAIL, /* Consistency check VMFail */
+ NVMX_VMENTRY_VMEXIT, /* Consistency check VMExit */
+ NVMX_VMENTRY_KVM_INTERNAL_ERROR,/* KVM internal error */
+};
+
void vmx_leave_nested(struct kvm_vcpu *vcpu);
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
bool apicv);
@@ -13,7 +23,8 @@ void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
void nested_vmx_vcpu_setup(void);
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
-int nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry);
+enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
+ bool from_vmentry);
bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
u32 exit_intr_info, unsigned long exit_qualification);
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 4dea0e0e7e39..3e9c059099e9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -262,6 +262,7 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct x86_pmu_capability x86_pmu;
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
@@ -283,8 +284,10 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
if (!pmu->version)
return;
+ perf_get_x86_pmu_capability(&x86_pmu);
+
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
- INTEL_PMC_MAX_GENERIC);
+ x86_pmu.num_counters_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
pmu->available_event_types = ~entry->ebx &
((1ull << eax.split.mask_length) - 1);
@@ -294,7 +297,7 @@ static void intel_pmu_refresh(struct kvm_vcpu *vcpu)
} else {
pmu->nr_arch_fixed_counters =
min_t(int, edx.split.num_counters_fixed,
- INTEL_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d4575ffb3cec..5d21a4ab28cf 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -209,6 +209,11 @@ static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
struct page *page;
unsigned int i;
+ if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
+ l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
+ return 0;
+ }
+
if (!enable_ept) {
l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
return 0;
@@ -964,17 +969,9 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
- if (!enable_ept) {
- /*
- * NX is needed to handle CR0.WP=1, CR4.SMEP=1. Testing
- * host CPUID is more efficient than testing guest CPUID
- * or CR4. Host SMEP is anyway a requirement for guest SMEP.
- */
- if (boot_cpu_has(X86_FEATURE_SMEP))
- guest_efer |= EFER_NX;
- else if (!(guest_efer & EFER_NX))
- ignore_bits |= EFER_NX;
- }
+ /* Shadow paging assumes NX to be available. */
+ if (!enable_ept)
+ guest_efer |= EFER_NX;
/*
* LMA and LME handled by hardware; SCE meaningless outside long mode.
@@ -5538,14 +5535,6 @@ static int handle_encls(struct kvm_vcpu *vcpu)
return 1;
}
-static int handle_unexpected_vmexit(struct kvm_vcpu *vcpu)
-{
- kvm_skip_emulated_instruction(vcpu);
- WARN_ONCE(1, "Unexpected VM-Exit Reason = 0x%x",
- vmcs_read32(VM_EXIT_REASON));
- return 1;
-}
-
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -5597,15 +5586,11 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_INVVPID] = handle_vmx_instruction,
[EXIT_REASON_RDRAND] = handle_invalid_op,
[EXIT_REASON_RDSEED] = handle_invalid_op,
- [EXIT_REASON_XSAVES] = handle_unexpected_vmexit,
- [EXIT_REASON_XRSTORS] = handle_unexpected_vmexit,
[EXIT_REASON_PML_FULL] = handle_pml_full,
[EXIT_REASON_INVPCID] = handle_invpcid,
[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
[EXIT_REASON_ENCLS] = handle_encls,
- [EXIT_REASON_UMWAIT] = handle_unexpected_vmexit,
- [EXIT_REASON_TPAUSE] = handle_unexpected_vmexit,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7995,12 +7980,10 @@ static int __init vmx_init(void)
* contain 'auto' which will be turned into the default 'cond'
* mitigation mode.
*/
- if (boot_cpu_has(X86_BUG_L1TF)) {
- r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
- if (r) {
- vmx_exit();
- return r;
- }
+ r = vmx_setup_l1d_flush(vmentry_l1d_flush_param);
+ if (r) {
+ vmx_exit();
+ return r;
}
#ifdef CONFIG_KEXEC_CORE
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0ed07d8d2caa..ff395f812719 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,8 +92,8 @@ u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
-#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -212,7 +212,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
{ "mmu_unsync", VM_STAT(mmu_unsync) },
{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages) },
+ { "largepages", VM_STAT(lpages, .mode = 0444) },
{ "max_mmu_page_hash_collisions",
VM_STAT(max_mmu_page_hash_collisions) },
{ NULL }
@@ -360,8 +360,7 @@ EXPORT_SYMBOL_GPL(kvm_set_apic_base);
asmlinkage __visible void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
- if (!kvm_rebooting)
- BUG();
+ BUG_ON(!kvm_rebooting);
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);
@@ -885,34 +884,42 @@ int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- unsigned long old_cr4 = kvm_read_cr4(vcpu);
- unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
-
if (cr4 & CR4_RESERVED_BITS)
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
- return 1;
+ return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ return -EINVAL;
+
+ return 0;
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ unsigned long old_cr4 = kvm_read_cr4(vcpu);
+ unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+
+ if (kvm_valid_cr4(vcpu, cr4))
return 1;
if (is_long_mode(vcpu)) {
@@ -1161,13 +1168,6 @@ static u32 msrs_to_save[] = {
MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
- MSR_ARCH_PERFMON_PERFCTR0 + 18, MSR_ARCH_PERFMON_PERFCTR0 + 19,
- MSR_ARCH_PERFMON_PERFCTR0 + 20, MSR_ARCH_PERFMON_PERFCTR0 + 21,
- MSR_ARCH_PERFMON_PERFCTR0 + 22, MSR_ARCH_PERFMON_PERFCTR0 + 23,
- MSR_ARCH_PERFMON_PERFCTR0 + 24, MSR_ARCH_PERFMON_PERFCTR0 + 25,
- MSR_ARCH_PERFMON_PERFCTR0 + 26, MSR_ARCH_PERFMON_PERFCTR0 + 27,
- MSR_ARCH_PERFMON_PERFCTR0 + 28, MSR_ARCH_PERFMON_PERFCTR0 + 29,
- MSR_ARCH_PERFMON_PERFCTR0 + 30, MSR_ARCH_PERFMON_PERFCTR0 + 31,
MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
@@ -1177,13 +1177,6 @@ static u32 msrs_to_save[] = {
MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
- MSR_ARCH_PERFMON_EVENTSEL0 + 18, MSR_ARCH_PERFMON_EVENTSEL0 + 19,
- MSR_ARCH_PERFMON_EVENTSEL0 + 20, MSR_ARCH_PERFMON_EVENTSEL0 + 21,
- MSR_ARCH_PERFMON_EVENTSEL0 + 22, MSR_ARCH_PERFMON_EVENTSEL0 + 23,
- MSR_ARCH_PERFMON_EVENTSEL0 + 24, MSR_ARCH_PERFMON_EVENTSEL0 + 25,
- MSR_ARCH_PERFMON_EVENTSEL0 + 26, MSR_ARCH_PERFMON_EVENTSEL0 + 27,
- MSR_ARCH_PERFMON_EVENTSEL0 + 28, MSR_ARCH_PERFMON_EVENTSEL0 + 29,
- MSR_ARCH_PERFMON_EVENTSEL0 + 30, MSR_ARCH_PERFMON_EVENTSEL0 + 31,
};
static unsigned num_msrs_to_save;
@@ -2543,6 +2536,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
static void kvmclock_reset(struct kvm_vcpu *vcpu)
{
vcpu->arch.pv_time_enabled = false;
+ vcpu->arch.time = 0;
}
static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
@@ -2708,8 +2702,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_KVM_SYSTEM_TIME: {
struct kvm_arch *ka = &vcpu->kvm->arch;
- kvmclock_reset(vcpu);
-
if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
@@ -2723,14 +2715,13 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
/* we verify if the enable bit is set... */
+ vcpu->arch.pv_time_enabled = false;
if (!(data & 1))
break;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
&vcpu->arch.pv_time, data & ~1ULL,
sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = false;
- else
vcpu->arch.pv_time_enabled = true;
break;
@@ -5097,13 +5088,14 @@ out:
static void kvm_init_msr_list(void)
{
+ struct x86_pmu_capability x86_pmu;
u32 dummy[2];
unsigned i, j;
BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
"Please update the fixed PMCs in msrs_to_save[]");
- BUILD_BUG_ON_MSG(INTEL_PMC_MAX_GENERIC != 32,
- "Please update the generic perfctr/eventsel MSRs in msrs_to_save[]");
+
+ perf_get_x86_pmu_capability(&x86_pmu);
for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) {
if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
@@ -5145,6 +5137,15 @@ static void kvm_init_msr_list(void)
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
break;
+ case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
+ if (msrs_to_save[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ continue;
+ break;
+ case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
+ if (msrs_to_save[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
+ min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
+ continue;
}
default:
break;
@@ -7937,8 +7938,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
- kvm_x86_ops->get_vmcs12_pages(vcpu);
+ if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
+ r = 0;
+ goto out;
+ }
+ }
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
kvm_mmu_unload(vcpu);
if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -8714,10 +8719,6 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);
static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
{
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- (sregs->cr4 & X86_CR4_OSXSAVE))
- return -EINVAL;
-
if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
/*
* When EFER.LME and CR0.PG are set, the processor is in
@@ -8736,7 +8737,7 @@ static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
return -EINVAL;
}
- return 0;
+ return kvm_valid_cr4(vcpu, sregs->cr4);
}
static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index b7375dc6898f..c126571e5e2e 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -113,8 +113,8 @@ static void delay_mwaitx(unsigned long __loops)
__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);
/*
- * AMD, like Intel, supports the EAX hint and EAX=0xf
- * means, do not enter any deep C-state and we use it
+ * AMD, like Intel's MWAIT version, supports the EAX hint and
+ * EAX=0xf0 means, do not enter any deep C-state and we use it
* here in delay() to minimize wakeup latency.
*/
__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index c202e1b07e29..425e025341db 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -917,9 +917,6 @@ static void __init kexec_enter_virtual_mode(void)
if (efi_enabled(EFI_OLD_MEMMAP) && (__supported_pte_mask & _PAGE_NX))
runtime_code_page_mkexec();
-
- /* clean DUMMY object */
- efi_delete_dummy_variable();
#endif
}
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index 0d3365cb64de..a04551ee5568 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -57,19 +57,7 @@ static efi_system_table_t __init *xen_efi_probe(void)
return NULL;
/* Here we know that Xen runs on EFI platform. */
-
- efi.get_time = xen_efi_get_time;
- efi.set_time = xen_efi_set_time;
- efi.get_wakeup_time = xen_efi_get_wakeup_time;
- efi.set_wakeup_time = xen_efi_set_wakeup_time;
- efi.get_variable = xen_efi_get_variable;
- efi.get_next_variable = xen_efi_get_next_variable;
- efi.set_variable = xen_efi_set_variable;
- efi.query_variable_info = xen_efi_query_variable_info;
- efi.update_capsule = xen_efi_update_capsule;
- efi.query_capsule_caps = xen_efi_query_capsule_caps;
- efi.get_next_high_mono_count = xen_efi_get_next_high_mono_count;
- efi.reset_system = xen_efi_reset_system;
+ xen_efi_runtime_setup();
efi_systab_xen.tables = info->cfg.addr;
efi_systab_xen.nr_tables = info->cfg.nent;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 750f46ad018a..205b1176084f 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -269,19 +269,41 @@ void xen_reboot(int reason)
BUG();
}
+static int reboot_reason = SHUTDOWN_reboot;
+static bool xen_legacy_crash;
void xen_emergency_restart(void)
{
- xen_reboot(SHUTDOWN_reboot);
+ xen_reboot(reboot_reason);
}
static int
xen_panic_event(struct notifier_block *this, unsigned long event, void *ptr)
{
- if (!kexec_crash_loaded())
- xen_reboot(SHUTDOWN_crash);
+ if (!kexec_crash_loaded()) {
+ if (xen_legacy_crash)
+ xen_reboot(SHUTDOWN_crash);
+
+ reboot_reason = SHUTDOWN_crash;
+
+ /*
+ * If panic_timeout==0 then we are supposed to wait forever.
+ * However, to preserve original dom0 behavior we have to drop
+ * into hypervisor. (domU behavior is controlled by its
+ * config file)
+ */
+ if (panic_timeout == 0)
+ panic_timeout = -1;
+ }
return NOTIFY_DONE;
}
+static int __init parse_xen_legacy_crash(char *arg)
+{
+ xen_legacy_crash = true;
+ return 0;
+}
+early_param("xen_legacy_crash", parse_xen_legacy_crash);
+
static struct notifier_block xen_panic_block = {
.notifier_call = xen_panic_event,
.priority = INT_MIN
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 58f79ab32358..5bfea374a160 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -117,6 +117,14 @@ static void __init xen_banner(void)
printk(KERN_INFO "Xen version: %d.%d%s%s\n",
version >> 16, version & 0xffff, extra.extraversion,
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
+
+#ifdef CONFIG_X86_32
+ pr_warn("WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n"
+ "Support for running as 32-bit PV-guest under Xen will soon be removed\n"
+ "from the Linux kernel!\n"
+ "Please use either a 64-bit kernel or switch to HVM or PVH mode!\n"
+ "WARNING! WARNING! WARNING! WARNING! WARNING! WARNING! WARNING!\n");
+#endif
}
static void __init xen_pv_init_platform(void)