summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2017-11-13 14:13:48 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2017-11-13 14:13:48 -0800
commitd6ec9d9a4def52a5094237564eaf6f6979fd7a27 (patch)
treeadfb80f83f04a021e82cb25227b64b1bb9e793dc /arch/x86/kernel
parent3e2014637c50e5d6a77cd63d5db6c209fe29d1b1 (diff)
parent91a6a6cfee8ad34ea4cc10a54c0765edfe437cdb (diff)
downloadlinux-d6ec9d9a4def52a5094237564eaf6f6979fd7a27.tar.bz2
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 core updates from Ingo Molnar: "Note that in this cycle most of the x86 topics interacted at a level that caused them to be merged into tip:x86/asm - but this should be a temporary phenomenon, hopefully we'll back to the usual patterns in the next merge window. The main changes in this cycle were: Hardware enablement: - Add support for the Intel UMIP (User Mode Instruction Prevention) CPU feature. This is a security feature that disables certain instructions such as SGDT, SLDT, SIDT, SMSW and STR. (Ricardo Neri) [ Note that this is disabled by default for now, there are some smaller enhancements in the pipeline that I'll follow up with in the next 1-2 days, which allows this to be enabled by default.] - Add support for the AMD SEV (Secure Encrypted Virtualization) CPU feature, on top of SME (Secure Memory Encryption) support that was added in v4.14. (Tom Lendacky, Brijesh Singh) - Enable new SSE/AVX/AVX512 CPU features: AVX512_VBMI2, GFNI, VAES, VPCLMULQDQ, AVX512_VNNI, AVX512_BITALG. (Gayatri Kammela) Other changes: - A big series of entry code simplifications and enhancements (Andy Lutomirski) - Make the ORC unwinder default on x86 and various objtool enhancements. (Josh Poimboeuf) - 5-level paging enhancements (Kirill A. Shutemov) - Micro-optimize the entry code a bit (Borislav Petkov) - Improve the handling of interdependent CPU features in the early FPU init code (Andi Kleen) - Build system enhancements (Changbin Du, Masahiro Yamada) - ... plus misc enhancements, fixes and cleanups" * 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (118 commits) x86/build: Make the boot image generation less verbose selftests/x86: Add tests for the STR and SLDT instructions selftests/x86: Add tests for User-Mode Instruction Prevention x86/traps: Fix up general protection faults caused by UMIP x86/umip: Enable User-Mode Instruction Prevention at runtime x86/umip: Force a page fault when unable to copy emulated result to user x86/umip: Add emulation code for UMIP instructions x86/cpufeature: Add User-Mode Instruction Prevention definitions x86/insn-eval: Add support to resolve 16-bit address encodings x86/insn-eval: Handle 32-bit address encodings in virtual-8086 mode x86/insn-eval: Add wrapper function for 32 and 64-bit addresses x86/insn-eval: Add support to resolve 32-bit address encodings x86/insn-eval: Compute linear address in several utility functions resource: Fix resource_size.cocci warnings X86/KVM: Clear encryption attribute when SEV is active X86/KVM: Decrypt shared per-cpu variables when SEV is active percpu: Introduce DEFINE_PER_CPU_DECRYPTED x86: Add support for changing memory encryption attribute in early boot x86/io: Unroll string I/O when SEV is active x86/boot: Add early boot support when running with SEV active ...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile11
-rw-r--r--arch/x86/kernel/alternative.c26
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/common.c54
-rw-r--r--arch/x86/kernel/cpu/cpuid-deps.c121
-rw-r--r--arch/x86/kernel/crash.c18
-rw-r--r--arch/x86/kernel/fpu/init.c11
-rw-r--r--arch/x86/kernel/fpu/xstate.c43
-rw-r--r--arch/x86/kernel/head_32.S5
-rw-r--r--arch/x86/kernel/head_64.S45
-rw-r--r--arch/x86/kernel/kvm.c40
-rw-r--r--arch/x86/kernel/kvmclock.c65
-rw-r--r--arch/x86/kernel/ldt.c16
-rw-r--r--arch/x86/kernel/pmem.c2
-rw-r--r--arch/x86/kernel/process.c8
-rw-r--r--arch/x86/kernel/process_32.c6
-rw-r--r--arch/x86/kernel/process_64.c5
-rw-r--r--arch/x86/kernel/setup.c6
-rw-r--r--arch/x86/kernel/smpboot.c3
-rw-r--r--arch/x86/kernel/traps.c9
-rw-r--r--arch/x86/kernel/umip.c366
-rw-r--r--arch/x86/kernel/uprobes.c15
-rw-r--r--arch/x86/kernel/verify_cpu.S3
-rw-r--r--arch/x86/kernel/vm86_32.c20
24 files changed, 767 insertions, 132 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5f70044340ff..81bb565f4497 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,9 +25,9 @@ endif
KASAN_SANITIZE_head$(BITS).o := n
KASAN_SANITIZE_dumpstack.o := n
KASAN_SANITIZE_dumpstack_$(BITS).o := n
-KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_paravirt.o := n
-OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y
OBJECT_FILES_NON_STANDARD_test_nx.o := y
@@ -127,10 +127,11 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
obj-$(CONFIG_PERF_EVENTS) += perf_regs.o
obj-$(CONFIG_TRACING) += tracepoint.o
obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o
+obj-$(CONFIG_X86_INTEL_UMIP) += umip.o
-obj-$(CONFIG_ORC_UNWINDER) += unwind_orc.o
-obj-$(CONFIG_FRAME_POINTER_UNWINDER) += unwind_frame.o
-obj-$(CONFIG_GUESS_UNWINDER) += unwind_guess.o
+obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o
###
# 64 bit specific files
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3344d3382e91..dbaf14d69ebd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -442,7 +442,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
{
const s32 *poff;
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -452,7 +451,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
if (*ptr == 0x3e)
text_poke(ptr, ((unsigned char []){0xf0}), 1);
}
- mutex_unlock(&text_mutex);
}
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
@@ -460,7 +458,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
{
const s32 *poff;
- mutex_lock(&text_mutex);
for (poff = start; poff < end; poff++) {
u8 *ptr = (u8 *)poff + *poff;
@@ -470,7 +467,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
if (*ptr == 0xf0)
text_poke(ptr, ((unsigned char []){0x3E}), 1);
}
- mutex_unlock(&text_mutex);
}
struct smp_alt_module {
@@ -489,8 +485,7 @@ struct smp_alt_module {
struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
-static DEFINE_MUTEX(smp_alt);
-static bool uniproc_patched = false; /* protected by smp_alt */
+static bool uniproc_patched = false; /* protected by text_mutex */
void __init_or_module alternatives_smp_module_add(struct module *mod,
char *name,
@@ -499,7 +494,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
{
struct smp_alt_module *smp;
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
if (!uniproc_patched)
goto unlock;
@@ -526,14 +521,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp_unlock:
alternatives_smp_unlock(locks, locks_end, text, text_end);
unlock:
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
void __init_or_module alternatives_smp_module_del(struct module *mod)
{
struct smp_alt_module *item;
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
list_for_each_entry(item, &smp_alt_modules, next) {
if (mod != item->mod)
continue;
@@ -541,7 +536,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
kfree(item);
break;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
void alternatives_enable_smp(void)
@@ -551,7 +546,7 @@ void alternatives_enable_smp(void)
/* Why bother if there are no other CPUs? */
BUG_ON(num_possible_cpus() == 1);
- mutex_lock(&smp_alt);
+ mutex_lock(&text_mutex);
if (uniproc_patched) {
pr_info("switching to SMP code\n");
@@ -563,10 +558,13 @@ void alternatives_enable_smp(void)
mod->text, mod->text_end);
uniproc_patched = false;
}
- mutex_unlock(&smp_alt);
+ mutex_unlock(&text_mutex);
}
-/* Return 1 if the address range is reserved for smp-alternatives */
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
int alternatives_text_reserved(void *start, void *end)
{
struct smp_alt_module *mod;
@@ -574,6 +572,8 @@ int alternatives_text_reserved(void *start, void *end)
u8 *text_start = start;
u8 *text_end = end;
+ lockdep_assert_held(&text_mutex);
+
list_for_each_entry(mod, &smp_alt_modules, next) {
if (mod->text > text_end || mod->text_end < text_start)
continue;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c60922a66385..90cb82dbba57 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -23,6 +23,7 @@ obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
obj-$(CONFIG_CPU_FREQ) += aperfmperf.o
+obj-y += cpuid-deps.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c9176bae7fd8..47f8a85be11c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -329,6 +329,28 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
}
}
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+ /* Check the boot processor, plus build option for UMIP. */
+ if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+ goto out;
+
+ /* Check the current processor's cpuid bits. */
+ if (!cpu_has(c, X86_FEATURE_UMIP))
+ goto out;
+
+ cr4_set_bits(X86_CR4_UMIP);
+
+ return;
+
+out:
+ /*
+ * Make sure UMIP is disabled in case it was enabled in a
+ * previous boot (e.g., via kexec).
+ */
+ cr4_clear_bits(X86_CR4_UMIP);
+}
+
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1147,9 +1169,10 @@ static void identify_cpu(struct cpuinfo_x86 *c)
/* Disable the PN if appropriate */
squash_the_stupid_serial_number(c);
- /* Set up SMEP/SMAP */
+ /* Set up SMEP/SMAP/UMIP */
setup_smep(c);
setup_smap(c);
+ setup_umip(c);
/*
* The vendor-specific functions might have changed features.
@@ -1301,18 +1324,16 @@ void print_cpu_info(struct cpuinfo_x86 *c)
pr_cont(")\n");
}
-static __init int setup_disablecpuid(char *arg)
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
{
- int bit;
-
- if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
- setup_clear_cpu_cap(bit);
- else
- return 0;
-
return 1;
}
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);
#ifdef CONFIG_X86_64
DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1572,9 +1593,13 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, me);
- load_sp0(t, &current->thread);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);
clear_all_debug_regs();
@@ -1596,7 +1621,6 @@ void cpu_init(void)
int cpu = smp_processor_id();
struct task_struct *curr = current;
struct tss_struct *t = &per_cpu(cpu_tss, cpu);
- struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
@@ -1627,9 +1651,13 @@ void cpu_init(void)
initialize_tlbstate_and_flush();
enter_lazy_tlb(&init_mm, curr);
- load_sp0(t, thread);
+ /*
+ * Initialize the TSS. Don't bother initializing sp0, as the initial
+ * task never enters user mode.
+ */
set_tss_desc(cpu, t);
load_TR_desc();
+
load_mm_ldt(&init_mm);
t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..904b0a3c4e53
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,121 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+ unsigned int feature;
+ unsigned int depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+ { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE },
+ { X86_FEATURE_AVX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_PKU, X86_FEATURE_XSAVE },
+ { X86_FEATURE_MPX, X86_FEATURE_XSAVE },
+ { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE },
+ { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM, X86_FEATURE_FXSR },
+ { X86_FEATURE_XMM2, X86_FEATURE_XMM },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 },
+ { X86_FEATURE_XMM3, X86_FEATURE_XMM2 },
+ { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, },
+ { X86_FEATURE_F16C, X86_FEATURE_XMM2, },
+ { X86_FEATURE_AES, X86_FEATURE_XMM2 },
+ { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 },
+ { X86_FEATURE_FMA, X86_FEATURE_AVX },
+ { X86_FEATURE_AVX2, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512F, X86_FEATURE_AVX, },
+ { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VAES, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL },
+ { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F },
+ { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F },
+ {}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ /*
+ * Note: This could use the non atomic __*_bit() variants, but the
+ * rest of the cpufeature code uses atomics as well, so keep it for
+ * consistency. Cleanup all of it separately.
+ */
+ if (!c) {
+ clear_cpu_cap(&boot_cpu_data, feature);
+ set_bit(feature, (unsigned long *)cpu_caps_cleared);
+ } else {
+ clear_bit(feature, (unsigned long *)c->x86_capability);
+ }
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+ const struct cpuid_dep *d;
+ bool changed;
+
+ if (WARN_ON(feature >= MAX_FEATURE_BITS))
+ return;
+
+ clear_feature(c, feature);
+
+ /* Collect all features to disable, handling dependencies */
+ memset(disable, 0, sizeof(disable));
+ __set_bit(feature, disable);
+
+ /* Loop until we get a stable state. */
+ do {
+ changed = false;
+ for (d = cpuid_deps; d->feature; d++) {
+ if (!test_bit(d->depends, disable))
+ continue;
+ if (__test_and_set_bit(d->feature, disable))
+ continue;
+
+ changed = true;
+ clear_feature(c, d->feature);
+ }
+ } while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+ do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+ do_clear_cpu_cap(NULL, feature);
+}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 44404e2307bb..10e74d4778a1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
}
#ifdef CONFIG_KEXEC_FILE
-static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
{
unsigned int *nr_ranges = arg;
@@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
return ret;
}
-static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
{
struct crash_elf_data *ced = arg;
Elf64_Ehdr *ehdr;
@@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
ehdr = ced->ehdr;
/* Exclude unwanted mem ranges */
- ret = elf_header_exclude_ranges(ced, start, end);
+ ret = elf_header_exclude_ranges(ced, res->start, res->end);
if (ret)
return ret;
@@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
return 0;
}
-static int memmap_entry_callback(u64 start, u64 end, void *arg)
+static int memmap_entry_callback(struct resource *res, void *arg)
{
struct crash_memmap_data *cmd = arg;
struct boot_params *params = cmd->params;
struct e820_entry ei;
- ei.addr = start;
- ei.size = end - start + 1;
+ ei.addr = res->start;
+ ei.size = resource_size(res);
ei.type = cmd->type;
add_e820_entry(params, &ei);
@@ -619,12 +619,12 @@ out:
return ret;
}
-static int determine_backup_region(u64 start, u64 end, void *arg)
+static int determine_backup_region(struct resource *res, void *arg)
{
struct kimage *image = arg;
- image->arch.backup_src_start = start;
- image->arch.backup_src_sz = end - start + 1;
+ image->arch.backup_src_start = res->start;
+ image->arch.backup_src_sz = resource_size(res);
/* Expecting only one range for backup region */
return 1;
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7affb7e3d9a5..6abd83572b01 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
+ char arg[32];
+ char *argptr = arg;
+ int bit;
+
if (cmdline_find_option_bool(boot_command_line, "no387"))
setup_clear_cpu_cap(X86_FEATURE_FPU);
@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+ if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+ sizeof(arg)) &&
+ get_option(&argptr, &bit) &&
+ bit >= 0 &&
+ bit < NCAPINTS * 32)
+ setup_clear_cpu_cap(bit);
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index f1d5476c9022..87a57b7642d3 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -15,6 +15,7 @@
#include <asm/fpu/xstate.h>
#include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
/*
* Although we spell it out in here, the Processor Trace
@@ -36,6 +37,19 @@ static const char *xfeature_names[] =
"unknown xstate feature" ,
};
+static short xsave_cpuid_features[] __initdata = {
+ X86_FEATURE_FPU,
+ X86_FEATURE_XMM,
+ X86_FEATURE_AVX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_MPX,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_AVX512F,
+ X86_FEATURE_INTEL_PT,
+ X86_FEATURE_PKU,
+};
+
/*
* Mask of xstate features supported by the CPU and the kernel:
*/
@@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size;
void fpu__xstate_clear_all_cpu_caps(void)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
- setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
- setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- setup_clear_cpu_cap(X86_FEATURE_AVX);
- setup_clear_cpu_cap(X86_FEATURE_AVX2);
- setup_clear_cpu_cap(X86_FEATURE_AVX512F);
- setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
- setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
- setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
- setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
- setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
- setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
- setup_clear_cpu_cap(X86_FEATURE_MPX);
- setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
- setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
- setup_clear_cpu_cap(X86_FEATURE_PKU);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
- setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
}
/*
@@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void)
unsigned int eax, ebx, ecx, edx;
static int on_boot_cpu __initdata = 1;
int err;
+ int i;
WARN_ON_FPU(!on_boot_cpu);
on_boot_cpu = 0;
@@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void)
goto out_disable;
}
+ /*
+ * Clear XSAVE features that are disabled in the normal CPUID.
+ */
+ for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+ if (!boot_cpu_has(xsave_cpuid_features[i]))
+ xfeatures_mask &= ~BIT(i);
+ }
+
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f1d528bb66a6..c29020907886 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -212,9 +212,6 @@ ENTRY(startup_32_smp)
#endif
.Ldefault_entry:
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $(CR0_STATE & ~X86_CR0_PG),%eax
movl %eax,%cr0
@@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array)
# 24(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
pushl $0 # Dummy error code, to make stack frame uniform
.endif
pushl $i # 20(%esp) Vector number
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6dde3f3fc1f8..7dca675fe78d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,11 +38,12 @@
*
*/
-#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
+#endif
L3_START_KERNEL = pud_index(__START_KERNEL_map)
.text
@@ -50,6 +51,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded an identity mapped page table
@@ -89,6 +91,7 @@ startup_64:
addq $(early_top_pgt - __START_KERNEL_map), %rax
jmp 1f
ENTRY(secondary_startup_64)
+ UNWIND_HINT_EMPTY
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
* and someone has loaded a mapped page table.
@@ -133,6 +136,7 @@ ENTRY(secondary_startup_64)
movq $1f, %rax
jmp *%rax
1:
+ UNWIND_HINT_EMPTY
/* Check if nx is implemented */
movl $0x80000001, %eax
@@ -150,9 +154,6 @@ ENTRY(secondary_startup_64)
1: wrmsr /* Make changes effective */
/* Setup cr0 */
-#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
- X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
- X86_CR0_PG)
movl $CR0_STATE, %eax
/* Make changes effective */
movq %rax, %cr0
@@ -235,7 +236,7 @@ ENTRY(secondary_startup_64)
pushq %rax # target address in negative space
lretq
.Lafter_lret:
-ENDPROC(secondary_startup_64)
+END(secondary_startup_64)
#include "verify_cpu.S"
@@ -247,6 +248,7 @@ ENDPROC(secondary_startup_64)
*/
ENTRY(start_cpu0)
movq initial_stack(%rip), %rsp
+ UNWIND_HINT_EMPTY
jmp .Ljump_to_C_code
ENDPROC(start_cpu0)
#endif
@@ -266,26 +268,24 @@ ENDPROC(start_cpu0)
.quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
__FINITDATA
-bad_address:
- jmp bad_address
-
__INIT
ENTRY(early_idt_handler_array)
- # 104(%rsp) %rflags
- # 96(%rsp) %cs
- # 88(%rsp) %rip
- # 80(%rsp) error code
i = 0
.rept NUM_EXCEPTION_VECTORS
- .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
- pushq $0 # Dummy error code, to make stack frame uniform
+ .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+ UNWIND_HINT_IRET_REGS
+ pushq $0 # Dummy error code, to make stack frame uniform
+ .else
+ UNWIND_HINT_IRET_REGS offset=8
.endif
pushq $i # 72(%rsp) Vector number
jmp early_idt_handler_common
+ UNWIND_HINT_IRET_REGS
i = i + 1
.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
.endr
-ENDPROC(early_idt_handler_array)
+ UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)
early_idt_handler_common:
/*
@@ -313,6 +313,7 @@ early_idt_handler_common:
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ UNWIND_HINT_REGS
cmpq $14,%rsi /* Page fault? */
jnz 10f
@@ -327,8 +328,8 @@ early_idt_handler_common:
20:
decl early_recursion_flag(%rip)
- jmp restore_regs_and_iret
-ENDPROC(early_idt_handler_common)
+ jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)
__INITDATA
@@ -362,10 +363,7 @@ NEXT_PAGE(early_dynamic_pgts)
.data
-#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
- .fill 512,8,0
-#else
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
NEXT_PAGE(init_top_pgt)
.quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
.org init_top_pgt + PGD_PAGE_OFFSET*8, 0
@@ -382,6 +380,9 @@ NEXT_PAGE(level2_ident_pgt)
* Don't set NX because code runs from these pages.
*/
PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PAGE(init_top_pgt)
+ .fill 512,8,0
#endif
#ifdef CONFIG_X86_5LEVEL
@@ -435,7 +436,7 @@ ENTRY(phys_base)
EXPORT_SYMBOL(phys_base)
#include "../../x86/xen/xen-head.S"
-
+
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8bb9594d0761..3df743b60c80 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg)
early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
-static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
-static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
static int has_steal_clock = 0;
/*
@@ -312,7 +312,7 @@ static void kvm_register_steal_time(void)
cpu, (unsigned long long) slow_virt_to_phys(st));
}
-static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
@@ -426,9 +426,42 @@ void kvm_disable_steal_time(void)
wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}
+static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+ early_set_memory_decrypted((unsigned long) ptr, size);
+}
+
+/*
+ * Iterate through all possible CPUs and map the memory region pointed
+ * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that CPUs
+ * hotplugged will have their per-cpu variable already mapped as
+ * decrypted.
+ */
+static void __init sev_map_percpu_data(void)
+{
+ int cpu;
+
+ if (!sev_active())
+ return;
+
+ for_each_possible_cpu(cpu) {
+ __set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+ __set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+ __set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+ }
+}
+
#ifdef CONFIG_SMP
static void __init kvm_smp_prepare_boot_cpu(void)
{
+ /*
+ * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+ * shares the guest physical address with the hypervisor.
+ */
+ sev_map_percpu_data();
+
kvm_guest_cpu_init();
native_smp_prepare_boot_cpu();
kvm_spinlock_init();
@@ -496,6 +529,7 @@ void __init kvm_guest_init(void)
kvm_cpu_online, kvm_cpu_down_prepare) < 0)
pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
+ sev_map_percpu_data();
kvm_guest_cpu_init();
#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5b609e28ce3f..77b492c2d658 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -27,6 +27,7 @@
#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <asm/mem_encrypt.h>
#include <asm/x86_init.h>
#include <asm/reboot.h>
#include <asm/kvmclock.h>
@@ -45,7 +46,7 @@ early_param("no-kvmclock", parse_no_kvmclock);
/* The hypervisor will put information about time periodically here */
static struct pvclock_vsyscall_time_info *hv_clock;
-static struct pvclock_wall_clock wall_clock;
+static struct pvclock_wall_clock *wall_clock;
struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
{
@@ -64,15 +65,15 @@ static void kvm_get_wallclock(struct timespec *now)
int low, high;
int cpu;
- low = (int)__pa_symbol(&wall_clock);
- high = ((u64)__pa_symbol(&wall_clock) >> 32);
+ low = (int)slow_virt_to_phys(wall_clock);
+ high = ((u64)slow_virt_to_phys(wall_clock) >> 32);
native_write_msr(msr_kvm_wall_clock, low, high);
cpu = get_cpu();
vcpu_time = &hv_clock[cpu].pvti;
- pvclock_read_wallclock(&wall_clock, vcpu_time, now);
+ pvclock_read_wallclock(wall_clock, vcpu_time, now);
put_cpu();
}
@@ -249,11 +250,39 @@ static void kvm_shutdown(void)
native_machine_shutdown();
}
+static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size,
+ phys_addr_t align)
+{
+ phys_addr_t mem;
+
+ mem = memblock_alloc(size, align);
+ if (!mem)
+ return 0;
+
+ if (sev_active()) {
+ if (early_set_memory_decrypted((unsigned long)__va(mem), size))
+ goto e_free;
+ }
+
+ return mem;
+e_free:
+ memblock_free(mem, size);
+ return 0;
+}
+
+static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size)
+{
+ if (sev_active())
+ early_set_memory_encrypted((unsigned long)__va(addr), size);
+
+ memblock_free(addr, size);
+}
+
void __init kvmclock_init(void)
{
struct pvclock_vcpu_time_info *vcpu_time;
- unsigned long mem;
- int size, cpu;
+ unsigned long mem, mem_wall_clock;
+ int size, cpu, wall_clock_size;
u8 flags;
size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
@@ -267,21 +296,35 @@ void __init kvmclock_init(void)
} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
return;
- printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
- msr_kvm_system_time, msr_kvm_wall_clock);
+ wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock));
+ mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE);
+ if (!mem_wall_clock)
+ return;
- mem = memblock_alloc(size, PAGE_SIZE);
- if (!mem)
+ wall_clock = __va(mem_wall_clock);
+ memset(wall_clock, 0, wall_clock_size);
+
+ mem = kvm_memblock_alloc(size, PAGE_SIZE);
+ if (!mem) {
+ kvm_memblock_free(mem_wall_clock, wall_clock_size);
+ wall_clock = NULL;
return;
+ }
+
hv_clock = __va(mem);
memset(hv_clock, 0, size);
if (kvm_register_clock("primary cpu clock")) {
hv_clock = NULL;
- memblock_free(mem, size);
+ kvm_memblock_free(mem, size);
+ kvm_memblock_free(mem_wall_clock, wall_clock_size);
+ wall_clock = NULL;
return;
}
+ printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ msr_kvm_system_time, msr_kvm_wall_clock);
+
if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a2fcf037bd80..1c1eae961340 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -13,6 +13,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/smp.h>
+#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/uaccess.h>
@@ -295,8 +296,8 @@ out:
return error;
}
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
- unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+ unsigned long , bytecount)
{
int ret = -ENOSYS;
@@ -314,5 +315,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
ret = write_ldt(ptr, bytecount, 0);
break;
}
- return ret;
+ /*
+ * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+ * return type, but tht ABI for sys_modify_ldt() expects
+ * 'int'. This cast gives us an int-sized value in %rax
+ * for the return code. The 'unsigned' is necessary so
+ * the compiler does not try to sign-extend the negative
+ * return codes into the high half of the register when
+ * taking the value from int->long.
+ */
+ return (unsigned int)ret;
}
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 3fe690067802..6b07faaa1579 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -7,7 +7,7 @@
#include <linux/init.h>
#include <linux/ioport.h>
-static int found(u64 start, u64 end, void *data)
+static int found(struct resource *res, void *data)
{
return 1;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c67685337c5a..97fb3e5737f5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -49,7 +49,13 @@
*/
__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
.x86_tss = {
- .sp0 = TOP_OF_INIT_STACK,
+ /*
+ * .sp0 is only used when entering ring 0 from a lower
+ * privilege level. Since the init task never runs anything
+ * but ring 0 code, there is no need for a valid value here.
+ * Poison it.
+ */
+ .sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
#ifdef CONFIG_X86_32
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 11966251cd42..45bf0c5f93e1 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Reload esp0 and cpu_current_top_of_stack. This changes
- * current_thread_info().
+ * current_thread_info(). Refresh the SYSENTER configuration in
+ * case prev or next is vm86.
*/
- load_sp0(tss, next);
+ update_sp0(next_p);
+ refresh_sysenter_cs(next);
this_cpu_write(cpu_current_top_of_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 302e7b2572d1..eeeb34f85c25 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
struct inactive_task_frame *frame;
struct task_struct *me = current;
- p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
fork_frame = container_of(childregs, struct fork_frame, regs);
frame = &fork_frame->frame;
@@ -464,8 +463,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
this_cpu_write(current_task, next_p);
- /* Reload esp0 and ss1. This changes current_thread_info(). */
- load_sp0(tss, next);
+ /* Reload sp0. */
+ update_sp0(next_p);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0957dd73d127..507100a72eb3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -380,9 +380,11 @@ static void __init reserve_initrd(void)
* If SME is active, this memory will be marked encrypted by the
* kernel when it is accessed (including relocation). However, the
* ramdisk image was loaded decrypted by the bootloader, so make
- * sure that it is encrypted before accessing it.
+ * sure that it is encrypted before accessing it. For SEV the
+ * ramdisk will already be encrypted, so only do this for SME.
*/
- sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+ if (sme_active())
+ sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
initrd_start = 0;
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7ab6db86b573..88fc3a51640c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -963,8 +963,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
- per_cpu(cpu_current_top_of_stack, cpu) =
- (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+ per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
#else
initial_gs = per_cpu_offset(cpu);
#endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5a6b8f809792..c3aca0b533a1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
#include <asm/trace/mpx.h>
#include <asm/mpx.h>
#include <asm/vm86.h>
+#include <asm/umip.h>
#ifdef CONFIG_X86_64
#include <asm/x86_init.h>
@@ -141,8 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
- BUG_ON((unsigned long)(current_top_of_stack() -
- current_stack_pointer) >= THREAD_SIZE);
+ BUG_ON(!on_thread_stack());
preempt_enable_no_resched();
}
@@ -518,6 +518,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
+ if (static_cpu_has(X86_FEATURE_UMIP)) {
+ if (user_mode(regs) && fixup_umip_exception(regs))
+ return;
+ }
+
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
new file mode 100644
index 000000000000..6ba82be68cff
--- /dev/null
+++ b/arch/x86/kernel/umip.c
@@ -0,0 +1,366 @@
+/*
+ * umip.c Emulation for instruction protected by the Intel User-Mode
+ * Instruction Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * The feature User-Mode Instruction Prevention present in recent Intel
+ * processor prevents a group of instructions (sgdt, sidt, sldt, smsw, and str)
+ * from being executed with CPL > 0. Otherwise, a general protection fault is
+ * issued.
+ *
+ * Rather than relaying to the user space the general protection fault caused by
+ * the UMIP-protected instructions (in the form of a SIGSEGV signal), it can be
+ * trapped and emulate the result of such instructions to provide dummy values.
+ * This allows to both conserve the current kernel behavior and not reveal the
+ * system resources that UMIP intends to protect (i.e., the locations of the
+ * global descriptor and interrupt descriptor tables, the segment selectors of
+ * the local descriptor table, the value of the task state register and the
+ * contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (sgdt and sidt) and those which return a
+ * value (sldt, str and smsw).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not the actual location of the table. The result is emulated as a hard-coded
+ * value that, lies close to the top of the kernel memory. The limit for the GDT
+ * and the IDT are set to zero.
+ *
+ * Given that sldt and str are not commonly used in programs that run on WineHQ
+ * or DOSEMU2, they are not emulated.
+ *
+ * The instruction smsw is emulated to return the value that the register CR0
+ * has at boot time as set in the head_32.
+ *
+ * Also, emulation is provided only for 32-bit processes; 64-bit processes
+ * that attempt to use the instructions that UMIP protects will receive the
+ * SIGSEGV signal issued as a consequence of the general protection fault.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
+
+#define UMIP_DUMMY_GDT_BASE 0xfffe0000
+#define UMIP_DUMMY_IDT_BASE 0xffff0000
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes, the
+ * only processes for which emulation is provided, X has a value of 4.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define UMIP_INST_SGDT 0 /* 0F 01 /0 */
+#define UMIP_INST_SIDT 1 /* 0F 01 /1 */
+#define UMIP_INST_SMSW 3 /* 0F 01 /4 */
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn: Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when not an UMIP-protected instruction that can be
+ * emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+ /* By getting modrm we also get the opcode. */
+ insn_get_modrm(insn);
+
+ if (!insn->modrm.nbytes)
+ return -EINVAL;
+
+ /* All the instructions of interest start with 0x0f. */
+ if (insn->opcode.bytes[0] != 0xf)
+ return -EINVAL;
+
+ if (insn->opcode.bytes[1] == 0x1) {
+ switch (X86_MODRM_REG(insn->modrm.value)) {
+ case 0:
+ return UMIP_INST_SGDT;
+ case 1:
+ return UMIP_INST_SIDT;
+ case 4:
+ return UMIP_INST_SMSW;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* SLDT AND STR are not emulated */
+ return -EINVAL;
+}
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn: Instruction structure with operands
+ * @umip_inst: A constant indicating the instruction to emulate
+ * @data: Buffer into which the dummy result is stored
+ * @data_size: Size of the emulated result
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the results depends
+ * on both the instruction and type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. Caller is responsible
+ * of providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+ unsigned char *data, int *data_size)
+{
+ unsigned long dummy_base_addr, dummy_value;
+ unsigned short dummy_limit = 0;
+
+ if (!data || !data_size || !insn)
+ return -EINVAL;
+ /*
+ * These two instructions return the base address and limit of the
+ * global and interrupt descriptor table, respectively. According to the
+ * Intel Software Development manual, the base address can be 24-bit,
+ * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is
+ * 16-bit, the returned value of the base address is supposed to be a
+ * zero-extended 24-byte number. However, it seems that a 32-byte number
+ * is always returned irrespective of the operand size.
+ */
+ if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+ /* SGDT and SIDT do not use registers operands. */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ return -EINVAL;
+
+ if (umip_inst == UMIP_INST_SGDT)
+ dummy_base_addr = UMIP_DUMMY_GDT_BASE;
+ else
+ dummy_base_addr = UMIP_DUMMY_IDT_BASE;
+
+ *data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE;
+
+ memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE);
+ memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+
+ } else if (umip_inst == UMIP_INST_SMSW) {
+ dummy_value = CR0_STATE;
+
+ /*
+ * Even though the CR0 register has 4 bytes, the number
+ * of bytes to be copied in the result buffer is determined
+ * by whether the operand is a register or a memory location.
+ * If operand is a register, return as many bytes as the operand
+ * size. If operand is memory, return only the two least
+ * siginificant bytes of CR0.
+ */
+ if (X86_MODRM_MOD(insn->modrm.value) == 3)
+ *data_size = insn->opnd_bytes;
+ else
+ *data_size = 2;
+
+ memcpy(data, &dummy_value, *data_size);
+ /* STR and SLDT are not emulated */
+ } else {
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr: Address that caused the signal
+ * @regs: Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+ siginfo_t info;
+ struct task_struct *tsk = current;
+
+ tsk->thread.cr2 = (unsigned long)addr;
+ tsk->thread.error_code = X86_PF_USER | X86_PF_WRITE;
+ tsk->thread.trap_nr = X86_TRAP_PF;
+
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = addr;
+ force_sig_info(SIGSEGV, &info, tsk);
+
+ if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
+ return;
+
+ pr_err_ratelimited("%s[%d] umip emulation segfault ip:%lx sp:%lx error:%x in %lx\n",
+ tsk->comm, task_pid_nr(tsk), regs->ip,
+ regs->sp, X86_PF_USER | X86_PF_WRITE,
+ regs->ip);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs: Registers as saved when entering the #GP handler
+ *
+ * The instructions sgdt, sidt, str, smsw, sldt cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). If the offending
+ * user-space process is not in long mode, this function fixes the exception
+ * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not
+ * fixed up. Also long mode user-space processes are not fixed up.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed by eIP using the registers indicated in
+ * the instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+ int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst;
+ unsigned long seg_base = 0, *reg_addr;
+ /* 10 bytes is the maximum size of the result of UMIP instructions */
+ unsigned char dummy_data[10] = { 0 };
+ unsigned char buf[MAX_INSN_SIZE];
+ void __user *uaddr;
+ struct insn insn;
+ char seg_defs;
+
+ if (!regs)
+ return false;
+
+ /* Do not emulate 64-bit processes. */
+ if (user_64bit_mode(regs))
+ return false;
+
+ /*
+ * If not in user-space long mode, a custom code segment could be in
+ * use. This is true in protected mode (if the process defined a local
+ * descriptor table), or virtual-8086 mode. In most of the cases
+ * seg_base will be zero as in USER_CS.
+ */
+ if (!user_64bit_mode(regs))
+ seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+
+ if (seg_base == -1L)
+ return false;
+
+ not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+ sizeof(buf));
+ nr_copied = sizeof(buf) - not_copied;
+
+ /*
+ * The copy_from_user above could have failed if user code is protected
+ * by a memory protection key. Give up on emulation in such a case.
+ * Should we issue a page fault?
+ */
+ if (!nr_copied)
+ return false;
+
+ insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
+
+ /*
+ * Override the default operand and address sizes with what is specified
+ * in the code segment descriptor. The instruction decoder only sets
+ * the address size it to either 4 or 8 address bytes and does nothing
+ * for the operand bytes. This OK for most of the cases, but we could
+ * have special cases where, for instance, a 16-bit code segment
+ * descriptor is used.
+ * If there is an address override prefix, the instruction decoder
+ * correctly updates these values, even for 16-bit defaults.
+ */
+ seg_defs = insn_get_code_seg_params(regs);
+ if (seg_defs == -EINVAL)
+ return false;
+
+ insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
+ insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
+
+ insn_get_length(&insn);
+ if (nr_copied < insn.length)
+ return false;
+
+ umip_inst = identify_insn(&insn);
+ if (umip_inst < 0)
+ return false;
+
+ if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
+ return false;
+
+ /*
+ * If operand is a register, write result to the copy of the register
+ * value that was pushed to the stack when entering into kernel mode.
+ * Upon exit, the value we write will be restored to the actual hardware
+ * register.
+ */
+ if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+ reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+ /*
+ * Negative values are usually errors. In memory addressing,
+ * the exception is -EDOM. Since we expect a register operand,
+ * all negative values are errors.
+ */
+ if (reg_offset < 0)
+ return false;
+
+ reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+ memcpy(reg_addr, dummy_data, dummy_data_size);
+ } else {
+ uaddr = insn_get_addr_ref(&insn, regs);
+ if ((unsigned long)uaddr == -1L)
+ return false;
+
+ nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+ if (nr_copied > 0) {
+ /*
+ * If copy fails, send a signal and tell caller that
+ * fault was fixed up.
+ */
+ force_sig_info_umip_fault(uaddr, regs);
+ return true;
+ }
+ }
+
+ /* increase IP to let the program keep going */
+ regs->ip += insn.length;
+ return true;
+}
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 495c776de4b4..a3755d293a48 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -271,12 +271,15 @@ static bool is_prefix_bad(struct insn *insn)
int i;
for (i = 0; i < insn->prefixes.nbytes; i++) {
- switch (insn->prefixes.bytes[i]) {
- case 0x26: /* INAT_PFX_ES */
- case 0x2E: /* INAT_PFX_CS */
- case 0x36: /* INAT_PFX_DS */
- case 0x3E: /* INAT_PFX_SS */
- case 0xF0: /* INAT_PFX_LOCK */
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+ switch (attr) {
+ case INAT_MAKE_PREFIX(INAT_PFX_ES):
+ case INAT_MAKE_PREFIX(INAT_PFX_CS):
+ case INAT_MAKE_PREFIX(INAT_PFX_DS):
+ case INAT_MAKE_PREFIX(INAT_PFX_SS):
+ case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
return true;
}
}
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 014ea59aa153..3d3c2f71f617 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -33,7 +33,7 @@
#include <asm/cpufeatures.h>
#include <asm/msr-index.h>
-verify_cpu:
+ENTRY(verify_cpu)
pushf # Save caller passed flags
push $0 # Kill any dangerous flags
popf
@@ -139,3 +139,4 @@ verify_cpu:
popf # Restore caller passed flags
xorl %eax, %eax
ret
+ENDPROC(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 68244742ecb0..5edb27f1a2c4 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -55,6 +55,7 @@
#include <asm/irq.h>
#include <asm/traps.h>
#include <asm/vm86.h>
+#include <asm/switch_to.h>
/*
* Known problems:
@@ -94,7 +95,6 @@
void save_v86_state(struct kernel_vm86_regs *regs, int retval)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86plus_struct __user *user;
struct vm86 *vm86 = current->thread.vm86;
@@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
do_exit(SIGSEGV);
}
- tss = &per_cpu(cpu_tss, get_cpu());
+ preempt_disable();
tsk->thread.sp0 = vm86->saved_sp0;
tsk->thread.sysenter_cs = __KERNEL_CS;
- load_sp0(tss, &tsk->thread);
+ update_sp0(tsk);
+ refresh_sysenter_cs(&tsk->thread);
vm86->saved_sp0 = 0;
- put_cpu();
+ preempt_enable();
memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
@@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
{
- struct tss_struct *tss;
struct task_struct *tsk = current;
struct vm86 *vm86 = tsk->thread.vm86;
struct kernel_vm86_regs vm86regs;
@@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86->saved_sp0 = tsk->thread.sp0;
lazy_save_gs(vm86->regs32.gs);
- tss = &per_cpu(cpu_tss, get_cpu());
/* make room for real-mode segments */
+ preempt_disable();
tsk->thread.sp0 += 16;
- if (static_cpu_has(X86_FEATURE_SEP))
+ if (static_cpu_has(X86_FEATURE_SEP)) {
tsk->thread.sysenter_cs = 0;
+ refresh_sysenter_cs(&tsk->thread);
+ }
- load_sp0(tss, &tsk->thread);
- put_cpu();
+ update_sp0(tsk);
+ preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);