summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt2
-rw-r--r--Documentation/x86/entry_64.rst9
-rw-r--r--Documentation/x86/x86_64/fsgs.rst199
-rw-r--r--Documentation/x86/x86_64/index.rst1
-rw-r--r--arch/x86/entry/calling.h40
-rw-r--r--arch/x86/entry/entry_64.S115
-rw-r--r--arch/x86/include/asm/fsgsbase.h45
-rw-r--r--arch/x86/include/asm/inst.h15
-rw-r--r--arch/x86/include/uapi/asm/hwcap2.h3
-rw-r--r--arch/x86/kernel/cpu/common.c22
-rw-r--r--arch/x86/kernel/cpu/intel.c27
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c15
-rw-r--r--arch/x86/kernel/process_64.c119
-rw-r--r--tools/testing/selftests/x86/Makefile5
-rw-r--r--tools/testing/selftests/x86/fsgsbase.c74
-rw-r--r--tools/testing/selftests/x86/syscall_arg_fault.c112
16 files changed, 235 insertions, 568 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 35bc3c3574c6..138f6664b2e2 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2857,8 +2857,6 @@
no5lvl [X86-64] Disable 5-level paging mode. Forces
kernel to use 4-level paging instead.
- nofsgsbase [X86] Disables FSGSBASE instructions.
-
no_console_suspend
[HW] Never suspend the console
Disable suspending of consoles during suspend and
diff --git a/Documentation/x86/entry_64.rst b/Documentation/x86/entry_64.rst
index b87c1d816aea..a48b3f6ebbe8 100644
--- a/Documentation/x86/entry_64.rst
+++ b/Documentation/x86/entry_64.rst
@@ -108,12 +108,3 @@ We try to only use IST entries and the paranoid entry code for vectors
that absolutely need the more expensive check for the GS base - and we
generate all 'normal' entry points with the regular (faster) paranoid=0
variant.
-
-On a FSGSBASE system, however, user space can set GS without kernel
-interaction. It means the value of GS base itself does not imply anything,
-whether a kernel value or a user space value. So, there is no longer a safe
-way to check whether the exception is entering from user mode or kernel
-mode in the paranoid entry code path. So the GSBASE value needs to be read
-out, saved and the kernel GSBASE value written. On exit the saved GSBASE
-value needs to be restored unconditionally. The non paranoid entry/exit
-code still uses SWAPGS unconditionally as the state is known.
diff --git a/Documentation/x86/x86_64/fsgs.rst b/Documentation/x86/x86_64/fsgs.rst
deleted file mode 100644
index 380c0b5ccca2..000000000000
--- a/Documentation/x86/x86_64/fsgs.rst
+++ /dev/null
@@ -1,199 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-Using FS and GS segments in user space applications
-===================================================
-
-The x86 architecture supports segmentation. Instructions which access
-memory can use segment register based addressing mode. The following
-notation is used to address a byte within a segment:
-
- Segment-register:Byte-address
-
-The segment base address is added to the Byte-address to compute the
-resulting virtual address which is accessed. This allows to access multiple
-instances of data with the identical Byte-address, i.e. the same code. The
-selection of a particular instance is purely based on the base-address in
-the segment register.
-
-In 32-bit mode the CPU provides 6 segments, which also support segment
-limits. The limits can be used to enforce address space protections.
-
-In 64-bit mode the CS/SS/DS/ES segments are ignored and the base address is
-always 0 to provide a full 64bit address space. The FS and GS segments are
-still functional in 64-bit mode.
-
-Common FS and GS usage
-------------------------------
-
-The FS segment is commonly used to address Thread Local Storage (TLS). FS
-is usually managed by runtime code or a threading library. Variables
-declared with the '__thread' storage class specifier are instantiated per
-thread and the compiler emits the FS: address prefix for accesses to these
-variables. Each thread has its own FS base address so common code can be
-used without complex address offset calculations to access the per thread
-instances. Applications should not use FS for other purposes when they use
-runtimes or threading libraries which manage the per thread FS.
-
-The GS segment has no common use and can be used freely by
-applications. GCC and Clang support GS based addressing via address space
-identifiers.
-
-Reading and writing the FS/GS base address
-------------------------------------------
-
-There exist two mechanisms to read and write the FS/FS base address:
-
- - the arch_prctl() system call
-
- - the FSGSBASE instruction family
-
-Accessing FS/GS base with arch_prctl()
---------------------------------------
-
- The arch_prctl(2) based mechanism is available on all 64bit CPUs and all
- kernel versions.
-
- Reading the base:
-
- arch_prctl(ARCH_GET_FS, &fsbase);
- arch_prctl(ARCH_GET_GS, &gsbase);
-
- Writing the base:
-
- arch_prctl(ARCH_SET_FS, fsbase);
- arch_prctl(ARCH_SET_GS, gsbase);
-
- The ARCH_SET_GS prctl may be disabled depending on kernel configuration
- and security settings.
-
-Accessing FS/GS base with the FSGSBASE instructions
----------------------------------------------------
-
- With the Ivy Bridge CPU generation Intel introduced a new set of
- instructions to access the FS and GS base registers directly from user
- space. These instructions are also supported on AMD Family 17H CPUs. The
- following instructions are available:
-
- =============== ===========================
- RDFSBASE %reg Read the FS base register
- RDGSBASE %reg Read the GS base register
- WRFSBASE %reg Write the FS base register
- WRGSBASE %reg Write the GS base register
- =============== ===========================
-
- The instructions avoid the overhead of the arch_prctl() syscall and allow
- more flexible usage of the FS/GS addressing modes in user space
- applications. This does not prevent conflicts between threading libraries
- and runtimes which utilize FS and applications which want to use it for
- their own purpose.
-
-FSGSBASE instructions enablement
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- The instructions are enumerated in CPUID leaf 7, bit 0 of EBX. If
- available /proc/cpuinfo shows 'fsgsbase' in the flag entry of the CPUs.
-
- The availability of the instructions does not enable them
- automatically. The kernel has to enable them explicitly in CR4. The
- reason for this is that older kernels make assumptions about the values in
- the GS register and enforce them when GS base is set via
- arch_prctl(). Allowing user space to write arbitrary values to GS base
- would violate these assumptions and cause malfunction.
-
- On kernels which do not enable FSGSBASE the execution of the FSGSBASE
- instructions will fault with a #UD exception.
-
- The kernel provides reliable information about the enabled state in the
- ELF AUX vector. If the HWCAP2_FSGSBASE bit is set in the AUX vector, the
- kernel has FSGSBASE instructions enabled and applications can use them.
- The following code example shows how this detection works::
-
- #include <sys/auxv.h>
- #include <elf.h>
-
- /* Will be eventually in asm/hwcap.h */
- #ifndef HWCAP2_FSGSBASE
- #define HWCAP2_FSGSBASE (1 << 1)
- #endif
-
- ....
-
- unsigned val = getauxval(AT_HWCAP2);
-
- if (val & HWCAP2_FSGSBASE)
- printf("FSGSBASE enabled\n");
-
-FSGSBASE instructions compiler support
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-GCC version 4.6.4 and newer provide instrinsics for the FSGSBASE
-instructions. Clang supports them as well.
-
- =================== ===========================
- _readfsbase_u64() Read the FS base register
- _readfsbase_u64() Read the GS base register
- _writefsbase_u64() Write the FS base register
- _writegsbase_u64() Write the GS base register
- =================== ===========================
-
-To utilize these instrinsics <immintrin.h> must be included in the source
-code and the compiler option -mfsgsbase has to be added.
-
-Compiler support for FS/GS based addressing
--------------------------------------------
-
-GCC version 6 and newer provide support for FS/GS based addressing via
-Named Address Spaces. GCC implements the following address space
-identifiers for x86:
-
- ========= ====================================
- __seg_fs Variable is addressed relative to FS
- __seg_gs Variable is addressed relative to GS
- ========= ====================================
-
-The preprocessor symbols __SEG_FS and __SEG_GS are defined when these
-address spaces are supported. Code which implements fallback modes should
-check whether these symbols are defined. Usage example::
-
- #ifdef __SEG_GS
-
- long data0 = 0;
- long data1 = 1;
-
- long __seg_gs *ptr;
-
- /* Check whether FSGSBASE is enabled by the kernel (HWCAP2_FSGSBASE) */
- ....
-
- /* Set GS to point to data0 */
- _writegsbase_u64(&data0);
-
- /* Access offset 0 of GS */
- ptr = 0;
- printf("data0 = %ld\n", *ptr);
-
- /* Set GS to point to data1 */
- _writegsbase_u64(&data1);
- /* ptr still addresses offset 0! */
- printf("data1 = %ld\n", *ptr);
-
-
-Clang does not provide the GCC address space identifiers, but it provides
-address spaces via an attribute based mechanism in Clang 5 and newer
-versions:
-
- ==================================== =====================================
- __attribute__((address_space(256)) Variable is addressed relative to GS
- __attribute__((address_space(257)) Variable is addressed relative to FS
- ==================================== =====================================
-
-FS/GS based addressing with inline assembly
--------------------------------------------
-
-In case the compiler does not support address spaces, inline assembly can
-be used for FS/GS based addressing mode::
-
- mov %fs:offset, %reg
- mov %gs:offset, %reg
-
- mov %reg, %fs:offset
- mov %reg, %gs:offset
diff --git a/Documentation/x86/x86_64/index.rst b/Documentation/x86/x86_64/index.rst
index a56070fc8e77..d6eaaa5a35fc 100644
--- a/Documentation/x86/x86_64/index.rst
+++ b/Documentation/x86/x86_64/index.rst
@@ -14,4 +14,3 @@ x86_64 Support
fake-numa-for-cpusets
cpu-hotplug-spec
machinecheck
- fsgs
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index d3fbe2dc03ea..efb0d1b1f15f 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,7 +6,6 @@
#include <asm/percpu.h>
#include <asm/asm-offsets.h>
#include <asm/processor-flags.h>
-#include <asm/inst.h>
/*
@@ -338,12 +337,6 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
-.macro SAVE_AND_SET_GSBASE scratch_reg:req save_reg:req
- rdgsbase \save_reg
- GET_PERCPU_BASE \scratch_reg
- wrgsbase \scratch_reg
-.endm
-
#endif /* CONFIG_X86_64 */
.macro STACKLEAK_ERASE
@@ -352,39 +345,6 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
-#ifdef CONFIG_SMP
-
-/*
- * CPU/node NR is loaded from the limit (size) field of a special segment
- * descriptor entry in GDT.
- */
-.macro LOAD_CPU_AND_NODE_SEG_LIMIT reg:req
- movq $__CPUNODE_SEG, \reg
- lsl \reg, \reg
-.endm
-
-/*
- * Fetch the per-CPU GSBASE value for this processor and put it in @reg.
- * We normally use %gs for accessing per-CPU data, but we are setting up
- * %gs here and obviously can not use %gs itself to access per-CPU data.
- */
-.macro GET_PERCPU_BASE reg:req
- ALTERNATIVE \
- "LOAD_CPU_AND_NODE_SEG_LIMIT \reg", \
- "RDPID \reg", \
- X86_FEATURE_RDPID
- andq $VDSO_CPUNODE_MASK, \reg
- movq __per_cpu_offset(, \reg, 8), \reg
-.endm
-
-#else
-
-.macro GET_PERCPU_BASE reg:req
- movq pcpu_unit_offsets(%rip), \reg
-.endm
-
-#endif /* CONFIG_SMP */
-
/*
* This does 'call enter_from_user_mode' unless we can avoid it based on
* kernel config or using the static jump infrastructure.
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7f9f5119d6b1..3b7a0e8d3bc0 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -38,7 +38,6 @@
#include <asm/export.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
-#include <asm/fsgsbase.h>
#include <linux/err.h>
#include "calling.h"
@@ -948,6 +947,7 @@ ENTRY(\sym)
addq $\ist_offset, CPU_TSS_IST(\shift_ist)
.endif
+ /* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
jmp paranoid_exit
.else
@@ -1164,21 +1164,24 @@ idtentry machine_check do_mce has_error_code=0 paranoid=1
#endif
/*
- * Save all registers in pt_regs. Return GSBASE related information
- * in EBX depending on the availability of the FSGSBASE instructions:
- *
- * FSGSBASE R/EBX
- * N 0 -> SWAPGS on exit
- * 1 -> no SWAPGS on exit
- *
- * Y GSBASE value at entry, must be restored in paranoid_exit
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(paranoid_entry)
UNWIND_HINT_FUNC
cld
PUSH_AND_CLEAR_REGS save_ret=1
ENCODE_FRAME_POINTER 8
+ movl $1, %ebx
+ movl $MSR_GS_BASE, %ecx
+ rdmsr
+ testl %edx, %edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx, %ebx
+1:
/*
* Always stash CR3 in %r14. This value will be restored,
* verbatim, at exit. Needed if paranoid_entry interrupted
@@ -1188,49 +1191,9 @@ ENTRY(paranoid_entry)
* This is also why CS (stashed in the "iret frame" by the
* hardware at entry) can not be used: this may be a return
* to kernel code, but with a user CR3 value.
- *
- * Switching CR3 does not depend on kernel GSBASE so it can
- * be done before switching to the kernel GSBASE. This is
- * required for FSGSBASE because the kernel GSBASE has to
- * be retrieved from a kernel internal table.
*/
SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
- /*
- * Handling GSBASE depends on the availability of FSGSBASE.
- *
- * Without FSGSBASE the kernel enforces that negative GSBASE
- * values indicate kernel GSBASE. With FSGSBASE no assumptions
- * can be made about the GSBASE value when entering from user
- * space.
- */
- ALTERNATIVE "jmp .Lparanoid_entry_checkgs", "", X86_FEATURE_FSGSBASE
-
- /*
- * Read the current GSBASE and store it in in %rbx unconditionally,
- * retrieve and set the current CPUs kernel GSBASE. The stored value
- * has to be restored in paranoid_exit unconditionally.
- */
- SAVE_AND_SET_GSBASE scratch_reg=%rax save_reg=%rbx
- ret
-
-.Lparanoid_entry_checkgs:
- /* EBX = 1 -> kernel GSBASE active, no restore required */
- movl $1, %ebx
- /*
- * The kernel-enforced convention is a negative GSBASE indicates
- * a kernel value. No SWAPGS needed on entry and exit.
- */
- movl $MSR_GS_BASE, %ecx
- rdmsr
- testl %edx, %edx
- jns .Lparanoid_entry_swapgs
- ret
-
-.Lparanoid_entry_swapgs:
- SWAPGS
- /* EBX = 0 -> SWAPGS required on exit */
- xorl %ebx, %ebx
ret
END(paranoid_entry)
@@ -1241,47 +1204,28 @@ END(paranoid_entry)
*
* We may be returning to very strange contexts (e.g. very early
* in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, there's no good reason to try
- * to handle preemption here.
- *
- * R/EBX contains the GSBASE related information depending on the
- * availability of the FSGSBASE instructions:
- *
- * FSGSBASE R/EBX
- * N 0 -> SWAPGS on exit
- * 1 -> no SWAPGS on exit
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
*
- * Y User space GSBASE, must be restored unconditionally
+ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
*/
ENTRY(paranoid_exit)
UNWIND_HINT_REGS
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF_DEBUG
-
- /* Handle GS depending on FSGSBASE availability */
- ALTERNATIVE "jmp .Lparanoid_exit_checkgs", "nop",X86_FEATURE_FSGSBASE
-
- /* With FSGSBASE enabled, unconditionally restore GSBASE */
- wrgsbase %rbx
- jmp .Lparanoid_exit_no_swapgs;
-
-.Lparanoid_exit_checkgs:
- /* On non-FSGSBASE systems, conditionally do SWAPGS */
- testl %ebx, %ebx
+ testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
TRACE_IRQS_IRETQ
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp .Lparanoid_exit_restore
-
.Lparanoid_exit_no_swapgs:
TRACE_IRQS_IRETQ_DEBUG
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
-
.Lparanoid_exit_restore:
- jmp restore_regs_and_return_to_kernel
+ jmp restore_regs_and_return_to_kernel
END(paranoid_exit)
/*
@@ -1692,27 +1636,10 @@ end_repeat_nmi:
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
- /*
- * The above invocation of paranoid_entry stored the GSBASE
- * related information in R/EBX depending on the availability
- * of FSGSBASE.
- *
- * If FSGSBASE is enabled, restore the saved GSBASE value
- * unconditionally, otherwise take the conditional SWAPGS path.
- */
- ALTERNATIVE "jmp nmi_no_fsgsbase", "", X86_FEATURE_FSGSBASE
-
- wrgsbase %rbx
- jmp nmi_restore
-
-nmi_no_fsgsbase:
- /* EBX == 0 -> invoke SWAPGS */
- testl %ebx, %ebx
+ testl %ebx, %ebx /* swapgs needed? */
jnz nmi_restore
-
nmi_swapgs:
SWAPGS_UNSAFE_STACK
-
nmi_restore:
POP_REGS
@@ -1743,11 +1670,17 @@ nmi_restore:
iretq
END(nmi)
+#ifndef CONFIG_IA32_EMULATION
+/*
+ * This handles SYSCALL from 32-bit code. There is no way to program
+ * MSRs to fully disable 32-bit SYSCALL.
+ */
ENTRY(ignore_sysret)
UNWIND_HINT_EMPTY
mov $-ENOSYS, %eax
sysret
END(ignore_sysret)
+#endif
ENTRY(rewind_stack_do_exit)
UNWIND_HINT_FUNC
diff --git a/arch/x86/include/asm/fsgsbase.h b/arch/x86/include/asm/fsgsbase.h
index aefd53767a5d..bca4c743de77 100644
--- a/arch/x86/include/asm/fsgsbase.h
+++ b/arch/x86/include/asm/fsgsbase.h
@@ -19,63 +19,36 @@ extern unsigned long x86_gsbase_read_task(struct task_struct *task);
extern void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase);
extern void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase);
-/* Must be protected by X86_FEATURE_FSGSBASE check. */
+/* Helper functions for reading/writing FS/GS base */
-static __always_inline unsigned long rdfsbase(void)
+static inline unsigned long x86_fsbase_read_cpu(void)
{
unsigned long fsbase;
- asm volatile("rdfsbase %0" : "=r" (fsbase) :: "memory");
+ rdmsrl(MSR_FS_BASE, fsbase);
return fsbase;
}
-static __always_inline unsigned long rdgsbase(void)
+static inline unsigned long x86_gsbase_read_cpu_inactive(void)
{
unsigned long gsbase;
- asm volatile("rdgsbase %0" : "=r" (gsbase) :: "memory");
+ rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
return gsbase;
}
-static __always_inline void wrfsbase(unsigned long fsbase)
-{
- asm volatile("wrfsbase %0" :: "r" (fsbase) : "memory");
-}
-
-static __always_inline void wrgsbase(unsigned long gsbase)
-{
- asm volatile("wrgsbase %0" :: "r" (gsbase) : "memory");
-}
-
-#include <asm/cpufeature.h>
-
-/* Helper functions for reading/writing FS/GS base */
-
-static inline unsigned long x86_fsbase_read_cpu(void)
+static inline void x86_fsbase_write_cpu(unsigned long fsbase)
{
- unsigned long fsbase;
-
- if (static_cpu_has(X86_FEATURE_FSGSBASE))
- fsbase = rdfsbase();
- else
- rdmsrl(MSR_FS_BASE, fsbase);
-
- return fsbase;
+ wrmsrl(MSR_FS_BASE, fsbase);
}
-static inline void x86_fsbase_write_cpu(unsigned long fsbase)
+static inline void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
{
- if (static_cpu_has(X86_FEATURE_FSGSBASE))
- wrfsbase(fsbase);
- else
- wrmsrl(MSR_FS_BASE, fsbase);
+ wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
}
-extern unsigned long x86_gsbase_read_cpu_inactive(void);
-extern void x86_gsbase_write_cpu_inactive(unsigned long gsbase);
-
#endif /* CONFIG_X86_64 */
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
index d063841a17e3..f5a796da07f8 100644
--- a/arch/x86/include/asm/inst.h
+++ b/arch/x86/include/asm/inst.h
@@ -306,21 +306,6 @@
.endif
MODRM 0xc0 movq_r64_xmm_opd1 movq_r64_xmm_opd2
.endm
-
-.macro RDPID opd
- REG_TYPE rdpid_opd_type \opd
- .if rdpid_opd_type == REG_TYPE_R64
- R64_NUM rdpid_opd \opd
- .else
- R32_NUM rdpid_opd \opd
- .endif
- .byte 0xf3
- .if rdpid_opd > 7
- PFX_REX rdpid_opd 0
- .endif
- .byte 0x0f, 0xc7
- MODRM 0xc0 rdpid_opd 0x7
-.endm
#endif
#endif
diff --git a/arch/x86/include/uapi/asm/hwcap2.h b/arch/x86/include/uapi/asm/hwcap2.h
index c5ce54e749f6..6ebaae90e207 100644
--- a/arch/x86/include/uapi/asm/hwcap2.h
+++ b/arch/x86/include/uapi/asm/hwcap2.h
@@ -5,7 +5,4 @@
/* MONITOR/MWAIT enabled in Ring 3 */
#define HWCAP2_RING3MWAIT (1 << 0)
-/* Kernel allows FSGSBASE instructions available in Ring 3 */
-#define HWCAP2_FSGSBASE BIT(1)
-
#endif
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 0948fc25446a..482f74859fb7 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -366,22 +366,6 @@ out:
cr4_clear_bits(X86_CR4_UMIP);
}
-static __init int x86_nofsgsbase_setup(char *arg)
-{
- /* Require an exact match without trailing characters. */
- if (strlen(arg))
- return 0;
-
- /* Do not emit a message if the feature is not present. */
- if (!boot_cpu_has(X86_FEATURE_FSGSBASE))
- return 1;
-
- setup_clear_cpu_cap(X86_FEATURE_FSGSBASE);
- pr_info("FSGSBASE disabled via kernel command line\n");
- return 1;
-}
-__setup("nofsgsbase", x86_nofsgsbase_setup);
-
/*
* Protection Keys are not available in 32-bit mode.
*/
@@ -1387,12 +1371,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
setup_smap(c);
setup_umip(c);
- /* Enable FSGSBASE instructions if available. */
- if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
- cr4_set_bits(X86_CR4_FSGSBASE);
- elf_hwcap2 |= HWCAP2_FSGSBASE;
- }
-
/*
* The vendor-specific functions might have changed features.
* Now we do "generic changes."
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index f17c1a714779..8d6d92ebeb54 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -66,6 +66,32 @@ void check_mpx_erratum(struct cpuinfo_x86 *c)
}
}
+/*
+ * Processors which have self-snooping capability can handle conflicting
+ * memory type across CPUs by snooping its own cache. However, there exists
+ * CPU models in which having conflicting memory types still leads to
+ * unpredictable behavior, machine check errors, or hangs. Clear this
+ * feature to prevent its use on machines with known erratas.
+ */
+static void check_memory_type_self_snoop_errata(struct cpuinfo_x86 *c)
+{
+ switch (c->x86_model) {
+ case INTEL_FAM6_CORE_YONAH:
+ case INTEL_FAM6_CORE2_MEROM:
+ case INTEL_FAM6_CORE2_MEROM_L:
+ case INTEL_FAM6_CORE2_PENRYN:
+ case INTEL_FAM6_CORE2_DUNNINGTON:
+ case INTEL_FAM6_NEHALEM:
+ case INTEL_FAM6_NEHALEM_G:
+ case INTEL_FAM6_NEHALEM_EP:
+ case INTEL_FAM6_NEHALEM_EX:
+ case INTEL_FAM6_WESTMERE:
+ case INTEL_FAM6_WESTMERE_EP:
+ case INTEL_FAM6_SANDYBRIDGE:
+ setup_clear_cpu_cap(X86_FEATURE_SELFSNOOP);
+ }
+}
+
static bool ring3mwait_disabled __read_mostly;
static int __init ring3mwait_disable(char *__unused)
@@ -304,6 +330,7 @@ static void early_init_intel(struct cpuinfo_x86 *c)
}
check_mpx_erratum(c);
+ check_memory_type_self_snoop_errata(c);
/*
* Get the number of SMT siblings early from the extended topology
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 9356c1c9024d..aa5c064a6a22 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -743,7 +743,15 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
cr0 = read_cr0() | X86_CR0_CD;
write_cr0(cr0);
- wbinvd();
+
+ /*
+ * Cache flushing is the most time-consuming step when programming
+ * the MTRRs. Fortunately, as per the Intel Software Development
+ * Manual, we can skip it if the processor supports cache self-
+ * snooping.
+ */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
/* Save value of CR4 and clear Page Global Enable (bit 7) */
if (boot_cpu_has(X86_FEATURE_PGE)) {
@@ -760,7 +768,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Disable MTRRs, and set the default type to uncached */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
- wbinvd();
+
+ /* Again, only flush caches if we have to. */
+ if (!static_cpu_has(X86_FEATURE_SELFSNOOP))
+ wbinvd();
}
static void post_set(void) __releases(set_atomicity_lock)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 8f239091c15d..250e4c4ac6d9 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -162,40 +162,6 @@ enum which_selector {
};
/*
- * Out of line to be protected from kprobes. It is not used on Xen
- * paravirt. When paravirt support is needed, it needs to be renamed
- * with native_ prefix.
- */
-static noinline unsigned long __rdgsbase_inactive(void)
-{
- unsigned long gsbase;
-
- lockdep_assert_irqs_disabled();
-
- native_swapgs();
- gsbase = rdgsbase();
- native_swapgs();
-
- return gsbase;
-}
-NOKPROBE_SYMBOL(__rdgsbase_inactive);
-
-/*
- * Out of line to be protected from kprobes. It is not used on Xen
- * paravirt. When paravirt support is needed, it needs to be renamed
- * with native_ prefix.
- */
-static noinline void __wrgsbase_inactive(unsigned long gsbase)
-{
- lockdep_assert_irqs_disabled();
-
- native_swapgs();
- wrgsbase(gsbase);
- native_swapgs();
-}
-NOKPROBE_SYMBOL(__wrgsbase_inactive);
-
-/*
* Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
* not available. The goal is to be reasonably fast on non-FSGSBASE systems.
* It's forcibly inlined because it'll generate better code and this function
@@ -244,22 +210,8 @@ static __always_inline void save_fsgs(struct task_struct *task)
{
savesegment(fs, task->thread.fsindex);
savesegment(gs, task->thread.gsindex);
- if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
- unsigned long flags;
-
- /*
- * If FSGSBASE is enabled, we can't make any useful guesses
- * about the base, and user code expects us to save the current
- * value. Fortunately, reading the base directly is efficient.
- */
- task->thread.fsbase = rdfsbase();
- local_irq_save(flags);
- task->thread.gsbase = __rdgsbase_inactive();
- local_irq_restore(flags);
- } else {
- save_base_legacy(task, task->thread.fsindex, FS);
- save_base_legacy(task, task->thread.gsindex, GS);
- }
+ save_base_legacy(task, task->thread.fsindex, FS);
+ save_base_legacy(task, task->thread.gsindex, GS);
}
#if IS_ENABLED(CONFIG_KVM)
@@ -338,22 +290,10 @@ static __always_inline void load_seg_legacy(unsigned short prev_index,
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
struct thread_struct *next)
{
- if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
- /* Update the FS and GS selectors if they could have changed. */
- if (unlikely(prev->fsindex || next->fsindex))
- loadseg(FS, next->fsindex);
- if (unlikely(prev->gsindex || next->gsindex))
- loadseg(GS, next->gsindex);
-
- /* Update the bases. */
- wrfsbase(next->fsbase);
- __wrgsbase_inactive(next->gsbase);
- } else {
- load_seg_legacy(prev->fsindex, prev->fsbase,
- next->fsindex, next->fsbase, FS);
- load_seg_legacy(prev->gsindex, prev->gsbase,
- next->gsindex, next->gsbase, GS);
- }
+ load_seg_legacy(prev->fsindex, prev->fsbase,
+ next->fsindex, next->fsbase, FS);
+ load_seg_legacy(prev->gsindex, prev->gsbase,
+ next->gsindex, next->gsbase, GS);
}
static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
@@ -399,46 +339,13 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
return base;
}
-unsigned long x86_gsbase_read_cpu_inactive(void)
-{
- unsigned long gsbase;
-
- if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
- unsigned long flags;
-
- /* Interrupts are disabled here. */
- local_irq_save(flags);
- gsbase = __rdgsbase_inactive();
- local_irq_restore(flags);
- } else {
- rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
- }
-
- return gsbase;
-}
-
-void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
-{
- if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
- unsigned long flags;
-
- /* Interrupts are disabled here. */
- local_irq_save(flags);
- __wrgsbase_inactive(gsbase);
- local_irq_restore(flags);
- } else {
- wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
- }
-}
-
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
unsigned long fsbase;
if (task == current)
fsbase = x86_fsbase_read_cpu();
- else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
- (task->thread.fsindex == 0))
+ else if (task->thread.fsindex == 0)
fsbase = task->thread.fsbase;
else
fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
@@ -452,8 +359,7 @@ unsigned long x86_gsbase_read_task(struct task_struct *task)
if (task == current)
gsbase = x86_gsbase_read_cpu_inactive();
- else if (static_cpu_has(X86_FEATURE_FSGSBASE) ||
- (task->thread.gsindex == 0))
+ else if (task->thread.gsindex == 0)
gsbase = task->thread.gsbase;
else
gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
@@ -493,11 +399,10 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
p->thread.sp = (unsigned long) fork_frame;
p->thread.io_bitmap_ptr = NULL;
- save_fsgs(me);
- p->thread.fsindex = me->thread.fsindex;
- p->thread.fsbase = me->thread.fsbase;
- p->thread.gsindex = me->thread.gsindex;
- p->thread.gsbase = me->thread.gsbase;
+ savesegment(gs, p->thread.gsindex);
+ p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
+ savesegment(fs, p->thread.fsindex);
+ p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
savesegment(es, p->thread.es);
savesegment(ds, p->thread.ds);
memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 186520198de7..fa07d526fe39 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -12,8 +12,9 @@ CAN_BUILD_WITH_NOPIE := $(shell ./check_cc.sh $(CC) trivial_program.c -no-pie)
TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt test_mremap_vdso \
check_initial_reg_state sigreturn iopl mpx-mini-test ioperm \
- protection_keys test_vdso test_vsyscall mov_ss_trap
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \
+ protection_keys test_vdso test_vsyscall mov_ss_trap \
+ syscall_arg_fault
+TARGETS_C_32BIT_ONLY := entry_from_vm86 test_syscall_vdso unwind_vdso \
test_FCMOV test_FCOMI test_FISTTP \
vdso_restorer
TARGETS_C_64BIT_ONLY := fsgsbase sysret_rip
diff --git a/tools/testing/selftests/x86/fsgsbase.c b/tools/testing/selftests/x86/fsgsbase.c
index 21fd4f94b5b0..5ab4c60c100e 100644
--- a/tools/testing/selftests/x86/fsgsbase.c
+++ b/tools/testing/selftests/x86/fsgsbase.c
@@ -35,6 +35,8 @@
static volatile sig_atomic_t want_segv;
static volatile unsigned long segv_addr;
+static unsigned short *shared_scratch;
+
static int nerrs;
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
@@ -242,16 +244,11 @@ static void do_remote_base()
static __thread int set_thread_area_entry_number = -1;
-static void do_unexpected_base(void)
+static unsigned short load_gs(void)
{
/*
- * The goal here is to try to arrange for GS == 0, GSBASE !=
- * 0, and for the the kernel the think that GSBASE == 0.
- *
- * To make the test as reliable as possible, this uses
- * explicit descriptors. (This is not the only way. This
- * could use ARCH_SET_GS with a low, nonzero base, but the
- * relevant side effect of ARCH_SET_GS could change.)
+ * Sets GS != 0 and GSBASE != 0 but arranges for the kernel to think
+ * that GSBASE == 0 (i.e. thread.gsbase == 0).
*/
/* Step 1: tell the kernel that we have GSBASE == 0. */
@@ -271,8 +268,9 @@ static void do_unexpected_base(void)
.useable = 0
};
if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) == 0) {
- printf("\tother thread: using LDT slot 0\n");
+ printf("\tusing LDT slot 0\n");
asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+ return 0x7;
} else {
/* No modify_ldt for us (configured out, perhaps) */
@@ -294,20 +292,15 @@ static void do_unexpected_base(void)
if (ret != 0) {
printf("[NOTE]\tcould not create a segment -- test won't do anything\n");
- return;
+ return 0;
}
- printf("\tother thread: using GDT slot %d\n", desc.entry_number);
+ printf("\tusing GDT slot %d\n", desc.entry_number);
set_thread_area_entry_number = desc.entry_number;
- asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)((desc.entry_number << 3) | 0x3)));
+ unsigned short gs = (unsigned short)((desc.entry_number << 3) | 0x3);
+ asm volatile ("mov %0, %%gs" : : "rm" (gs));
+ return gs;
}
-
- /*
- * Step 3: set the selector back to zero. On AMD chips, this will
- * preserve GSBASE.
- */
-
- asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
}
void test_wrbase(unsigned short index, unsigned long base)
@@ -346,12 +339,19 @@ static void *threadproc(void *ctx)
if (ftx == 3)
return NULL;
- if (ftx == 1)
+ if (ftx == 1) {
do_remote_base();
- else if (ftx == 2)
- do_unexpected_base();
- else
+ } else if (ftx == 2) {
+ /*
+ * On AMD chips, this causes GSBASE != 0, GS == 0, and
+ * thread.gsbase == 0.
+ */
+
+ load_gs();
+ asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0));
+ } else {
errx(1, "helper thread got bad command");
+ }
ftx = 0;
syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
@@ -453,12 +453,7 @@ static void test_ptrace_write_gsbase(void)
if (child == 0) {
printf("[RUN]\tPTRACE_POKE(), write GSBASE from ptracer\n");
- /*
- * Use the LDT setup and fetch the GSBASE from the LDT
- * by switching to the (nonzero) selector (again)
- */
- do_unexpected_base();
- asm volatile ("mov %0, %%gs" : : "rm" ((unsigned short)0x7));
+ *shared_scratch = load_gs();
if (ptrace(PTRACE_TRACEME, 0, NULL, NULL) != 0)
err(1, "PTRACE_TRACEME");
@@ -476,7 +471,7 @@ static void test_ptrace_write_gsbase(void)
gs = ptrace(PTRACE_PEEKUSER, child, gs_offset, NULL);
- if (gs != 0x7) {
+ if (gs != *shared_scratch) {
nerrs++;
printf("[FAIL]\tGS is not prepared with nonzero\n");
goto END;
@@ -494,16 +489,24 @@ static void test_ptrace_write_gsbase(void)
* selector value is changed or not by the GSBASE write in
* a ptracer.
*/
- if (gs != 0x7) {
+ if (gs != *shared_scratch) {
nerrs++;
printf("[FAIL]\tGS changed to %lx\n", gs);
+
+ /*
+ * On older kernels, poking a nonzero value into the
+ * base would zero the selector. On newer kernels,
+ * this behavior has changed -- poking the base
+ * changes only the base and, if FSGSBASE is not
+ * available, this may have no effect.
+ */
+ if (gs == 0)
+ printf("\tNote: this is expected behavior on older kernels.\n");
} else if (have_fsgsbase && (base != 0xFF)) {
nerrs++;
printf("[FAIL]\tGSBASE changed to %lx\n", base);
} else {
- printf("[OK]\tGS remained 0x7 %s");
- if (have_fsgsbase)
- printf("and GSBASE changed to 0xFF");
+ printf("[OK]\tGS remained 0x%hx%s", *shared_scratch, have_fsgsbase ? " and GSBASE changed to 0xFF" : "");
printf("\n");
}
}
@@ -516,6 +519,9 @@ int main()
{
pthread_t thread;
+ shared_scratch = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
+ MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+
/* Probe FSGSBASE */
sethandler(SIGILL, sigill, 0);
if (sigsetjmp(jmpbuf, 1) == 0) {
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
index 4e25d38c8bbd..bc0ecc2e862e 100644
--- a/tools/testing/selftests/x86/syscall_arg_fault.c
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -15,9 +15,30 @@
#include <setjmp.h>
#include <errno.h>
+#ifdef __x86_64__
+# define WIDTH "q"
+#else
+# define WIDTH "l"
+#endif
+
/* Our sigaltstack scratch space. */
static unsigned char altstack_data[SIGSTKSZ];
+static unsigned long get_eflags(void)
+{
+ unsigned long eflags;
+ asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+ return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+ asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+ : : "rm" (eflags) : "flags");
+}
+
+#define X86_EFLAGS_TF (1UL << 8)
+
static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
int flags)
{
@@ -35,13 +56,22 @@ static sigjmp_buf jmpbuf;
static volatile sig_atomic_t n_errs;
+#ifdef __x86_64__
+#define REG_AX REG_RAX
+#define REG_IP REG_RIP
+#else
+#define REG_AX REG_EAX
+#define REG_IP REG_EIP
+#endif
+
static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
{
ucontext_t *ctx = (ucontext_t*)ctx_void;
+ long ax = (long)ctx->uc_mcontext.gregs[REG_AX];
- if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
- printf("[FAIL]\tAX had the wrong value: 0x%x\n",
- ctx->uc_mcontext.gregs[REG_EAX]);
+ if (ax != -EFAULT && ax != -ENOSYS) {
+ printf("[FAIL]\tAX had the wrong value: 0x%lx\n",
+ (unsigned long)ax);
n_errs++;
} else {
printf("[OK]\tSeems okay\n");
@@ -50,9 +80,42 @@ static void sigsegv_or_sigbus(int sig, siginfo_t *info, void *ctx_void)
siglongjmp(jmpbuf, 1);
}
+static volatile sig_atomic_t sigtrap_consecutive_syscalls;
+
+static void sigtrap(int sig, siginfo_t *info, void *ctx_void)
+{
+ /*
+ * KVM has some bugs that can cause us to stop making progress.
+ * detect them and complain, but don't infinite loop or fail the
+ * test.
+ */
+
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+ unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+ if (*ip == 0x340f || *ip == 0x050f) {
+ /* The trap was on SYSCALL or SYSENTER */
+ sigtrap_consecutive_syscalls++;
+ if (sigtrap_consecutive_syscalls > 3) {
+ printf("[WARN]\tGot stuck single-stepping -- you probably have a KVM bug\n");
+ siglongjmp(jmpbuf, 1);
+ }
+ } else {
+ sigtrap_consecutive_syscalls = 0;
+ }
+}
+
static void sigill(int sig, siginfo_t *info, void *ctx_void)
{
- printf("[SKIP]\tIllegal instruction\n");
+ ucontext_t *ctx = (ucontext_t*)ctx_void;
+ unsigned short *ip = (unsigned short *)ctx->uc_mcontext.gregs[REG_IP];
+
+ if (*ip == 0x0b0f) {
+ /* one of the ud2 instructions faulted */
+ printf("[OK]\tSYSCALL returned normally\n");
+ } else {
+ printf("[SKIP]\tIllegal instruction\n");
+ }
siglongjmp(jmpbuf, 1);
}
@@ -120,9 +183,48 @@ int main()
"movl $-1, %%ebp\n\t"
"movl $-1, %%esp\n\t"
"syscall\n\t"
- "pushl $0" /* make sure we segfault cleanly */
+ "ud2" /* make sure we recover cleanly */
+ : : : "memory", "flags");
+ }
+
+ printf("[RUN]\tSYSENTER with TF and invalid state\n");
+ sethandler(SIGTRAP, sigtrap, SA_ONSTACK);
+
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ sigtrap_consecutive_syscalls = 0;
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ asm volatile (
+ "movl $-1, %%eax\n\t"
+ "movl $-1, %%ebx\n\t"
+ "movl $-1, %%ecx\n\t"
+ "movl $-1, %%edx\n\t"
+ "movl $-1, %%esi\n\t"
+ "movl $-1, %%edi\n\t"
+ "movl $-1, %%ebp\n\t"
+ "movl $-1, %%esp\n\t"
+ "sysenter"
+ : : : "memory", "flags");
+ }
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
+
+ printf("[RUN]\tSYSCALL with TF and invalid state\n");
+ if (sigsetjmp(jmpbuf, 1) == 0) {
+ sigtrap_consecutive_syscalls = 0;
+ set_eflags(get_eflags() | X86_EFLAGS_TF);
+ asm volatile (
+ "movl $-1, %%eax\n\t"
+ "movl $-1, %%ebx\n\t"
+ "movl $-1, %%ecx\n\t"
+ "movl $-1, %%edx\n\t"
+ "movl $-1, %%esi\n\t"
+ "movl $-1, %%edi\n\t"
+ "movl $-1, %%ebp\n\t"
+ "movl $-1, %%esp\n\t"
+ "syscall\n\t"
+ "ud2" /* make sure we recover cleanly */
: : : "memory", "flags");
}
+ set_eflags(get_eflags() & ~X86_EFLAGS_TF);
return 0;
}