author     Linus Torvalds <torvalds@linux-foundation.org>  2018-02-23 11:35:10 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2018-02-26 19:06:51 -0800
commit     fd468043d4d87da49d717d7747dba9f21bf13ed7
tree       6a9dc8dbd6d331043b79c8970fc2d95fe7205bcf
parent     6f70eb2b00eb416146247c65003d31f4df983ce0
x86: avoid per-cpu system call trampoline
The per-cpu system call trampoline was a clever trick, and allowed us to
have percpu data even before swapgs is done by just doing %rip-relative
addressing.  And that was important, because syscall doesn't have a
kernel stack, so we needed that percpu data very very early, just to get
a temporary register to switch the page tables around.

However, it turns out to be unnecessary.  Because we actually have a
temporary register that we can use: %r11 is destroyed by the 'syscall'
instruction anyway.

Ok, technically it contains the user mode flags register, but we *have*
that information anyway: it's still in %rflags, we've just masked off a
few unimportant bits.  We'll destroy the rest too when we do the "and"
of the CR3 value, but who cares?  It's a system call.

Btw, there are a few bits in eflags that might matter to user space: DF
and AC.  Right now this clears them, but that is fixable by just
changing the MSR_SYSCALL_MASK value to not include them, and clearing
them by hand the way we do for all other kernel entry points anyway.

So the only _real_ flags we'd destroy are IF and the arithmetic flags
that get trampled on by the arithmetic instructions that are part of the
%cr3 reload logic.

However, if we really end up caring, we can save off even those: we'd
take advantage of the fact that %rcx - which contains the returning IP
of the system call - also has 8 bits free.

Why 8?  Even with 5-level paging, we only have 57 bits of virtual
address space, and the high address space is for the kernel (and
vsyscall, but we'd just disable native vsyscall).  So the %rip value
saved in %rcx can have only 56 valid bits, which means that we have 8
bits free.

So *if* we care about IF and the arithmetic flags being saved over a
system call, we'd do:

	shlq $8,%rcx
	movb %r11b,%cl
	shrl $8,%r11d
	andl $8,%r11d
	orb %r11b,%cl

to save those bits off before we then use %r11 as a temporary register
(we'd obviously need to then undo that as we save the user space state
on the stack).

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
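A minimal user-space C sketch (not part of the patch) of the flags argument
above, under the assumption that 'syscall' leaves RFLAGS equal to the user
value with the MSR_SYSCALL_MASK bits cleared: pushing the current RFLAGS
and OR-ing IF back in reproduces the user flags except for those masked
bits.  The demo input value is arbitrary.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Architectural RFLAGS bits named in the patch (the kernel's MSR_SYSCALL_MASK set). */
#define X86_EFLAGS_TF	0x00000100ULL
#define X86_EFLAGS_IF	0x00000200ULL
#define X86_EFLAGS_DF	0x00000400ULL
#define X86_EFLAGS_IOPL	0x00003000ULL
#define X86_EFLAGS_NT	0x00004000ULL
#define X86_EFLAGS_AC	0x00040000ULL

#define SYSCALL_MASK	(X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF | \
			 X86_EFLAGS_IOPL | X86_EFLAGS_AC | X86_EFLAGS_NT)

int main(void)
{
	uint64_t user_rflags   = 0x40ad7ULL;			 /* arbitrary demo value */
	uint64_t kernel_rflags = user_rflags & ~SYSCALL_MASK;	 /* what 'syscall' leaves behind */
	uint64_t saved_flags   = kernel_rflags | X86_EFLAGS_IF;  /* pushfq; orq $X86_EFLAGS_IF,(%rsp) */

	/* Everything outside the masked bits survives, and IF is forced back on. */
	assert((saved_flags & ~SYSCALL_MASK) == (user_rflags & ~SYSCALL_MASK));
	assert(saved_flags & X86_EFLAGS_IF);
	printf("pt_regs->flags would be %#llx\n", (unsigned long long)saved_flags);
	return 0;
}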
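And a similar sketch (again only an illustration, with arbitrary demo
values) of the "8 free bits in %rcx" arithmetic: a lower-half user RIP
needs at most 56 bits even with 5-level paging, so shifting it up by 8 and
stashing a byte of flag state in the low byte, as the shlq/movb sequence
above does, loses nothing and round-trips cleanly.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t user_rip = 0x00007f0012345678ULL;	/* typical lower-half user address */
	uint8_t  stash    = 0xa5;			/* arbitrary byte of flag state */

	assert((user_rip >> 56) == 0);			/* at most 56 significant bits */

	uint64_t packed = (user_rip << 8) | stash;	/* shlq $8,%rcx; movb %r11b,%cl */
	assert((packed >> 8) == user_rip);		/* the return IP is fully recoverable */
	assert((uint8_t)packed == stash);		/* ...and so is the stashed byte */

	printf("packed %%rcx would be %#llx\n", (unsigned long long)packed);
	return 0;
}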
-rw-r--r--  arch/x86/entry/entry_64.S              81
-rw-r--r--  arch/x86/include/asm/cpu_entry_area.h   2
-rw-r--r--  arch/x86/kernel/asm-offsets.c           1
-rw-r--r--  arch/x86/kernel/cpu/common.c           11
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S           8
-rw-r--r--  arch/x86/mm/cpu_entry_area.c            5
6 files changed, 17 insertions(+), 91 deletions(-)
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d5c7f18f79ac..81d1a9f04e40 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,67 +142,16 @@ END(native_usergs_sysret64)
* with them due to bugs in both AMD and Intel CPUs.
*/
- .pushsection .entry_trampoline, "ax"
-
/*
- * The code in here gets remapped into cpu_entry_area's trampoline. This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address). So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
+ * The 'syscall' instruction will have cleared the MSR_SYSCALL_MASK
+ * bits in eflags. Currently that is:
+ *
+ * X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ * X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT
*
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline. We can thus find cpu_entry_area with this macro:
+ * and we don't care about any of them. So %r11 is a fine scratch
+ * register.
*/
-
-#define CPU_ENTRY_AREA \
- _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \
- SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
- UNWIND_HINT_EMPTY
- swapgs
-
- /* Stash the user RSP. */
- movq %rsp, RSP_SCRATCH
-
- /* Note: using %rsp as a scratch reg. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
- /* Load the top of the task stack into RSP */
- movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
- /* Start building the simulated IRET frame. */
- pushq $__USER_DS /* pt_regs->ss */
- pushq RSP_SCRATCH /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
- pushq $__USER_CS /* pt_regs->cs */
- pushq %rcx /* pt_regs->ip */
-
- /*
- * x86 lacks a near absolute jump, and we can't jump to the real
- * entry text with a relative jump. We could push the target
- * address and then use retq, but this destroys the pipeline on
- * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead,
- * spill RDI and restore it in a second-stage trampoline.
- */
- pushq %rdi
- movq $entry_SYSCALL_64_stage2, %rdi
- JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
- .popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
- UNWIND_HINT_EMPTY
- popq %rdi
- jmp entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
-
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
@@ -212,17 +161,19 @@ ENTRY(entry_SYSCALL_64)
*/
swapgs
- /*
- * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
- * is not required to switch CR3.
- */
- movq %rsp, PER_CPU_VAR(rsp_scratch)
+
+ /* Note: using %r11 as a scratch reg - user eflags */
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+
+ movq %rsp, %r11
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
- pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
- pushq %r11 /* pt_regs->flags */
+ pushq %r11 /* pt_regs->sp */
+ pushfq /* pt_regs->flags */
+ orq $X86_EFLAGS_IF,(%rsp) /* We'll always return with interrupts enabled */
+ movq (%rsp),%r11 /* We "restore" %r11 */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 4a7884b8dca5..29c706415443 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -30,8 +30,6 @@ struct cpu_entry_area {
*/
struct tss_struct tss;
- char entry_trampoline[PAGE_SIZE];
-
#ifdef CONFIG_X86_64
/*
* Exception stacks used for IST entries.
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 76417a9aab73..13c07c7dd5e0 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -100,7 +100,6 @@ void common(void) {
/* Layout info for cpu_entry_area */
OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
- OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 348cf4821240..293f0e2a3bed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1418,19 +1418,10 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
- extern char _entry_trampoline[];
- extern char entry_SYSCALL_64_trampoline[];
-
int cpu = smp_processor_id();
- unsigned long SYSCALL64_entry_trampoline =
- (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
- (entry_SYSCALL_64_trampoline - _entry_trampoline);
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- if (static_cpu_has(X86_FEATURE_PTI))
- wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
- else
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
#ifdef CONFIG_IA32_EMULATION
wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9b138a06c1a4..21ae8fd3c9a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -116,14 +116,6 @@ SECTIONS
*(.fixup)
*(.gnu.warning)
-#ifdef CONFIG_X86_64
- . = ALIGN(PAGE_SIZE);
- _entry_trampoline = .;
- *(.entry_trampoline)
- . = ALIGN(PAGE_SIZE);
- ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
-#endif
-
#ifdef CONFIG_RETPOLINE
__indirect_thunk_start = .;
*(.text.__x86.indirect_thunk)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index b9283cc27622..ae5c715bc9dc 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -68,8 +68,6 @@ static void percpu_setup_debug_store(int cpu)
static void __init setup_cpu_entry_area(int cpu)
{
#ifdef CONFIG_X86_64
- extern char _entry_trampoline[];
-
/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
pgprot_t gdt_prot = PAGE_KERNEL_RO;
pgprot_t tss_prot = PAGE_KERNEL_RO;
@@ -131,9 +129,6 @@ static void __init setup_cpu_entry_area(int cpu)
cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
&per_cpu(exception_stacks, cpu),
sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
-
- cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
- __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
#endif
percpu_setup_debug_store(cpu);
}