From fd468043d4d87da49d717d7747dba9f21bf13ed7 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Fri, 23 Feb 2018 11:35:10 -0800
Subject: x86: avoid per-cpu system call trampoline

The per-cpu system call trampoline was a clever trick, and allows us to
have percpu data even before swapgs is done by just doing %rip-relative
addressing.  And that was important, because syscall doesn't have a
kernel stack, so we needed that percpu data very very early, just to
get a temporary register to switch the page tables around.

However, it turns out to be unnecessary.  Because we actually have a
temporary register that we can use: %r11 is destroyed by the 'syscall'
instruction anyway.

Ok, technically it contains the user mode flags register, but we *have*
that information anyway: it's still in %rflags, we've just masked off a
few unimportant bits.  We'll destroy the rest too when we do the "and"
of the CR3 value, but who cares?  It's a system call.

Btw, there are a few bits in eflags that might matter to user space: DF
and AC.  Right now this clears them, but that is fixable by just
changing the MSR_SYSCALL_MASK value to not include them, and clearing
them by hand the way we do for all other kernel entry points anyway.

So the only _real_ flags we'd destroy are IF and the arithmetic flags
that get trampled on by the arithmetic instructions that are part of
the %cr3 reload logic.

However, if we really end up caring, we can save off even those: we'd
take advantage of the fact that %rcx - which contains the returning IP
of the system call - also has 8 bits free.

Why 8? Even with 5-level paging, we only have 57 bits of virtual
address space, and the high address space is for the kernel (and
vsyscall, but we'd just disable native vsyscall).  So the %rip value
saved in %rcx can have only 56 valid bits, which means that we have 8
bits free.

So *if* we care about IF and the arithmetic flags being saved over a
system call, we'd do:

	shlq $8,%rcx		# user RIP has 8 high bits free: make room
	movb %r11b,%cl		# save the flags low byte (CF/PF/AF/ZF/SF)
	shrl $8,%r11d		# bring the second flags byte down
	andl $8,%r11d		# isolate OF (bit 11, now at bit 3)
	orb %r11b,%cl		# merge OF into the reserved-zero bit 3 of %cl

to save those bits off before we then use %r11 as a temporary register
(we'd obviously need to then undo that as we save the user space state
on the stack).  Note that the sequence only needs to stash the
arithmetic flags - the low byte plus OF: IF is always set coming from
user space, and the entry code below forces it back on in the saved
flags anyway ("We'll always return with interrupts enabled").

Signed-off-by: Linus Torvalds
---
 arch/x86/entry/entry_64.S             | 81 +++++++----------------------------
 arch/x86/include/asm/cpu_entry_area.h |  2 -
 arch/x86/kernel/asm-offsets.c         |  1 -
 arch/x86/kernel/cpu/common.c          | 11 +----
 arch/x86/kernel/vmlinux.lds.S         |  8 ----
 arch/x86/mm/cpu_entry_area.c          |  5 ---
 6 files changed, 17 insertions(+), 91 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index d5c7f18f79ac..81d1a9f04e40 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -142,67 +142,16 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */
 
-	.pushsection .entry_trampoline, "ax"
-
 /*
- * The code in here gets remapped into cpu_entry_area's trampoline.  This means
- * that the assembler and linker have the wrong idea as to where this code
- * lives (and, in fact, it's mapped more than once, so it's not even at a
- * fixed address).  So we can't reference any symbols outside the entry
- * trampoline and expect it to work.
+ * The 'syscall' instruction will have cleared the MSR_SYSCALL_MASK
+ * bits in eflags. Currently that is:
+ *
+ *	X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|
+ *	X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT
  *
- * Instead, we carefully abuse %rip-relative addressing.
- * _entry_trampoline(%rip) refers to the start of the remapped) entry
- * trampoline.  We can thus find cpu_entry_area with this macro:
+ * and we don't care about any of them. So %r11 is a fine scratch
+ * register.
  */
-
-#define CPU_ENTRY_AREA \
-	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
-
-/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
-#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
-			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
-
-ENTRY(entry_SYSCALL_64_trampoline)
-	UNWIND_HINT_EMPTY
-	swapgs
-
-	/* Stash the user RSP. */
-	movq	%rsp, RSP_SCRATCH
-
-	/* Note: using %rsp as a scratch reg. */
-	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
-
-	/* Load the top of the task stack into RSP */
-	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
-
-	/* Start building the simulated IRET frame. */
-	pushq	$__USER_DS			/* pt_regs->ss */
-	pushq	RSP_SCRATCH			/* pt_regs->sp */
-	pushq	%r11				/* pt_regs->flags */
-	pushq	$__USER_CS			/* pt_regs->cs */
-	pushq	%rcx				/* pt_regs->ip */
-
-	/*
-	 * x86 lacks a near absolute jump, and we can't jump to the real
-	 * entry text with a relative jump.  We could push the target
-	 * address and then use retq, but this destroys the pipeline on
-	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
-	 * spill RDI and restore it in a second-stage trampoline.
-	 */
-	pushq	%rdi
-	movq	$entry_SYSCALL_64_stage2, %rdi
-	JMP_NOSPEC %rdi
-END(entry_SYSCALL_64_trampoline)
-
-	.popsection
-
-ENTRY(entry_SYSCALL_64_stage2)
-	UNWIND_HINT_EMPTY
-	popq	%rdi
-	jmp	entry_SYSCALL_64_after_hwframe
-END(entry_SYSCALL_64_stage2)
-
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	/*
@@ -212,17 +161,19 @@ ENTRY(entry_SYSCALL_64)
 	 */
 
 	swapgs
-	/*
-	 * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it
-	 * is not required to switch CR3.
-	 */
-	movq	%rsp, PER_CPU_VAR(rsp_scratch)
+
+	/* Note: using %r11 as a scratch reg - user eflags */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
+
+	movq	%rsp, %r11
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/* Construct struct pt_regs on stack */
 	pushq	$__USER_DS			/* pt_regs->ss */
-	pushq	PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
-	pushq	%r11				/* pt_regs->flags */
+	pushq	%r11				/* pt_regs->sp */
+	pushfq					/* pt_regs->flags */
+	orq	$X86_EFLAGS_IF,(%rsp)		/* We'll always return with interrupts enabled */
+	movq	(%rsp),%r11			/* We "restore" %r11 */
 	pushq	$__USER_CS			/* pt_regs->cs */
 	pushq	%rcx				/* pt_regs->ip */
 GLOBAL(entry_SYSCALL_64_after_hwframe)
diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
index 4a7884b8dca5..29c706415443 100644
--- a/arch/x86/include/asm/cpu_entry_area.h
+++ b/arch/x86/include/asm/cpu_entry_area.h
@@ -30,8 +30,6 @@
 	 */
 	struct tss_struct tss;
 
-	char entry_trampoline[PAGE_SIZE];
-
 #ifdef CONFIG_X86_64
 	/*
 	 * Exception stacks used for IST entries.
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 76417a9aab73..13c07c7dd5e0 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -100,7 +100,6 @@ void common(void) {
 
 	/* Layout info for cpu_entry_area */
 	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);
-	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline);
 	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page);
 	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 348cf4821240..293f0e2a3bed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1418,19 +1418,10 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
-	extern char _entry_trampoline[];
-	extern char entry_SYSCALL_64_trampoline[];
 
-	int cpu = smp_processor_id();
-	unsigned long SYSCALL64_entry_trampoline =
-		(unsigned long)get_cpu_entry_area(cpu)->entry_trampoline +
-		(entry_SYSCALL_64_trampoline - _entry_trampoline);
 
 	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
-	if (static_cpu_has(X86_FEATURE_PTI))
-		wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline);
-	else
-		wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 9b138a06c1a4..21ae8fd3c9a8 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -116,14 +116,6 @@ SECTIONS
 		*(.fixup)
 		*(.gnu.warning)
 
-#ifdef CONFIG_X86_64
-		. = ALIGN(PAGE_SIZE);
-		_entry_trampoline = .;
-		*(.entry_trampoline)
-		. = ALIGN(PAGE_SIZE);
-		ASSERT(. - _entry_trampoline == PAGE_SIZE, "entry trampoline is too big");
-#endif
-
 #ifdef CONFIG_RETPOLINE
 		__indirect_thunk_start = .;
 		*(.text.__x86.indirect_thunk)
diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
index b9283cc27622..ae5c715bc9dc 100644
--- a/arch/x86/mm/cpu_entry_area.c
+++ b/arch/x86/mm/cpu_entry_area.c
@@ -68,8 +68,6 @@ static void percpu_setup_debug_store(int cpu)
 static void __init setup_cpu_entry_area(int cpu)
 {
 #ifdef CONFIG_X86_64
-	extern char _entry_trampoline[];
-
 	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */
 	pgprot_t gdt_prot = PAGE_KERNEL_RO;
 	pgprot_t tss_prot = PAGE_KERNEL_RO;
@@ -131,9 +129,6 @@ static void __init setup_cpu_entry_area(int cpu)
 	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks,
 			     &per_cpu(exception_stacks, cpu),
 			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL);
-
-	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline,
-		    __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX);
 #endif
 	percpu_setup_debug_store(cpu);
 }
-- 
cgit v1.2.3
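
The MSR_SYSCALL_MASK change suggested in the commit message is plain bit
arithmetic on the EFLAGS constants, and can be checked in isolation.  A
minimal user-space sketch (ordinary C, not kernel code; the constants
match the x86 processor-flags definitions, and proposed_mask is a
made-up name for the DF/AC-less variant):

#include <stdio.h>

/* EFLAGS bits, as in arch/x86/include/uapi/asm/processor-flags.h */
#define X86_EFLAGS_TF	0x00000100UL	/* Trap flag */
#define X86_EFLAGS_IF	0x00000200UL	/* Interrupt flag */
#define X86_EFLAGS_DF	0x00000400UL	/* Direction flag */
#define X86_EFLAGS_IOPL	0x00003000UL	/* I/O privilege level */
#define X86_EFLAGS_NT	0x00004000UL	/* Nested task */
#define X86_EFLAGS_AC	0x00040000UL	/* Alignment check */

int main(void)
{
	/* What 'syscall' clears today (see the new comment in the patch) */
	unsigned long cur = X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF |
			    X86_EFLAGS_IOPL | X86_EFLAGS_AC | X86_EFLAGS_NT;
	/*
	 * The commit message's suggestion: stop masking DF and AC here,
	 * and clear them by hand (cld, and clac with SMAP) the way the
	 * other kernel entry points do.
	 */
	unsigned long proposed_mask = cur & ~(X86_EFLAGS_DF | X86_EFLAGS_AC);

	printf("MSR_SYSCALL_MASK today: %#lx\n", cur);
	printf("without DF and AC:      %#lx\n", proposed_mask);
	return 0;
}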
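
Likewise, the %rcx flag-packing argument can be exercised in user space.
Below is a minimal round-trip test (again illustrative C, not kernel
code): pack() mirrors the five instructions quoted in the commit
message, and unpack() is one hypothetical way to undo the packing before
the user state is saved on the stack.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/*
 * A canonical user-mode RIP leaves bits 56..63 clear even with 5-level
 * paging, so one byte of eflags state fits alongside it in %rcx.
 */
static uint64_t pack(uint64_t rip, uint64_t rflags)
{
	uint64_t rcx = rip << 8;	/* shlq $8,%rcx   */
	rcx |= rflags & 0xff;		/* movb %r11b,%cl */
	rcx |= (rflags >> 8) & 0x8;	/* shrl $8,%r11d; andl $8,%r11d;
					 * orb %r11b,%cl - OF (bit 11)
					 * lands in the reserved-zero
					 * bit 3 of the low byte */
	return rcx;
}

/* One possible "undo" before the user space state hits the stack. */
static void unpack(uint64_t rcx, uint64_t *rip, uint64_t *rflags)
{
	uint64_t low = rcx & 0xff;

	*rip = rcx >> 8;
	/* Move the stashed OF back from bit 3 to bit 11. */
	*rflags = (low & ~UINT64_C(0x8)) | ((low & 0x8) << 8);
}

int main(void)
{
	uint64_t rip = UINT64_C(0x00007f1234567890);	/* canonical user RIP */
	uint64_t rflags = 0xa97;	/* CF|PF|AF|SF|IF|OF plus the
					 * always-set reserved bit 1 */
	uint64_t out_rip, out_rflags;

	unpack(pack(rip, rflags), &out_rip, &out_rflags);

	assert(out_rip == rip);
	/* All arithmetic flags (CF, PF, AF, ZF, SF, OF) survive; IF is
	 * not stored, since it is known to be set on entry. */
	assert((out_rflags & 0x8d5) == (rflags & 0x8d5));
	printf("round-trip OK: rip=%#llx flags=%#llx\n",
	       (unsigned long long)out_rip, (unsigned long long)out_rflags);
	return 0;
}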