summaryrefslogtreecommitdiffstats
path: root/arch/x86/entry
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-03 18:59:10 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-03 18:59:10 -0800
commita75a3f6fc92888e4119744d8594ffdf748c3d444 (patch)
tree06c344beb369b1067a5621a175fae04508ba8d0d /arch/x86/entry
parentd2bea739f8b41d620c235d81e00289d01169dc3c (diff)
parent3bd29515d1cad26fa85a1a9b442de8816c1f5c54 (diff)
downloadlinux-a75a3f6fc92888e4119744d8594ffdf748c3d444.tar.bz2
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 asm changes from Ingo Molnar: "The main change in this cycle is another step in the big x86 system call interface rework by Andy Lutomirski, which moves most of the low level x86 entry code from assembly to C, for all syscall entries except native 64-bit system calls: arch/x86/entry/entry_32.S | 182 ++++------ arch/x86/entry/entry_64_compat.S | 547 ++++++++----------------------- 194 insertions(+), 535 deletions(-) ... our hope is that the final remaining step (converting native 64-bit system calls) will be less painful as all the previous steps, given that most of the legacies and quirks are concentrated around native 32-bit and compat environments" * 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (47 commits) x86/entry/32: Fix FS and GS restore in opportunistic SYSEXIT x86/entry/32: Fix entry_INT80_32() to expect interrupts to be on um/x86: Fix build after x86 syscall changes x86/asm: Remove the xyz_cfi macros from dwarf2.h selftests/x86: Style fixes for the 'unwind_vdso' test x86/entry/64/compat: Document sysenter_fix_flags's reason for existence x86/entry: Split and inline syscall_return_slowpath() x86/entry: Split and inline prepare_exit_to_usermode() x86/entry: Use pt_regs_to_thread_info() in syscall entry tracing x86/entry: Hide two syscall entry assertions behind CONFIG_DEBUG_ENTRY x86/entry: Micro-optimize compat fast syscall arg fetch x86/entry: Force inlining of 32-bit syscall code x86/entry: Make irqs_disabled checks in exit code depend on lockdep x86/entry: Remove unnecessary IRQ twiddling in fast 32-bit syscalls x86/asm: Remove thread_info.sysenter_return x86/entry/32: Re-implement SYSENTER using the new C path x86/entry/32: Switch INT80 to the new C syscall path x86/entry/32: Open-code return tracking from fork and kthreads x86/entry/compat: Implement opportunistic SYSRETL for compat syscalls x86/vdso/compat: Wire up SYSENTER and SYSCSALL for compat userspace ...
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--arch/x86/entry/common.c264
-rw-r--r--arch/x86/entry/entry_32.S182
-rw-r--r--arch/x86/entry/entry_64.S9
-rw-r--r--arch/x86/entry/entry_64_compat.S547
-rw-r--r--arch/x86/entry/syscall_32.c9
-rw-r--r--arch/x86/entry/syscall_64.c4
-rw-r--r--arch/x86/entry/syscalls/syscall_32.tbl12
-rw-r--r--arch/x86/entry/vdso/Makefile39
-rw-r--r--arch/x86/entry/vdso/vdso2c.c2
-rw-r--r--arch/x86/entry/vdso/vdso32-setup.c28
-rw-r--r--arch/x86/entry/vdso/vdso32/int80.S56
-rw-r--r--arch/x86/entry/vdso/vdso32/syscall.S75
-rw-r--r--arch/x86/entry/vdso/vdso32/sysenter.S116
-rw-r--r--arch/x86/entry/vdso/vdso32/system_call.S57
-rw-r--r--arch/x86/entry/vdso/vma.c13
-rw-r--r--arch/x86/entry/vsyscall/vsyscall_64.c9
16 files changed, 504 insertions, 918 deletions
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 80dcc9261ca3..a89fdbc1f0be 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -24,10 +24,19 @@
#include <asm/desc.h>
#include <asm/traps.h>
+#include <asm/vdso.h>
+#include <asm/uaccess.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
+static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
+{
+ unsigned long top_of_stack =
+ (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
+ return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+}
+
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible void enter_from_user_mode(void)
@@ -66,13 +75,14 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
*/
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
unsigned long ret = 0;
u32 work;
- BUG_ON(regs != task_pt_regs(current));
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ BUG_ON(regs != task_pt_regs(current));
- work = ACCESS_ONCE(current_thread_info()->flags) &
- _TIF_WORK_SYSCALL_ENTRY;
+ work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
#ifdef CONFIG_CONTEXT_TRACKING
/*
@@ -154,11 +164,12 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
unsigned long phase1_result)
{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
long ret = 0;
- u32 work = ACCESS_ONCE(current_thread_info()->flags) &
- _TIF_WORK_SYSCALL_ENTRY;
+ u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;
- BUG_ON(regs != task_pt_regs(current));
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ BUG_ON(regs != task_pt_regs(current));
/*
* If we stepped into a sysenter/syscall insn, it trapped in
@@ -207,19 +218,12 @@ long syscall_trace_enter(struct pt_regs *regs)
return syscall_trace_enter_phase2(regs, arch, phase1_result);
}
-static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
-{
- unsigned long top_of_stack =
- (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
- return (struct thread_info *)(top_of_stack - THREAD_SIZE);
-}
+#define EXIT_TO_USERMODE_LOOP_FLAGS \
+ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
+ _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY)
-/* Called with IRQs disabled. */
-__visible void prepare_exit_to_usermode(struct pt_regs *regs)
+static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
{
- if (WARN_ON(!irqs_disabled()))
- local_irq_disable();
-
/*
* In order to return to user mode, we need to have IRQs off with
* none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
@@ -229,14 +233,6 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
* work to clear some of the flags can sleep.
*/
while (true) {
- u32 cached_flags =
- READ_ONCE(pt_regs_to_thread_info(regs)->flags);
-
- if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
- _TIF_UPROBE | _TIF_NEED_RESCHED |
- _TIF_USER_RETURN_NOTIFY)))
- break;
-
/* We have work to do. */
local_irq_enable();
@@ -260,50 +256,81 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
/* Disable IRQs and retry */
local_irq_disable();
+
+ cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+ if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+ break;
+
}
+}
+
+/* Called with IRQs disabled. */
+__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+ u32 cached_flags;
+
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
+ local_irq_disable();
+
+ lockdep_sys_exit();
+
+ cached_flags =
+ READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+ if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
+ exit_to_usermode_loop(regs, cached_flags);
user_enter();
}
+#define SYSCALL_EXIT_WORK_FLAGS \
+ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \
+ _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)
+
+static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
+{
+ bool step;
+
+ audit_syscall_exit(regs);
+
+ if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+ trace_sys_exit(regs, regs->ax);
+
+ /*
+ * If TIF_SYSCALL_EMU is set, we only get here because of
+ * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+ * We already reported this syscall instruction in
+ * syscall_trace_enter().
+ */
+ step = unlikely(
+ (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
+ == _TIF_SINGLESTEP);
+ if (step || cached_flags & _TIF_SYSCALL_TRACE)
+ tracehook_report_syscall_exit(regs, step);
+}
+
/*
* Called with IRQs on and fully valid regs. Returns with IRQs off in a
* state such that we can immediately switch to user mode.
*/
-__visible void syscall_return_slowpath(struct pt_regs *regs)
+__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
struct thread_info *ti = pt_regs_to_thread_info(regs);
u32 cached_flags = READ_ONCE(ti->flags);
- bool step;
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
- if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
- regs->orig_ax))
+ if (IS_ENABLED(CONFIG_PROVE_LOCKING) &&
+ WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax))
local_irq_enable();
/*
* First do one-time work. If these work items are enabled, we
* want to run them exactly once per syscall exit with IRQs on.
*/
- if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
- _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
- audit_syscall_exit(regs);
-
- if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
- trace_sys_exit(regs, regs->ax);
-
- /*
- * If TIF_SYSCALL_EMU is set, we only get here because of
- * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
- * We already reported this syscall instruction in
- * syscall_trace_enter().
- */
- step = unlikely(
- (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
- == _TIF_SINGLESTEP);
- if (step || cached_flags & _TIF_SYSCALL_TRACE)
- tracehook_report_syscall_exit(regs, step);
- }
+ if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS))
+ syscall_slow_exit_work(regs, cached_flags);
#ifdef CONFIG_COMPAT
/*
@@ -316,3 +343,144 @@ __visible void syscall_return_slowpath(struct pt_regs *regs)
local_irq_disable();
prepare_exit_to_usermode(regs);
}
+
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+/*
+ * Does a 32-bit syscall. Called with IRQs on and does all entry and
+ * exit work and returns with IRQs off. This function is extremely hot
+ * in workloads that use it, and it's usually called from
+ * do_fast_syscall_32, so forcibly inline it to improve performance.
+ */
+#ifdef CONFIG_X86_32
+/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */
+__visible
+#else
+/* 64-bit kernels use do_syscall_32_irqs_off() instead. */
+static
+#endif
+__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+{
+ struct thread_info *ti = pt_regs_to_thread_info(regs);
+ unsigned int nr = (unsigned int)regs->orig_ax;
+
+#ifdef CONFIG_IA32_EMULATION
+ ti->status |= TS_COMPAT;
+#endif
+
+ if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) {
+ /*
+ * Subtlety here: if ptrace pokes something larger than
+ * 2^32-1 into orig_ax, this truncates it. This may or
+ * may not be necessary, but it matches the old asm
+ * behavior.
+ */
+ nr = syscall_trace_enter(regs);
+ }
+
+ if (likely(nr < IA32_NR_syscalls)) {
+ /*
+ * It's possible that a 32-bit syscall implementation
+ * takes a 64-bit parameter but nonetheless assumes that
+ * the high bits are zero. Make sure we zero-extend all
+ * of the args.
+ */
+ regs->ax = ia32_sys_call_table[nr](
+ (unsigned int)regs->bx, (unsigned int)regs->cx,
+ (unsigned int)regs->dx, (unsigned int)regs->si,
+ (unsigned int)regs->di, (unsigned int)regs->bp);
+ }
+
+ syscall_return_slowpath(regs);
+}
+
+#ifdef CONFIG_X86_64
+/* Handles INT80 on 64-bit kernels */
+__visible void do_syscall_32_irqs_off(struct pt_regs *regs)
+{
+ local_irq_enable();
+ do_syscall_32_irqs_on(regs);
+}
+#endif
+
+/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
+__visible long do_fast_syscall_32(struct pt_regs *regs)
+{
+ /*
+ * Called using the internal vDSO SYSENTER/SYSCALL32 calling
+ * convention. Adjust regs so it looks like we entered using int80.
+ */
+
+ unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
+ vdso_image_32.sym_int80_landing_pad;
+
+ /*
+ * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
+ * so that 'regs->ip -= 2' lands back on an int $0x80 instruction.
+ * Fix it up.
+ */
+ regs->ip = landing_pad;
+
+ /*
+ * Fetch ECX from where the vDSO stashed it.
+ *
+ * WARNING: We are in CONTEXT_USER and RCU isn't paying attention!
+ */
+ local_irq_enable();
+ if (
+#ifdef CONFIG_X86_64
+ /*
+ * Micro-optimization: the pointer we're following is explicitly
+ * 32 bits, so it can't be out of range.
+ */
+ __get_user(*(u32 *)&regs->cx,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#else
+ get_user(*(u32 *)&regs->cx,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp)
+#endif
+ ) {
+
+ /* User code screwed up. */
+ local_irq_disable();
+ regs->ax = -EFAULT;
+#ifdef CONFIG_CONTEXT_TRACKING
+ enter_from_user_mode();
+#endif
+ prepare_exit_to_usermode(regs);
+ return 0; /* Keep it simple: use IRET. */
+ }
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs);
+
+#ifdef CONFIG_X86_64
+ /*
+ * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
+ * SYSRETL is available on all 64-bit CPUs, so we don't need to
+ * bother with SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ */
+ return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
+#else
+ /*
+ * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
+ *
+ * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
+ * because the ECX fixup above will ensure that this is essentially
+ * never the case.
+ *
+ * We don't allow syscalls at all from VM86 mode, but we still
+ * need to check VM, because we might be returning from sys_vm86.
+ */
+ return static_cpu_has(X86_FEATURE_SEP) &&
+ regs->cs == __USER_CS && regs->ss == __USER_DS &&
+ regs->ip == landing_pad &&
+ (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
+#endif
+}
+#endif
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index b2909bf8cf70..3eb572ed3d7a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -3,7 +3,7 @@
*
* entry_32.S contains the system-call and low-level fault and trap handling routines.
*
- * Stack layout in 'syscall_exit':
+ * Stack layout while running C code:
* ptrace needs to have all registers on the stack.
* If the order here is changed, it needs to be
* updated in fork.c:copy_process(), signal.c:do_signal(),
@@ -153,13 +153,13 @@
#endif /* CONFIG_X86_32_LAZY_GS */
-.macro SAVE_ALL
+.macro SAVE_ALL pt_regs_ax=%eax
cld
PUSH_GS
pushl %fs
pushl %es
pushl %ds
- pushl %eax
+ pushl \pt_regs_ax
pushl %ebp
pushl %edi
pushl %esi
@@ -211,7 +211,11 @@ ENTRY(ret_from_fork)
popl %eax
pushl $0x0202 # Reset kernel eflags
popfl
- jmp syscall_exit
+
+ /* When we fork, we trace the syscall return in the child, too. */
+ movl %esp, %eax
+ call syscall_return_slowpath
+ jmp restore_all
END(ret_from_fork)
ENTRY(ret_from_kernel_thread)
@@ -224,7 +228,15 @@ ENTRY(ret_from_kernel_thread)
movl PT_EBP(%esp), %eax
call *PT_EBX(%esp)
movl $0, PT_EAX(%esp)
- jmp syscall_exit
+
+ /*
+ * Kernel threads return to userspace as if returning from a syscall.
+ * We should check whether anything actually uses this path and, if so,
+ * consider switching it over to ret_from_fork.
+ */
+ movl %esp, %eax
+ call syscall_return_slowpath
+ jmp restore_all
ENDPROC(ret_from_kernel_thread)
/*
@@ -255,7 +267,6 @@ ret_from_intr:
jb resume_kernel # not returning to v8086 or userspace
ENTRY(resume_userspace)
- LOCKDEP_SYS_EXIT
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_OFF
movl %esp, %eax
@@ -276,76 +287,47 @@ need_resched:
END(resume_kernel)
#endif
-/*
- * SYSENTER_RETURN points to after the SYSENTER instruction
- * in the vsyscall page. See vsyscall-sysentry.S, which defines
- * the symbol.
- */
-
# SYSENTER call handler stub
ENTRY(entry_SYSENTER_32)
movl TSS_sysenter_sp0(%esp), %esp
sysenter_past_esp:
+ pushl $__USER_DS /* pt_regs->ss */
+ pushl %ecx /* pt_regs->cx */
+ pushfl /* pt_regs->flags (except IF = 0) */
+ orl $X86_EFLAGS_IF, (%esp) /* Fix IF */
+ pushl $__USER_CS /* pt_regs->cs */
+ pushl $0 /* pt_regs->ip = 0 (placeholder) */
+ pushl %eax /* pt_regs->orig_ax */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+
/*
- * Interrupts are disabled here, but we can't trace it until
- * enough kernel state to call TRACE_IRQS_OFF can be called - but
- * we immediately enable interrupts at that point anyway.
- */
- pushl $__USER_DS
- pushl %ebp
- pushfl
- orl $X86_EFLAGS_IF, (%esp)
- pushl $__USER_CS
- /*
- * Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary: TI_sysenter_return
- * is relative to thread_info, which is at the bottom of the
- * kernel stack page. 4*4 means the 4 words pushed above;
- * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
- * and THREAD_SIZE takes us to the bottom.
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
*/
- pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
-
- pushl %eax
- SAVE_ALL
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-/*
- * Load the potential sixth argument from user stack.
- * Careful about security.
- */
- cmpl $__PAGE_OFFSET-3, %ebp
- jae syscall_fault
- ASM_STAC
-1: movl (%ebp), %ebp
- ASM_CLAC
- movl %ebp, PT_EBP(%esp)
- _ASM_EXTABLE(1b, syscall_fault)
+ TRACE_IRQS_OFF
- GET_THREAD_INFO(%ebp)
+ movl %esp, %eax
+ call do_fast_syscall_32
+ testl %eax, %eax
+ jz .Lsyscall_32_done
- testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
- jnz syscall_trace_entry
-sysenter_do_call:
- cmpl $(NR_syscalls), %eax
- jae sysenter_badsys
- call *sys_call_table(, %eax, 4)
-sysenter_after_call:
- movl %eax, PT_EAX(%esp)
- LOCKDEP_SYS_EXIT
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl TI_flags(%ebp), %ecx
- testl $_TIF_ALLWORK_MASK, %ecx
- jnz syscall_exit_work_irqs_off
-sysenter_exit:
-/* if something modifies registers it must also disable sysexit */
- movl PT_EIP(%esp), %edx
- movl PT_OLDESP(%esp), %ecx
- xorl %ebp, %ebp
- TRACE_IRQS_ON
+/* Opportunistic SYSEXIT */
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ movl PT_EIP(%esp), %edx /* pt_regs->ip */
+ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */
1: mov PT_FS(%esp), %fs
PTGS_TO_GS
+ popl %ebx /* pt_regs->bx */
+ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */
+ popl %esi /* pt_regs->si */
+ popl %edi /* pt_regs->di */
+ popl %ebp /* pt_regs->bp */
+ popl %eax /* pt_regs->ax */
+
+ /*
+ * Return back to the vDSO, which will pop ecx and edx.
+ * Don't bother with DS and ES (they already contain __USER_DS).
+ */
ENABLE_INTERRUPTS_SYSEXIT
.pushsection .fixup, "ax"
@@ -359,21 +341,18 @@ ENDPROC(entry_SYSENTER_32)
# system call handler stub
ENTRY(entry_INT80_32)
ASM_CLAC
- pushl %eax # save orig_eax
- SAVE_ALL
- GET_THREAD_INFO(%ebp)
- # system call tracing in operation / emulation
- testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
- jnz syscall_trace_entry
- cmpl $(NR_syscalls), %eax
- jae syscall_badsys
-syscall_call:
- call *sys_call_table(, %eax, 4)
-syscall_after_call:
- movl %eax, PT_EAX(%esp) # store the return value
-syscall_exit:
- LOCKDEP_SYS_EXIT
- jmp syscall_exit_work
+ pushl %eax /* pt_regs->orig_ax */
+ SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */
+
+ /*
+ * User mode is traced as though IRQs are on. Unlike the 64-bit
+ * case, INT80 is a trap gate on 32-bit kernels, so interrupts
+ * are already on (unless user code is messing around with iopl).
+ */
+
+ movl %esp, %eax
+ call do_syscall_32_irqs_on
+.Lsyscall_32_done:
restore_all:
TRACE_IRQS_IRET
@@ -450,47 +429,6 @@ ldt_ss:
#endif
ENDPROC(entry_INT80_32)
- # perform syscall exit tracing
- ALIGN
-syscall_trace_entry:
- movl $-ENOSYS, PT_EAX(%esp)
- movl %esp, %eax
- call syscall_trace_enter
- /* What it returned is what we'll actually use. */
- cmpl $(NR_syscalls), %eax
- jnae syscall_call
- jmp syscall_exit
-END(syscall_trace_entry)
-
- # perform syscall exit tracing
- ALIGN
-syscall_exit_work_irqs_off:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_ANY)
-
-syscall_exit_work:
- movl %esp, %eax
- call syscall_return_slowpath
- jmp restore_all
-END(syscall_exit_work)
-
-syscall_fault:
- ASM_CLAC
- GET_THREAD_INFO(%ebp)
- movl $-EFAULT, PT_EAX(%esp)
- jmp resume_userspace
-END(syscall_fault)
-
-syscall_badsys:
- movl $-ENOSYS, %eax
- jmp syscall_after_call
-END(syscall_badsys)
-
-sysenter_badsys:
- movl $-ENOSYS, %eax
- jmp sysenter_after_call
-END(sysenter_badsys)
-
.macro FIXUP_ESPFIX_STACK
/*
* Switch back for ESPFIX stack to the normal zerobased stack
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 055a01de7c8d..53616ca03244 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -391,20 +391,16 @@ GLOBAL(stub_execveat)
jmp return_from_execve
END(stub_execveat)
-#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION)
+#if defined(CONFIG_X86_X32_ABI)
.align 8
GLOBAL(stub_x32_execve)
-GLOBAL(stub32_execve)
call compat_sys_execve
jmp return_from_execve
-END(stub32_execve)
END(stub_x32_execve)
.align 8
GLOBAL(stub_x32_execveat)
-GLOBAL(stub32_execveat)
call compat_sys_execveat
jmp return_from_execve
-END(stub32_execveat)
END(stub_x32_execveat)
#endif
@@ -557,7 +553,6 @@ ret_from_intr:
jz retint_kernel
/* Interrupt came from user space */
- LOCKDEP_SYS_EXIT_IRQ
GLOBAL(retint_user)
mov %rsp,%rdi
call prepare_exit_to_usermode
@@ -587,7 +582,7 @@ retint_kernel:
* At this label, code paths which return to kernel and to user,
* which come from interrupts/exception and from syscalls, merge.
*/
-restore_regs_and_iret:
+GLOBAL(restore_regs_and_iret)
RESTORE_EXTRA_REGS
restore_c_regs_and_iret:
RESTORE_C_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index a9360d40fb7f..c3201830a85e 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -16,16 +16,6 @@
#include <linux/linkage.h>
#include <linux/err.h>
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE 0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-# define sysexit_audit ia32_ret_from_sys_call_irqs_off
-# define sysretl_audit ia32_ret_from_sys_call_irqs_off
-#endif
-
.section .entry.text, "ax"
#ifdef CONFIG_PARAVIRT
@@ -58,219 +48,87 @@ ENDPROC(native_usergs_sysret32)
* with the int 0x80 path.
*/
ENTRY(entry_SYSENTER_compat)
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
+ /* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- ENABLE_INTERRUPTS(CLBR_NONE)
- /* Zero-extending 32-bit regs, do not remove */
- movl %ebp, %ebp
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
movl %eax, %eax
- movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
-
/* Construct struct pt_regs on stack */
pushq $__USER32_DS /* pt_regs->ss */
- pushq %rbp /* pt_regs->sp */
- pushfq /* pt_regs->flags */
+ pushq %rcx /* pt_regs->sp */
+
+ /*
+ * Push flags. This is nasty. First, interrupts are currently
+ * off, but we need pt_regs->flags to have IF set. Second, even
+ * if TF was set when SYSENTER started, it's clear by now. We fix
+ * that later using TIF_SINGLESTEP.
+ */
+ pushfq /* pt_regs->flags (except IF = 0) */
+ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */
+ ASM_CLAC /* Clear AC after saving FLAGS */
+
pushq $__USER32_CS /* pt_regs->cs */
- pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */
+ xorq %r8,%r8
+ pushq %r8 /* pt_regs->ip = 0 (placeholder) */
pushq %rax /* pt_regs->orig_ax */
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rcx /* pt_regs->cx */
+ pushq %rcx /* pt_regs->cx (will be overwritten) */
pushq $-ENOSYS /* pt_regs->ax */
+ pushq %r8 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r9 = 0 */
+ pushq %r8 /* pt_regs->r10 = 0 */
+ pushq %r8 /* pt_regs->r11 = 0 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r8 /* pt_regs->r12 = 0 */
+ pushq %r8 /* pt_regs->r13 = 0 */
+ pushq %r8 /* pt_regs->r14 = 0 */
+ pushq %r8 /* pt_regs->r15 = 0 */
cld
- sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
-
- /*
- * no need to do an access_ok check here because rbp has been
- * 32-bit zero extended
- */
- ASM_STAC
-1: movl (%rbp), %ebp
- _ASM_EXTABLE(1b, ia32_badarg)
- ASM_CLAC
/*
* Sysenter doesn't filter flags, so we need to clear NT
* ourselves. To save a few cycles, we can check whether
* NT was set instead of doing an unconditional popfq.
+ * This needs to happen before enabling interrupts so that
+ * we don't get preempted with NT set.
+ *
+ * NB.: sysenter_fix_flags is a label with the code under it moved
+ * out-of-line as an optimization: NT is unlikely to be set in the
+ * majority of the cases and instead of polluting the I$ unnecessarily,
+ * we're keeping that code behind a branch which will predict as
+ * not-taken and therefore its instructions won't be fetched.
*/
testl $X86_EFLAGS_NT, EFLAGS(%rsp)
jnz sysenter_fix_flags
sysenter_flags_fixed:
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysenter_tracesys
-
-sysenter_do_call:
- /* 32-bit syscall -> 64-bit C ABI argument conversion */
- movl %edi, %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
- xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
- movl %ebx, %edi /* arg1 */
- movl %edx, %edx /* arg3 (zero extension) */
-sysenter_dispatch:
- cmpq $(IA32_NR_syscalls-1), %rax
- ja 1f
- call *ia32_sys_call_table(, %rax, 8)
- movq %rax, RAX(%rsp)
-1:
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysexit_audit
-sysexit_from_sys_call:
/*
- * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
- * NMI between STI and SYSEXIT has poorly specified behavior,
- * and and NMI followed by an IRQ with usergs is fatal. So
- * we just pretend we're using SYSEXIT but we really use
- * SYSRETL instead.
- *
- * This code path is still called 'sysexit' because it pairs
- * with 'sysenter' and it uses the SYSENTER calling convention.
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
*/
- andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- movl RIP(%rsp), %ecx /* User %eip */
- movq RAX(%rsp), %rax
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- xorl %edx, %edx /* Do not leak kernel information */
- xorq %r8, %r8
- xorq %r9, %r9
- xorq %r10, %r10
- movl EFLAGS(%rsp), %r11d /* User eflags */
- TRACE_IRQS_ON
-
- /*
- * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT,
- * since it avoids a dicey window with interrupts enabled.
- */
- movl RSP(%rsp), %esp
-
- /*
- * USERGS_SYSRET32 does:
- * gsbase = user's gs base
- * eip = ecx
- * rflags = r11
- * cs = __USER32_CS
- * ss = __USER_DS
- *
- * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
- *
- * pop %ebp
- * pop %edx
- * pop %ecx
- *
- * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
- * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's
- * address (already known to user code), and R12-R15 are
- * callee-saved and therefore don't contain any interesting
- * kernel data.
- */
- USERGS_SYSRET32
-
-#ifdef CONFIG_AUDITSYSCALL
- .macro auditsys_entry_common
- /*
- * At this point, registers hold syscall args in the 32-bit syscall ABI:
- * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP.
- *
- * We want to pass them to __audit_syscall_entry(), which is a 64-bit
- * C function with 5 parameters, so shuffle them to match what
- * the function expects: RDI,RSI,RDX,RCX,R8.
- */
- movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */
- xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */
- /* arg3 (RDX) <= 2nd syscall arg (ECX) */
- movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */
- movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */
- call __audit_syscall_entry
-
- /*
- * We are going to jump back to the syscall dispatch code.
- * Prepare syscall args as required by the 64-bit C ABI.
- * Registers clobbered by __audit_syscall_entry() are
- * loaded from pt_regs on stack:
- */
- movl ORIG_RAX(%rsp), %eax /* syscall number */
- movl %ebx, %edi /* arg1 */
- movl RCX(%rsp), %esi /* arg2 */
- movl RDX(%rsp), %edx /* arg3 */
- movl RSI(%rsp), %ecx /* arg4 */
- movl RDI(%rsp), %r8d /* arg5 */
- .endm
-
- .macro auditsys_exit exit
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_ret_from_sys_call
- movl %eax, %esi /* second arg, syscall return value */
- cmpl $-MAX_ERRNO, %eax /* is it an error ? */
- jbe 1f
- movslq %eax, %rsi /* if error sign extend to 64 bits */
-1: setbe %al /* 1 if error, 0 if not */
- movzbl %al, %edi /* zero-extend that into %edi */
- call __audit_syscall_exit
- movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi
- DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz \exit
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- jmp int_ret_from_sys_call_irqs_off
- .endm
-sysenter_auditsys:
- auditsys_entry_common
- movl %ebp, %r9d /* reload 6th syscall arg */
- jmp sysenter_dispatch
-
-sysexit_audit:
- auditsys_exit sysexit_from_sys_call
-#endif
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+ testl %eax, %eax
+ jz .Lsyscall_32_done
+ jmp sysret32_from_system_call
sysenter_fix_flags:
- pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED)
+ pushq $X86_EFLAGS_FIXED
popfq
jmp sysenter_flags_fixed
-
-sysenter_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz sysenter_auditsys
-#endif
- SAVE_EXTRA_REGS
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- movq %rsp, %rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
-
- /* Reload arg registers from stack. (see sysenter_tracesys) */
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
-
- RESTORE_EXTRA_REGS
- jmp sysenter_do_call
ENDPROC(entry_SYSENTER_compat)
/*
@@ -298,21 +156,14 @@ ENDPROC(entry_SYSENTER_compat)
* edi arg5
* esp user stack
* 0(%esp) arg6
- *
- * This is purely a fast path. For anything complicated we use the int 0x80
- * path below. We set up a complete hardware stack frame to share code
- * with the int 0x80 path.
*/
ENTRY(entry_SYSCALL_compat)
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
+ /* Interrupts are off on entry. */
SWAPGS_UNSAFE_STACK
+
+ /* Stash user ESP and switch to the kernel stack. */
movl %esp, %r8d
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
- ENABLE_INTERRUPTS(CLBR_NONE)
/* Zero-extending 32-bit regs, do not remove */
movl %eax, %eax
@@ -327,162 +178,67 @@ ENTRY(entry_SYSCALL_compat)
pushq %rdi /* pt_regs->di */
pushq %rsi /* pt_regs->si */
pushq %rdx /* pt_regs->dx */
- pushq %rbp /* pt_regs->cx */
- movl %ebp, %ecx
+ pushq %rcx /* pt_regs->cx (will be overwritten) */
pushq $-ENOSYS /* pt_regs->ax */
- sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */
+ xorq %r8,%r8
+ pushq %r8 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r9 = 0 */
+ pushq %r8 /* pt_regs->r10 = 0 */
+ pushq %r8 /* pt_regs->r11 = 0 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r8 /* pt_regs->r12 = 0 */
+ pushq %r8 /* pt_regs->r13 = 0 */
+ pushq %r8 /* pt_regs->r14 = 0 */
+ pushq %r8 /* pt_regs->r15 = 0 */
/*
- * No need to do an access_ok check here because r8 has been
- * 32-bit zero extended:
+ * User mode is traced as though IRQs are on, and SYSENTER
+ * turned them off.
*/
- ASM_STAC
-1: movl (%r8), %r9d
- _ASM_EXTABLE(1b, ia32_badarg)
- ASM_CLAC
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz cstar_tracesys
-
-cstar_do_call:
- /* 32-bit syscall -> 64-bit C ABI argument conversion */
- movl %edi, %r8d /* arg5 */
- /* r9 already loaded */ /* arg6 */
- xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
- movl %ebx, %edi /* arg1 */
- movl %edx, %edx /* arg3 (zero extension) */
-
-cstar_dispatch:
- cmpq $(IA32_NR_syscalls-1), %rax
- ja 1f
-
- call *ia32_sys_call_table(, %rax, 8)
- movq %rax, RAX(%rsp)
-1:
- DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz sysretl_audit
-sysretl_from_sys_call:
- andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl RIP(%rsp), %ecx
- movl EFLAGS(%rsp), %r11d
- movq RAX(%rsp), %rax
- xorq %r10, %r10
- xorq %r9, %r9
- xorq %r8, %r8
- TRACE_IRQS_ON
- movl RSP(%rsp), %esp
- /*
- * 64-bit->32-bit SYSRET restores eip from ecx,
- * eflags from r11 (but RF and VM bits are forced to 0),
- * cs and ss are loaded from MSRs.
- * (Note: 32-bit->32-bit SYSRET is different: since r11
- * does not exist, it merely sets eflags.IF=1).
+ movq %rsp, %rdi
+ call do_fast_syscall_32
+ testl %eax, %eax
+ jz .Lsyscall_32_done
+
+ /* Opportunistic SYSRET */
+sysret32_from_system_call:
+ TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ movq RBX(%rsp), %rbx /* pt_regs->rbx */
+ movq RBP(%rsp), %rbp /* pt_regs->rbp */
+ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
+ movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */
+ addq $RAX, %rsp /* Skip r8-r15 */
+ popq %rax /* pt_regs->rax */
+ popq %rdx /* Skip pt_regs->cx */
+ popq %rdx /* pt_regs->dx */
+ popq %rsi /* pt_regs->si */
+ popq %rdi /* pt_regs->di */
+
+ /*
+ * USERGS_SYSRET32 does:
+ * GSBASE = user's GS base
+ * EIP = ECX
+ * RFLAGS = R11
+ * CS = __USER32_CS
+ * SS = __USER_DS
+ *
+ * ECX will not match pt_regs->cx, but we're returning to a vDSO
+ * trampoline that will fix up RCX, so this is okay.
*
- * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss
- * descriptor is not reinitialized. This means that we must
- * avoid SYSRET with SS == NULL, which could happen if we schedule,
- * exit the kernel, and re-enter using an interrupt vector. (All
- * interrupt entries on x86_64 set SS to NULL.) We prevent that
- * from happening by reloading SS in __switch_to.
- */
- USERGS_SYSRET32
-
-#ifdef CONFIG_AUDITSYSCALL
-cstar_auditsys:
- movl %r9d, R9(%rsp) /* register to be clobbered by call */
- auditsys_entry_common
- movl R9(%rsp), %r9d /* reload 6th syscall arg */
- jmp cstar_dispatch
-
-sysretl_audit:
- auditsys_exit sysretl_from_sys_call
-#endif
-
-cstar_tracesys:
-#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jz cstar_auditsys
-#endif
- xchgl %r9d, %ebp
- SAVE_EXTRA_REGS
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %r9, R9(%rsp)
- movq %rax, R8(%rsp)
- movq %rsp, %rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
- movl R9(%rsp), %r9d
-
- /* Reload arg registers from stack. (see sysenter_tracesys) */
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
-
- RESTORE_EXTRA_REGS
- xchgl %ebp, %r9d
- jmp cstar_do_call
+ * R12-R15 are callee-saved, so they contain whatever was in them
+ * when the system call started, which is already known to user
+ * code. We zero R8-R10 to avoid info leaks.
+ */
+ xorq %r8, %r8
+ xorq %r9, %r9
+ xorq %r10, %r10
+ movq RSP-ORIG_RAX(%rsp), %rsp
+ USERGS_SYSRET32
END(entry_SYSCALL_compat)
-ia32_badarg:
- /*
- * So far, we've entered kernel mode, set AC, turned on IRQs, and
- * saved C regs except r8-r11. We haven't done any of the other
- * standard entry work, though. We want to bail, but we shouldn't
- * treat this as a syscall entry since we don't even know what the
- * args are. Instead, treat this as a non-syscall entry, finish
- * the entry work, and immediately exit after setting AX = -EFAULT.
- *
- * We're really just being polite here. Killing the task outright
- * would be a reasonable action, too. Given that the only valid
- * way to have gotten here is through the vDSO, and we already know
- * that the stack pointer is bad, the task isn't going to survive
- * for long no matter what we do.
- */
-
- ASM_CLAC /* undo STAC */
- movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */
-
- /* Fill in the rest of pt_regs */
- xorl %eax, %eax
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- SAVE_EXTRA_REGS
-
- /* Turn IRQs back off. */
- DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
-
- /* Now finish entering normal kernel mode. */
-#ifdef CONFIG_CONTEXT_TRACKING
- call enter_from_user_mode
-#endif
-
- /* And exit again. */
- jmp retint_user
-
-ia32_ret_from_sys_call_irqs_off:
- TRACE_IRQS_ON
- ENABLE_INTERRUPTS(CLBR_NONE)
-
-ia32_ret_from_sys_call:
- xorl %eax, %eax /* Do not leak kernel information */
- movq %rax, R11(%rsp)
- movq %rax, R10(%rsp)
- movq %rax, R9(%rsp)
- movq %rax, R8(%rsp)
- jmp int_ret_from_sys_call
-
/*
* Emulated IA32 system calls via int 0x80.
*
@@ -507,14 +263,17 @@ ia32_ret_from_sys_call:
ENTRY(entry_INT80_compat)
/*
* Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
*/
PARAVIRT_ADJUST_EXCEPTION_FRAME
SWAPGS
- ENABLE_INTERRUPTS(CLBR_NONE)
- /* Zero-extending 32-bit regs, do not remove */
+ /*
+ * User tracing code (ptrace or signal handlers) might assume that
+ * the saved RAX contains a 32-bit number when we're invoking a 32-bit
+ * syscall. Just in case the high bits are nonzero, zero-extend
+ * the syscall number. (This could almost certainly be deleted
+ * with no ill effects.)
+ */
movl %eax, %eax
/* Construct struct pt_regs on stack (iret frame is already on stack) */
@@ -524,67 +283,37 @@ ENTRY(entry_INT80_compat)
pushq %rdx /* pt_regs->dx */
pushq %rcx /* pt_regs->cx */
pushq $-ENOSYS /* pt_regs->ax */
- pushq $0 /* pt_regs->r8 */
- pushq $0 /* pt_regs->r9 */
- pushq $0 /* pt_regs->r10 */
- pushq $0 /* pt_regs->r11 */
+ xorq %r8,%r8
+ pushq %r8 /* pt_regs->r8 = 0 */
+ pushq %r8 /* pt_regs->r9 = 0 */
+ pushq %r8 /* pt_regs->r10 = 0 */
+ pushq %r8 /* pt_regs->r11 = 0 */
+ pushq %rbx /* pt_regs->rbx */
+ pushq %rbp /* pt_regs->rbp */
+ pushq %r12 /* pt_regs->r12 */
+ pushq %r13 /* pt_regs->r13 */
+ pushq %r14 /* pt_regs->r14 */
+ pushq %r15 /* pt_regs->r15 */
cld
- sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */
-
- orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
- testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
- jnz ia32_tracesys
-
-ia32_do_call:
- /* 32-bit syscall -> 64-bit C ABI argument conversion */
- movl %edi, %r8d /* arg5 */
- movl %ebp, %r9d /* arg6 */
- xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */
- movl %ebx, %edi /* arg1 */
- movl %edx, %edx /* arg3 (zero extension) */
- cmpq $(IA32_NR_syscalls-1), %rax
- ja 1f
- call *ia32_sys_call_table(, %rax, 8)
- movq %rax, RAX(%rsp)
-1:
- jmp int_ret_from_sys_call
-
-ia32_tracesys:
- SAVE_EXTRA_REGS
- movq %rsp, %rdi /* &pt_regs -> arg1 */
- call syscall_trace_enter
/*
- * Reload arg registers from stack in case ptrace changed them.
- * Don't reload %eax because syscall_trace_enter() returned
- * the %rax value we should see. But do truncate it to 32 bits.
- * If it's -1 to make us punt the syscall, then (u32)-1 is still
- * an appropriately invalid value.
+ * User mode is traced as though IRQs are on, and the interrupt
+ * gate turned them off.
*/
- movl RCX(%rsp), %ecx
- movl RDX(%rsp), %edx
- movl RSI(%rsp), %esi
- movl RDI(%rsp), %edi
- movl %eax, %eax /* zero extension */
- RESTORE_EXTRA_REGS
- jmp ia32_do_call
-END(entry_INT80_compat)
+ TRACE_IRQS_OFF
- .macro PTREGSCALL label, func
- ALIGN
-GLOBAL(\label)
- leaq \func(%rip), %rax
- jmp ia32_ptregs_common
- .endm
+ movq %rsp, %rdi
+ call do_syscall_32_irqs_off
+.Lsyscall_32_done:
- PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
- PTREGSCALL stub32_sigreturn, sys32_sigreturn
- PTREGSCALL stub32_fork, sys_fork
- PTREGSCALL stub32_vfork, sys_vfork
+ /* Go back to user mode. */
+ TRACE_IRQS_ON
+ SWAPGS
+ jmp restore_regs_and_iret
+END(entry_INT80_compat)
ALIGN
GLOBAL(stub32_clone)
- leaq sys_clone(%rip), %rax
/*
* The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr).
* The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val).
@@ -593,12 +322,4 @@ GLOBAL(stub32_clone)
* so we need to swap arguments here before calling it:
*/
xchg %r8, %rcx
- jmp ia32_ptregs_common
-
- ALIGN
-ia32_ptregs_common:
- SAVE_EXTRA_REGS 8
- call *%rax
- RESTORE_EXTRA_REGS 8
- ret
-END(ia32_ptregs_common)
+ jmp sys_clone
diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c
index 8ea34f94e973..9a6649857106 100644
--- a/arch/x86/entry/syscall_32.c
+++ b/arch/x86/entry/syscall_32.c
@@ -4,24 +4,21 @@
#include <linux/sys.h>
#include <linux/cache.h>
#include <asm/asm-offsets.h>
+#include <asm/syscall.h>
#ifdef CONFIG_IA32_EMULATION
#define SYM(sym, compat) compat
#else
#define SYM(sym, compat) sym
-#define ia32_sys_call_table sys_call_table
-#define __NR_syscall_compat_max __NR_syscall_max
#endif
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
-typedef asmlinkage void (*sys_call_ptr_t)(void);
-
-extern asmlinkage void sys_ni_syscall(void);
+extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
__visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = {
/*
diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c
index 4ac730b37f0b..41283d22be7a 100644
--- a/arch/x86/entry/syscall_64.c
+++ b/arch/x86/entry/syscall_64.c
@@ -14,13 +14,13 @@
# define __SYSCALL_X32(nr, sym, compat) /* nothing */
#endif
-#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ;
+#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ;
#include <asm/syscalls_64.h>
#undef __SYSCALL_64
#define __SYSCALL_64(nr, sym, compat) [nr] = sym,
-extern void sys_ni_syscall(void);
+extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long);
asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
/*
diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index 7663c455b9f6..caa2c712d1e7 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -8,7 +8,7 @@
#
0 i386 restart_syscall sys_restart_syscall
1 i386 exit sys_exit
-2 i386 fork sys_fork stub32_fork
+2 i386 fork sys_fork sys_fork
3 i386 read sys_read
4 i386 write sys_write
5 i386 open sys_open compat_sys_open
@@ -17,7 +17,7 @@
8 i386 creat sys_creat
9 i386 link sys_link
10 i386 unlink sys_unlink
-11 i386 execve sys_execve stub32_execve
+11 i386 execve sys_execve compat_sys_execve
12 i386 chdir sys_chdir
13 i386 time sys_time compat_sys_time
14 i386 mknod sys_mknod
@@ -125,7 +125,7 @@
116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
117 i386 ipc sys_ipc compat_sys_ipc
118 i386 fsync sys_fsync
-119 i386 sigreturn sys_sigreturn stub32_sigreturn
+119 i386 sigreturn sys_sigreturn sys32_sigreturn
120 i386 clone sys_clone stub32_clone
121 i386 setdomainname sys_setdomainname
122 i386 uname sys_newuname
@@ -179,7 +179,7 @@
170 i386 setresgid sys_setresgid16
171 i386 getresgid sys_getresgid16
172 i386 prctl sys_prctl
-173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn
+173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn
174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction
175 i386 rt_sigprocmask sys_rt_sigprocmask
176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending
@@ -196,7 +196,7 @@
187 i386 sendfile sys_sendfile compat_sys_sendfile
188 i386 getpmsg
189 i386 putpmsg
-190 i386 vfork sys_vfork stub32_vfork
+190 i386 vfork sys_vfork sys_vfork
191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit
192 i386 mmap2 sys_mmap_pgoff
193 i386 truncate64 sys_truncate64 sys32_truncate64
@@ -364,7 +364,7 @@
355 i386 getrandom sys_getrandom
356 i386 memfd_create sys_memfd_create
357 i386 bpf sys_bpf
-358 i386 execveat sys_execveat stub32_execveat
+358 i386 execveat sys_execveat compat_sys_execveat
359 i386 socket sys_socket
360 i386 socketpair sys_socketpair
361 i386 bind sys_bind
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index a3d0767a6b29..265c0ed68118 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -19,9 +19,7 @@ obj-y += vma.o
# vDSO images to build
vdso_img-$(VDSO64-y) += 64
vdso_img-$(VDSOX32-y) += x32
-vdso_img-$(VDSO32-y) += 32-int80
-vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall
-vdso_img-$(VDSO32-y) += 32-sysenter
+vdso_img-$(VDSO32-y) += 32
obj-$(VDSO32-y) += vdso32-setup.o
@@ -69,7 +67,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE
CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
$(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
-fno-omit-frame-pointer -foptimize-sibling-calls \
- -DDISABLE_BRANCH_PROFILING
+ -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
$(vobjs): KBUILD_CFLAGS += $(CFL)
@@ -122,15 +120,6 @@ $(obj)/%.so: $(obj)/%.so.dbg
$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
$(call if_changed,vdso)
-#
-# Build multiple 32-bit vDSO images to choose from at boot time.
-#
-vdso32.so-$(VDSO32-y) += int80
-vdso32.so-$(CONFIG_IA32_EMULATION) += syscall
-vdso32.so-$(VDSO32-y) += sysenter
-
-vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
-
CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
@@ -139,14 +128,12 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1
override obj-dirs = $(dir $(obj)) $(obj)/vdso32/
targets += vdso32/vdso32.lds
-targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o)
+targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o
targets += vdso32/vclock_gettime.o
-$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%)
-
-KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS))
-$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
-$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32
+KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO
+$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32)
+$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32
KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
@@ -157,13 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
KBUILD_CFLAGS_32 += -fno-omit-frame-pointer
KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING
-$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
+$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32)
-$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \
- $(obj)/vdso32/vdso32.lds \
- $(obj)/vdso32/vclock_gettime.o \
- $(obj)/vdso32/note.o \
- $(obj)/vdso32/%.o
+$(obj)/vdso32.so.dbg: FORCE \
+ $(obj)/vdso32/vdso32.lds \
+ $(obj)/vdso32/vclock_gettime.o \
+ $(obj)/vdso32/note.o \
+ $(obj)/vdso32/system_call.o
$(call if_changed,vdso)
#
@@ -206,4 +193,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
PHONY += vdso_install $(vdso_img_insttargets)
vdso_install: $(vdso_img_insttargets) FORCE
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*
+clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so*
diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c
index 8627db24a7f6..785d9922b106 100644
--- a/arch/x86/entry/vdso/vdso2c.c
+++ b/arch/x86/entry/vdso/vdso2c.c
@@ -98,10 +98,10 @@ struct vdso_sym required_syms[] = {
"VDSO_FAKE_SECTION_TABLE_END", false
},
{"VDSO32_NOTE_MASK", true},
- {"VDSO32_SYSENTER_RETURN", true},
{"__kernel_vsyscall", true},
{"__kernel_sigreturn", true},
{"__kernel_rt_sigreturn", true},
+ {"int80_landing_pad", true},
};
__attribute__((format(printf, 1, 2))) __attribute__((noreturn))
diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c
index e904c270573b..08a317a9ae4b 100644
--- a/arch/x86/entry/vdso/vdso32-setup.c
+++ b/arch/x86/entry/vdso/vdso32-setup.c
@@ -48,35 +48,9 @@ __setup("vdso32=", vdso32_setup);
__setup_param("vdso=", vdso_setup, vdso32_setup, 0);
#endif
-#ifdef CONFIG_X86_64
-
-#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
-#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
-
-#else /* CONFIG_X86_32 */
-
-#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
-#define vdso32_syscall() (0)
-
-#endif /* CONFIG_X86_64 */
-
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
-const struct vdso_image *selected_vdso32;
-#endif
-
int __init sysenter_setup(void)
{
-#ifdef CONFIG_COMPAT
- if (vdso32_syscall())
- selected_vdso32 = &vdso_image_32_syscall;
- else
-#endif
- if (vdso32_sysenter())
- selected_vdso32 = &vdso_image_32_sysenter;
- else
- selected_vdso32 = &vdso_image_32_int80;
-
- init_vdso_image(selected_vdso32);
+ init_vdso_image(&vdso_image_32);
return 0;
}
diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S
deleted file mode 100644
index b15b7c01aedb..000000000000
--- a/arch/x86/entry/vdso/vdso32/int80.S
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Code for the vDSO. This version uses the old int $0x80 method.
- *
- * First get the common code for the sigreturn entry points.
- * This must come first.
- */
-#include "sigreturn.S"
-
- .text
- .globl __kernel_vsyscall
- .type __kernel_vsyscall,@function
- ALIGN
-__kernel_vsyscall:
-.LSTART_vsyscall:
- int $0x80
- ret
-.LEND_vsyscall:
- .size __kernel_vsyscall,.-.LSTART_vsyscall
- .previous
-
- .section .eh_frame,"a",@progbits
-.LSTARTFRAMEDLSI:
- .long .LENDCIEDLSI-.LSTARTCIEDLSI
-.LSTARTCIEDLSI:
- .long 0 /* CIE ID */
- .byte 1 /* Version number */
- .string "zR" /* NUL-terminated augmentation string */
- .uleb128 1 /* Code alignment factor */
- .sleb128 -4 /* Data alignment factor */
- .byte 8 /* Return address register column */
- .uleb128 1 /* Augmentation value length */
- .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
- .byte 0x0c /* DW_CFA_def_cfa */
- .uleb128 4
- .uleb128 4
- .byte 0x88 /* DW_CFA_offset, column 0x8 */
- .uleb128 1
- .align 4
-.LENDCIEDLSI:
- .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
-.LSTARTFDEDLSI:
- .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
- .long .LSTART_vsyscall-. /* PC-relative start address */
- .long .LEND_vsyscall-.LSTART_vsyscall
- .uleb128 0
- .align 4
-.LENDFDEDLSI:
- .previous
-
- /*
- * Pad out the segment to match the size of the sysenter.S version.
- */
-VDSO32_vsyscall_eh_frame_size = 0x40
- .section .data,"aw",@progbits
- .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0
- .previous
diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S
deleted file mode 100644
index 6b286bb5251c..000000000000
--- a/arch/x86/entry/vdso/vdso32/syscall.S
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Code for the vDSO. This version uses the syscall instruction.
- *
- * First get the common code for the sigreturn entry points.
- * This must come first.
- */
-#define SYSCALL_ENTER_KERNEL syscall
-#include "sigreturn.S"
-
-#include <asm/segment.h>
-
- .text
- .globl __kernel_vsyscall
- .type __kernel_vsyscall,@function
- ALIGN
-__kernel_vsyscall:
-.LSTART_vsyscall:
- push %ebp
-.Lpush_ebp:
- movl %ecx, %ebp
- syscall
- movl %ebp, %ecx
- popl %ebp
-.Lpop_ebp:
- ret
-.LEND_vsyscall:
- .size __kernel_vsyscall,.-.LSTART_vsyscall
-
- .section .eh_frame,"a",@progbits
-.LSTARTFRAME:
- .long .LENDCIE-.LSTARTCIE
-.LSTARTCIE:
- .long 0 /* CIE ID */
- .byte 1 /* Version number */
- .string "zR" /* NUL-terminated augmentation string */
- .uleb128 1 /* Code alignment factor */
- .sleb128 -4 /* Data alignment factor */
- .byte 8 /* Return address register column */
- .uleb128 1 /* Augmentation value length */
- .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
- .byte 0x0c /* DW_CFA_def_cfa */
- .uleb128 4
- .uleb128 4
- .byte 0x88 /* DW_CFA_offset, column 0x8 */
- .uleb128 1
- .align 4
-.LENDCIE:
-
- .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */
-.LSTARTFDE1:
- .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */
- .long .LSTART_vsyscall-. /* PC-relative start address */
- .long .LEND_vsyscall-.LSTART_vsyscall
- .uleb128 0 /* Augmentation length */
- /* What follows are the instructions for the table generation.
- We have to record all changes of the stack pointer. */
- .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .uleb128 8
- .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */
- .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */
- .byte 0xc5 /* DW_CFA_restore %ebp */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .uleb128 4
- .align 4
-.LENDFDE1:
- .previous
-
- /*
- * Pad out the segment to match the size of the sysenter.S version.
- */
-VDSO32_vsyscall_eh_frame_size = 0x40
- .section .data,"aw",@progbits
- .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0
- .previous
diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S
deleted file mode 100644
index e354bceee0e0..000000000000
--- a/arch/x86/entry/vdso/vdso32/sysenter.S
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Code for the vDSO. This version uses the sysenter instruction.
- *
- * First get the common code for the sigreturn entry points.
- * This must come first.
- */
-#include "sigreturn.S"
-
-/*
- * The caller puts arg2 in %ecx, which gets pushed. The kernel will use
- * %ecx itself for arg2. The pushing is because the sysexit instruction
- * (found in entry.S) requires that we clobber %ecx with the desired %esp.
- * User code might expect that %ecx is unclobbered though, as it would be
- * for returning via the iret instruction, so we must push and pop.
- *
- * The caller puts arg3 in %edx, which the sysexit instruction requires
- * for %eip. Thus, exactly as for arg2, we must push and pop.
- *
- * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter
- * instruction clobbers %esp, the user's %esp won't even survive entry
- * into the kernel. We store %esp in %ebp. Code in entry.S must fetch
- * arg6 from the stack.
- *
- * You can not use this vsyscall for the clone() syscall because the
- * three words on the parent stack do not get copied to the child.
- */
- .text
- .globl __kernel_vsyscall
- .type __kernel_vsyscall,@function
- ALIGN
-__kernel_vsyscall:
-.LSTART_vsyscall:
- push %ecx
-.Lpush_ecx:
- push %edx
-.Lpush_edx:
- push %ebp
-.Lenter_kernel:
- movl %esp,%ebp
- sysenter
-
- /* 7: align return point with nop's to make disassembly easier */
- .space 7,0x90
-
- /* 14: System call restart point is here! (SYSENTER_RETURN-2) */
- int $0x80
- /* 16: System call normal return point is here! */
-VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */
- pop %ebp
-.Lpop_ebp:
- pop %edx
-.Lpop_edx:
- pop %ecx
-.Lpop_ecx:
- ret
-.LEND_vsyscall:
- .size __kernel_vsyscall,.-.LSTART_vsyscall
- .previous
-
- .section .eh_frame,"a",@progbits
-.LSTARTFRAMEDLSI:
- .long .LENDCIEDLSI-.LSTARTCIEDLSI
-.LSTARTCIEDLSI:
- .long 0 /* CIE ID */
- .byte 1 /* Version number */
- .string "zR" /* NUL-terminated augmentation string */
- .uleb128 1 /* Code alignment factor */
- .sleb128 -4 /* Data alignment factor */
- .byte 8 /* Return address register column */
- .uleb128 1 /* Augmentation value length */
- .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */
- .byte 0x0c /* DW_CFA_def_cfa */
- .uleb128 4
- .uleb128 4
- .byte 0x88 /* DW_CFA_offset, column 0x8 */
- .uleb128 1
- .align 4
-.LENDCIEDLSI:
- .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */
-.LSTARTFDEDLSI:
- .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */
- .long .LSTART_vsyscall-. /* PC-relative start address */
- .long .LEND_vsyscall-.LSTART_vsyscall
- .uleb128 0
- /* What follows are the instructions for the table generation.
- We have to record all changes of the stack pointer. */
- .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x08 /* RA at offset 8 now */
- .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x0c /* RA at offset 12 now */
- .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x10 /* RA at offset 16 now */
- .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */
- /* Finally the epilogue. */
- .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x0c /* RA at offset 12 now */
- .byte 0xc5 /* DW_CFA_restore %ebp */
- .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x08 /* RA at offset 8 now */
- .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */
- .byte 0x0e /* DW_CFA_def_cfa_offset */
- .byte 0x04 /* RA at offset 4 now */
- .align 4
-.LENDFDEDLSI:
- .previous
-
- /*
- * Emit a symbol with the size of this .eh_frame data,
- * to verify it matches the other versions.
- */
-VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI)
diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
new file mode 100644
index 000000000000..93bd8452383f
--- /dev/null
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -0,0 +1,57 @@
+/*
+ * Code for the vDSO. This version uses the old int $0x80 method.
+*/
+
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+#include <asm/alternative-asm.h>
+
+/*
+ * First get the common code for the sigreturn entry points.
+ * This must come first.
+ */
+#include "sigreturn.S"
+
+ .text
+ .globl __kernel_vsyscall
+ .type __kernel_vsyscall,@function
+ ALIGN
+__kernel_vsyscall:
+ CFI_STARTPROC
+ /*
+ * Reshuffle regs so that all of any of the entry instructions
+ * will preserve enough state.
+ */
+ pushl %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx, 0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx, 0
+ movl %esp, %ecx
+
+#ifdef CONFIG_X86_64
+ /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */
+ ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \
+ "syscall", X86_FEATURE_SYSCALL32
+#else
+ ALTERNATIVE "", "sysenter", X86_FEATURE_SEP
+#endif
+
+ /* Enter using int $0x80 */
+ movl (%esp), %ecx
+ int $0x80
+GLOBAL(int80_landing_pad)
+
+ /* Restore ECX and EDX in case they were clobbered. */
+ popl %ecx
+ CFI_RESTORE ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ popl %edx
+ CFI_RESTORE edx
+ CFI_ADJUST_CFA_OFFSET -4
+ ret
+ CFI_ENDPROC
+
+ .size __kernel_vsyscall,.-__kernel_vsyscall
+ .previous
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 434543145d78..64df47148160 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -180,21 +180,10 @@ up_fail:
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
- int ret;
-
if (vdso32_enabled != 1) /* Other values all mean "disabled" */
return 0;
- ret = map_vdso(selected_vdso32, false);
- if (ret)
- return ret;
-
- if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN)
- current_thread_info()->sysenter_return =
- current->mm->context.vdso +
- selected_vdso32->sym_VDSO32_SYSENTER_RETURN;
-
- return 0;
+ return map_vdso(&vdso_image_32, false);
}
#endif
diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index b160c0c6baed..174c2549939d 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -38,7 +38,14 @@
#define CREATE_TRACE_POINTS
#include "vsyscall_trace.h"
-static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
+static enum { EMULATE, NATIVE, NONE } vsyscall_mode =
+#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE)
+ NATIVE;
+#elif defined(CONFIG_LEGACY_VSYSCALL_NONE)
+ NONE;
+#else
+ EMULATE;
+#endif
static int __init vsyscall_setup(char *str)
{