From 1f484aa6904697f390027c12fba130fa94b20831 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:23 -0700 Subject: x86/entry: Move C entry and exit code to arch/x86/entry/common.c The entry and exit C helpers were confusingly scattered between ptrace.c and signal.c, even though they aren't specific to ptrace or signal handling. Move them together in a new file. This change just moves code around. It doesn't change anything. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 253 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 arch/x86/entry/common.c (limited to 'arch/x86/entry/common.c') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 000000000000..917d0c3cb851 --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,253 @@ +/* + * common.c - C code for kernel entry and exit + * Copyright (c) 2015 Andrew Lutomirski + * GPL v2 + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + +/* + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. + * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * anything else: go to phase 2; pass return value to seccomp + */ +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +{ + unsigned long ret = 0; + u32 work; + + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + /* + * If TIF_NOHZ is set, we are required to call user_exit() before + * doing anything that could touch RCU. + */ + if (work & _TIF_NOHZ) { + user_exit(); + work &= ~_TIF_NOHZ; + } + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -1; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp and/or nohz only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). */ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) +{ + long ret = 0; + u32 work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + BUG_ON(regs != task_pt_regs(current)); + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (work & _TIF_SINGLESTEP) + regs->flags |= X86_EFLAGS_TF; + +#ifdef CONFIG_SECCOMP + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { + /* seccomp failures shouldn't expose any additional code. */ + return -1; + } +#endif + + if (unlikely(work & _TIF_SYSCALL_EMU)) + ret = -1L; + + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + do_audit_syscall_entry(regs, arch); + + return ret ?: regs->orig_ax; +} + +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + +void syscall_trace_leave(struct pt_regs *regs) +{ + bool step; + + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + user_exit(); + + audit_syscall_exit(regs); + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && + !test_thread_flag(TIF_SYSCALL_EMU); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); + + user_enter(); +} + +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +__visible void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) +{ + user_exit(); + + if (thread_info_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + user_enter(); +} -- cgit v1.2.3 From feed36cde0a10adb957445a37e48f957f30b2273 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:25 -0700 Subject: x86/entry: Add enter_from_user_mode() and use it in syscalls Changing the x86 context tracking hooks is dangerous because there are no good checks that we track our context correctly. Add a helper to check that we're actually in CONTEXT_USER when we enter from user mode and wire it up for syscall entries. Subsequent patches will wire this up for all non-NMI entries as well. NMIs are their own special beast and cannot currently switch overall context tracking state. Instead, they have their own special RCU hooks. This is a tiny speedup if !CONFIG_CONTEXT_TRACKING (removes a branch) and a tiny slowdown if CONFIG_CONTEXT_TRACING (adds a layer of indirection). Eventually, we should fix up the core context tracking code to supply a function that does what we want (and can be much simpler than user_exit), which will enable us to get rid of the extra call. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/853b42420066ec3fb856779cdc223a6dcb5d355b.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86/entry/common.c') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 917d0c3cb851..9a327ee24eef 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -28,6 +28,15 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_CONTEXT_TRACKING +/* Called on entry from user mode with IRQs off. */ +__visible void enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit(); +} +#endif + static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) { #ifdef CONFIG_X86_64 @@ -65,14 +74,16 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(current_thread_info()->flags) & _TIF_WORK_SYSCALL_ENTRY; +#ifdef CONFIG_CONTEXT_TRACKING /* * If TIF_NOHZ is set, we are required to call user_exit() before * doing anything that could touch RCU. */ if (work & _TIF_NOHZ) { - user_exit(); + enter_from_user_mode(); work &= ~_TIF_NOHZ; } +#endif #ifdef CONFIG_SECCOMP /* -- cgit v1.2.3 From c5c46f59e4e7c1ab244b8d38f2b61d317df90bba Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:26 -0700 Subject: x86/entry: Add new, comprehensible entry and exit handlers written in C The current x86 entry and exit code, written in a mixture of assembly and C code, is incomprehensible due to being open-coded in a lot of places without coherent documentation. It appears to work primary by luck and duct tape: i.e. obvious runtime failures were fixed on-demand, without re-thinking the design. Due to those reasons our confidence level in that code is low, and it is very difficult to incrementally improve. Add new code written in C, in preparation for simply deleting the old entry code. prepare_exit_to_usermode() is a new function that will handle all slow path exits to user mode. It is called with IRQs disabled and it leaves us in a state in which it is safe to immediately return to user mode. IRQs must not be re-enabled at any point after prepare_exit_to_usermode() returns and user mode is actually entered. (We can, of course, fail to enter user mode and treat that failure as a fresh entry to kernel mode.) All callers of do_notify_resume() will be migrated to call prepare_exit_to_usermode() instead; prepare_exit_to_usermode() needs to do everything that do_notify_resume() does today, but it also takes care of scheduling and context tracking. Unlike do_notify_resume(), it does not need to be called in a loop. syscall_return_slowpath() is exactly what it sounds like: it will be called on any syscall exit slow path. It will replace syscall_trace_leave() and it calls prepare_exit_to_usermode() on the way out. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/c57c8b87661a4152801d7d3786eac2d1a2f209dd.1435952415.git.luto@kernel.org [ Improved the changelog a bit. ] Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 112 +++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 1 deletion(-) (limited to 'arch/x86/entry/common.c') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 9a327ee24eef..febc53086a69 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -207,6 +207,7 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } +/* Deprecated. */ void syscall_trace_leave(struct pt_regs *regs) { bool step; @@ -237,8 +238,117 @@ void syscall_trace_leave(struct pt_regs *regs) user_enter(); } +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + +/* Called with IRQs disabled. */ +__visible void prepare_exit_to_usermode(struct pt_regs *regs) +{ + if (WARN_ON(!irqs_disabled())) + local_irq_disable(); + + /* + * In order to return to user mode, we need to have IRQs off with + * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, + * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags + * can be set at any time on preemptable kernels if we have IRQs on, + * so we need to loop. Disabling preemption wouldn't help: doing the + * work to clear some of the flags can sleep. + */ + while (true) { + u32 cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | + _TIF_UPROBE | _TIF_NEED_RESCHED))) + break; + + /* We have work to do. */ + local_irq_enable(); + + if (cached_flags & _TIF_NEED_RESCHED) + schedule(); + + if (cached_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ + if (cached_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (cached_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + + if (cached_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + /* Disable IRQs and retry */ + local_irq_disable(); + } + + user_enter(); +} + +/* + * Called with IRQs on and fully valid regs. Returns with IRQs off in a + * state such that we can immediately switch to user mode. + */ +__visible void syscall_return_slowpath(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + u32 cached_flags = READ_ONCE(ti->flags); + bool step; + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", + regs->orig_ax)) + local_irq_enable(); + + /* + * First do one-time work. If these work items are enabled, we + * want to run them exactly once per syscall exit with IRQs on. + */ + if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); + } + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. + */ + ti->status &= ~TS_COMPAT; +#endif + + local_irq_disable(); + prepare_exit_to_usermode(regs); +} + /* - * notification of userspace execution resumption + * Deprecated notification of userspace execution resumption * - triggered by the TIF_WORK_MASK flags */ __visible void -- cgit v1.2.3 From d132803e6c611d50c19baedc8ae520203a2baca7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 14:25:16 -0700 Subject: x86/entry: Fix _TIF_USER_RETURN_NOTIFY check in prepare_exit_to_usermode Linus noticed that the early return check was missing _TIF_USER_RETURN_NOTIFY. If the only work flag was _TIF_USER_RETURN_NOTIFY, we'd skip user return notifiers. Fix it. (This is the only missing bit.) This fixes double faults on a KVM host. It's the same issue as last time, except that this time it's very easy to trigger. Apparently no one uses -next as a KVM host. ( I'm still not quite sure what it is that KVM does that blows up so badly if we miss a user return notifier. My best guess is that KVM lets KERNEL_GS_BASE (i.e. the user's gs base) be negative and fixes it up in a user return notifier. If we actually end up in user mode with a negative gs base, we blow up pretty badly. ) Reported-by: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: c5c46f59e4e7 ("x86/entry: Add new, comprehensible entry and exit handlers written in C") Link: http://lkml.kernel.org/r/3f801104d24ee7a6bb1446408d9950777aa63277.1436995419.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/entry/common.c') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index febc53086a69..a3e9c7fa15d9 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -264,7 +264,8 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) READ_ONCE(pt_regs_to_thread_info(regs)->flags); if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | - _TIF_UPROBE | _TIF_NEED_RESCHED))) + _TIF_UPROBE | _TIF_NEED_RESCHED | + _TIF_USER_RETURN_NOTIFY))) break; /* We have work to do. */ -- cgit v1.2.3