summaryrefslogtreecommitdiffstats
path: root/kernel/entry/common.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2020-12-14 17:13:53 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2020-12-14 17:13:53 -0800
commit1ac0884d5474fea8dc6ceabbd0e870d1bf4b7b42 (patch)
tree5158d582c0f5040a9c8a28fe9051881c5ec7b5eb /kernel/entry/common.c
parentff6135959a9150ad45cb92ca38da270903a74343 (diff)
parentc6156e1da633f241e132eaea3b676d674376d770 (diff)
downloadlinux-1ac0884d5474fea8dc6ceabbd0e870d1bf4b7b42.tar.bz2
Merge tag 'core-entry-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull core entry/exit updates from Thomas Gleixner: "A set of updates for entry/exit handling: - More generalization of entry/exit functionality - The consolidation work to reclaim TIF flags on x86 and also for non-x86 specific TIF flags which are solely relevant for syscall related work and have been moved into their own storage space. The x86 specific part had to be merged in to avoid a major conflict. - The TIF_NOTIFY_SIGNAL work which replaces the inefficient signal delivery mode of task work and results in an impressive performance improvement for io_uring. The non-x86 consolidation of this is going to come seperate via Jens. - The selective syscall redirection facility which provides a clean and efficient way to support the non-Linux syscalls of WINE by catching them at syscall entry and redirecting them to the user space emulation. This can be utilized for other purposes as well and has been designed carefully to avoid overhead for the regular fastpath. This includes the core changes and the x86 support code. - Simplification of the context tracking entry/exit handling for the users of the generic entry code which guarantee the proper ordering and protection. - Preparatory changes to make the generic entry code accomodate S390 specific requirements which are mostly related to their syscall restart mechanism" * tag 'core-entry-2020-12-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits) entry: Add syscall_exit_to_user_mode_work() entry: Add exit_to_user_mode() wrapper entry_Add_enter_from_user_mode_wrapper entry: Rename exit_to_user_mode() entry: Rename enter_from_user_mode() docs: Document Syscall User Dispatch selftests: Add benchmark for syscall user dispatch selftests: Add kselftest for syscall user dispatch entry: Support Syscall User Dispatch on common syscall entry kernel: Implement selective syscall userspace redirection signal: Expose SYS_USER_DISPATCH si_code type x86: vdso: Expose sigreturn address on vdso to the kernel MAINTAINERS: Add entry for common entry code entry: Fix boot for !CONFIG_GENERIC_ENTRY x86: Support HAVE_CONTEXT_TRACKING_OFFSTACK context_tracking: Only define schedule_user() on !HAVE_CONTEXT_TRACKING_OFFSTACK archs sched: Detect call to schedule from critical entry code context_tracking: Don't implement exception_enter/exit() on CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK context_tracking: Introduce HAVE_CONTEXT_TRACKING_OFFSTACK x86: Reclaim unused x86 TI flags ...
Diffstat (limited to 'kernel/entry/common.c')
-rw-r--r--kernel/entry/common.c188
1 files changed, 129 insertions, 59 deletions
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index e9e2df3f3f9e..d6b73937dab3 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -5,20 +5,13 @@
#include <linux/livepatch.h>
#include <linux/audit.h>
+#include "common.h"
+
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
-/**
- * enter_from_user_mode - Establish state when coming from user mode
- *
- * Syscall/interrupt entry disables interrupts, but user mode is traced as
- * interrupts enabled. Also with NO_HZ_FULL RCU might be idle.
- *
- * 1) Tell lockdep that interrupts are disabled
- * 2) Invoke context tracking if enabled to reactivate RCU
- * 3) Trace interrupts off state
- */
-static __always_inline void enter_from_user_mode(struct pt_regs *regs)
+/* See comment for enter_from_user_mode() in entry-common.h */
+static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
{
arch_check_user_regs(regs);
lockdep_hardirqs_off(CALLER_ADDR0);
@@ -31,6 +24,11 @@ static __always_inline void enter_from_user_mode(struct pt_regs *regs)
instrumentation_end();
}
+void noinstr enter_from_user_mode(struct pt_regs *regs)
+{
+ __enter_from_user_mode(regs);
+}
+
static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
{
if (unlikely(audit_context())) {
@@ -42,19 +40,29 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall)
}
static long syscall_trace_enter(struct pt_regs *regs, long syscall,
- unsigned long ti_work)
+ unsigned long work)
{
long ret = 0;
+ /*
+ * Handle Syscall User Dispatch. This must comes first, since
+ * the ABI here can be something that doesn't make sense for
+ * other syscall_work features.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (syscall_user_dispatch(regs))
+ return -1L;
+ }
+
/* Handle ptrace */
- if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
+ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) {
ret = arch_syscall_enter_tracehook(regs);
- if (ret || (ti_work & _TIF_SYSCALL_EMU))
+ if (ret || (work & SYSCALL_WORK_SYSCALL_EMU))
return -1L;
}
/* Do seccomp after ptrace, to catch any tracer changes. */
- if (ti_work & _TIF_SECCOMP) {
+ if (work & SYSCALL_WORK_SECCOMP) {
ret = __secure_computing(NULL);
if (ret == -1L)
return ret;
@@ -63,7 +71,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
/* Either of the above might have changed the syscall number */
syscall = syscall_get_nr(current, regs);
- if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
+ if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, syscall);
syscall_enter_audit(regs, syscall);
@@ -74,11 +82,10 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
static __always_inline long
__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
- unsigned long ti_work;
+ unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
- ti_work = READ_ONCE(current_thread_info()->flags);
- if (ti_work & SYSCALL_ENTER_WORK)
- syscall = syscall_trace_enter(regs, syscall, ti_work);
+ if (work & SYSCALL_WORK_ENTER)
+ syscall = syscall_trace_enter(regs, syscall, work);
return syscall;
}
@@ -92,7 +99,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
{
long ret;
- enter_from_user_mode(regs);
+ __enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
@@ -104,25 +111,14 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
{
- enter_from_user_mode(regs);
+ __enter_from_user_mode(regs);
instrumentation_begin();
local_irq_enable();
instrumentation_end();
}
-/**
- * exit_to_user_mode - Fixup state when exiting to user mode
- *
- * Syscall/interupt exit enables interrupts, but the kernel state is
- * interrupts disabled when this is invoked. Also tell RCU about it.
- *
- * 1) Trace interrupts on state
- * 2) Invoke context tracking if enabled to adjust RCU state
- * 3) Invoke architecture specific last minute exit code, e.g. speculation
- * mitigations, etc.
- * 4) Tell lockdep that interrupts are enabled
- */
-static __always_inline void exit_to_user_mode(void)
+/* See comment for exit_to_user_mode() in entry-common.h */
+static __always_inline void __exit_to_user_mode(void)
{
instrumentation_begin();
trace_hardirqs_on_prepare();
@@ -134,8 +130,21 @@ static __always_inline void exit_to_user_mode(void)
lockdep_hardirqs_on(CALLER_ADDR0);
}
+void noinstr exit_to_user_mode(void)
+{
+ __exit_to_user_mode();
+}
+
/* Workaround to allow gradual conversion of architecture code */
-void __weak arch_do_signal(struct pt_regs *regs) { }
+void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { }
+
+static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work)
+{
+ if (ti_work & _TIF_NOTIFY_SIGNAL)
+ tracehook_notify_signal();
+
+ arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
+}
static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
unsigned long ti_work)
@@ -157,8 +166,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
if (ti_work & _TIF_PATCH_PENDING)
klp_update_patch_state(current);
- if (ti_work & _TIF_SIGPENDING)
- arch_do_signal(regs);
+ if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
+ handle_signal_work(regs, ti_work);
if (ti_work & _TIF_NOTIFY_RESUME) {
tracehook_notify_resume(regs);
@@ -199,35 +208,50 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs)
}
#ifndef _TIF_SINGLESTEP
-static inline bool report_single_step(unsigned long ti_work)
+static inline bool report_single_step(unsigned long work)
{
return false;
}
#else
/*
- * If TIF_SYSCALL_EMU is set, then the only reason to report is when
+ * If SYSCALL_EMU is set, then the only reason to report is when
* TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
* instruction has been already reported in syscall_enter_from_user_mode().
*/
-#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
-
-static inline bool report_single_step(unsigned long ti_work)
+static inline bool report_single_step(unsigned long work)
{
- return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP;
+ if (!(work & SYSCALL_WORK_SYSCALL_EMU))
+ return false;
+
+ return !!(current_thread_info()->flags & _TIF_SINGLESTEP);
}
#endif
-static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
+
+static void syscall_exit_work(struct pt_regs *regs, unsigned long work)
{
bool step;
+ /*
+ * If the syscall was rolled back due to syscall user dispatching,
+ * then the tracers below are not invoked for the same reason as
+ * the entry side was not invoked in syscall_trace_enter(): The ABI
+ * of these syscalls is unknown.
+ */
+ if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) {
+ if (unlikely(current->syscall_dispatch.on_dispatch)) {
+ current->syscall_dispatch.on_dispatch = false;
+ return;
+ }
+ }
+
audit_syscall_exit(regs);
- if (ti_work & _TIF_SYSCALL_TRACEPOINT)
+ if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
trace_sys_exit(regs, syscall_get_return_value(current, regs));
- step = report_single_step(ti_work);
- if (step || ti_work & _TIF_SYSCALL_TRACE)
+ step = report_single_step(work);
+ if (step || work & SYSCALL_WORK_SYSCALL_TRACE)
arch_syscall_exit_tracehook(regs, step);
}
@@ -237,7 +261,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work)
*/
static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
{
- u32 cached_flags = READ_ONCE(current_thread_info()->flags);
+ unsigned long work = READ_ONCE(current_thread_info()->syscall_work);
unsigned long nr = syscall_get_nr(current, regs);
CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -254,23 +278,33 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs)
* enabled, we want to run them exactly once per syscall exit with
* interrupts enabled.
*/
- if (unlikely(cached_flags & SYSCALL_EXIT_WORK))
- syscall_exit_work(regs, cached_flags);
+ if (unlikely(work & SYSCALL_WORK_EXIT))
+ syscall_exit_work(regs, work);
}
-__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs)
{
- instrumentation_begin();
syscall_exit_to_user_mode_prepare(regs);
local_irq_disable_exit_to_user();
exit_to_user_mode_prepare(regs);
+}
+
+void syscall_exit_to_user_mode_work(struct pt_regs *regs)
+{
+ __syscall_exit_to_user_mode_work(regs);
+}
+
+__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ __syscall_exit_to_user_mode_work(regs);
instrumentation_end();
- exit_to_user_mode();
+ __exit_to_user_mode();
}
noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs)
{
- enter_from_user_mode(regs);
+ __enter_from_user_mode(regs);
}
noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
@@ -278,7 +312,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs)
instrumentation_begin();
exit_to_user_mode_prepare(regs);
instrumentation_end();
- exit_to_user_mode();
+ __exit_to_user_mode();
}
noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
@@ -296,7 +330,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* If this entry hit the idle task invoke rcu_irq_enter() whether
* RCU is watching or not.
*
- * Interupts can nest when the first interrupt invokes softirq
+ * Interrupts can nest when the first interrupt invokes softirq
* processing on return which enables interrupts.
*
* Scheduler ticks in the idle task can mark quiescent state and
@@ -307,7 +341,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
* interrupt to invoke rcu_irq_enter(). If that nested interrupt is
* the tick then rcu_flavor_sched_clock_irq() would wrongfully
* assume that it is the first interupt and eventually claim
- * quiescient state and end grace periods prematurely.
+ * quiescent state and end grace periods prematurely.
*
* Unconditionally invoke rcu_irq_enter() so RCU state stays
* consistent.
@@ -319,7 +353,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs)
/*
* If RCU is not watching then the same careful
* sequence vs. lockdep and tracing is required
- * as in irq_enter_from_user_mode().
+ * as in irqentry_enter_from_user_mode().
*/
lockdep_hardirqs_off(CALLER_ADDR0);
rcu_irq_enter();
@@ -397,3 +431,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state)
rcu_irq_exit();
}
}
+
+irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
+{
+ irqentry_state_t irq_state;
+
+ irq_state.lockdep = lockdep_hardirqs_enabled();
+
+ __nmi_enter();
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ lockdep_hardirq_enter();
+ rcu_nmi_enter();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ ftrace_nmi_enter();
+ instrumentation_end();
+
+ return irq_state;
+}
+
+void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state)
+{
+ instrumentation_begin();
+ ftrace_nmi_exit();
+ if (irq_state.lockdep) {
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ }
+ instrumentation_end();
+
+ rcu_nmi_exit();
+ lockdep_hardirq_exit();
+ if (irq_state.lockdep)
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ __nmi_exit();
+}