summaryrefslogtreecommitdiffstats
path: root/arch/x86/entry
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/entry')
-rw-r--r--arch/x86/entry/Makefile8
-rw-r--r--arch/x86/entry/calling.h25
-rw-r--r--arch/x86/entry/common.c440
-rw-r--r--arch/x86/entry/entry_32.S485
-rw-r--r--arch/x86/entry/entry_64.S840
-rw-r--r--arch/x86/entry/entry_64_compat.S55
-rw-r--r--arch/x86/entry/thunk_64.S14
7 files changed, 761 insertions, 1106 deletions
diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 85eb381259c2..b7a5790d8d63 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -3,7 +3,13 @@
# Makefile for the x86 low level entry code
#
-OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
+KASAN_SANITIZE := n
+UBSAN_SANITIZE := n
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
+CFLAGS_REMOVE_syscall_32.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
+CFLAGS_REMOVE_syscall_64.o = $(CC_FLAGS_FTRACE) -fstack-protector -fstack-protector-strong
CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,)
CFLAGS_syscall_32.o += $(call cc-option,-Wno-override-init,)
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 1c7f13bb6728..4208c1e3f601 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -341,30 +341,13 @@ For 32-bit we have the following conventions - kernel is built with
#endif
.endm
-#endif /* CONFIG_X86_64 */
+#else /* CONFIG_X86_64 */
+# undef UNWIND_HINT_IRET_REGS
+# define UNWIND_HINT_IRET_REGS
+#endif /* !CONFIG_X86_64 */
.macro STACKLEAK_ERASE
#ifdef CONFIG_GCC_PLUGIN_STACKLEAK
call stackleak_erase
#endif
.endm
-
-/*
- * This does 'call enter_from_user_mode' unless we can avoid it based on
- * kernel config or using the static jump infrastructure.
- */
-.macro CALL_enter_from_user_mode
-#ifdef CONFIG_CONTEXT_TRACKING
-#ifdef CONFIG_JUMP_LABEL
- STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0
-#endif
- call enter_from_user_mode
-.Lafter_call_\@:
-#endif
-.endm
-
-#ifdef CONFIG_PARAVIRT_XXL
-#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
-#else
-#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
-#endif
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 76735ec813e6..bd3f14175193 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,11 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
+#ifdef CONFIG_XEN_PV
+#include <xen/xen-ops.h>
+#include <xen/events.h>
+#endif
+
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/vdso.h>
@@ -35,21 +40,67 @@
#include <asm/nospec-branch.h>
#include <asm/io_bitmap.h>
#include <asm/syscall.h>
+#include <asm/irq_stack.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
#ifdef CONFIG_CONTEXT_TRACKING
-/* Called on entry from user mode with IRQs off. */
-__visible inline void enter_from_user_mode(void)
+/**
+ * enter_from_user_mode - Establish state when coming from user mode
+ *
+ * Syscall entry disables interrupts, but user mode is traced as interrupts
+ * enabled. Also with NO_HZ_FULL RCU might be idle.
+ *
+ * 1) Tell lockdep that interrupts are disabled
+ * 2) Invoke context tracking if enabled to reactivate RCU
+ * 3) Trace interrupts off state
+ */
+static noinstr void enter_from_user_mode(void)
{
- CT_WARN_ON(ct_state() != CONTEXT_USER);
+ enum ctx_state state = ct_state();
+
+ lockdep_hardirqs_off(CALLER_ADDR0);
user_exit_irqoff();
+
+ instrumentation_begin();
+ CT_WARN_ON(state != CONTEXT_USER);
+ trace_hardirqs_off_finish();
+ instrumentation_end();
}
#else
-static inline void enter_from_user_mode(void) {}
+static __always_inline void enter_from_user_mode(void)
+{
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
#endif
+/**
+ * exit_to_user_mode - Fixup state when exiting to user mode
+ *
+ * Syscall exit enables interrupts, but the kernel state is interrupts
+ * disabled when this is invoked. Also tell RCU about it.
+ *
+ * 1) Trace interrupts on state
+ * 2) Invoke context tracking if enabled to adjust RCU state
+ * 3) Clear CPU buffers if CPU is affected by MDS and the mitigation is on.
+ * 4) Tell lockdep that interrupts are enabled
+ */
+static __always_inline void exit_to_user_mode(void)
+{
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ user_enter_irqoff();
+ mds_user_clear_cpu_buffers();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+}
+
static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
{
#ifdef CONFIG_X86_64
@@ -179,8 +230,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
}
}
-/* Called with IRQs disabled. */
-__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
+static void __prepare_exit_to_usermode(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info();
u32 cached_flags;
@@ -219,10 +269,14 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
*/
ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED);
#endif
+}
- user_enter_irqoff();
-
- mds_user_clear_cpu_buffers();
+__visible noinstr void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ __prepare_exit_to_usermode(regs);
+ instrumentation_end();
+ exit_to_user_mode();
}
#define SYSCALL_EXIT_WORK_FLAGS \
@@ -251,11 +305,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
tracehook_report_syscall_exit(regs, step);
}
-/*
- * Called with IRQs on and fully valid regs. Returns with IRQs off in a
- * state such that we can immediately switch to user mode.
- */
-__visible inline void syscall_return_slowpath(struct pt_regs *regs)
+static void __syscall_return_slowpath(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info();
u32 cached_flags = READ_ONCE(ti->flags);
@@ -276,15 +326,29 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
syscall_slow_exit_work(regs, cached_flags);
local_irq_disable();
- prepare_exit_to_usermode(regs);
+ __prepare_exit_to_usermode(regs);
+}
+
+/*
+ * Called with IRQs on and fully valid regs. Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible noinstr void syscall_return_slowpath(struct pt_regs *regs)
+{
+ instrumentation_begin();
+ __syscall_return_slowpath(regs);
+ instrumentation_end();
+ exit_to_user_mode();
}
#ifdef CONFIG_X86_64
-__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
+__visible noinstr void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
struct thread_info *ti;
enter_from_user_mode();
+ instrumentation_begin();
+
local_irq_enable();
ti = current_thread_info();
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
@@ -301,8 +365,10 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
regs->ax = x32_sys_call_table[nr](regs);
#endif
}
+ __syscall_return_slowpath(regs);
- syscall_return_slowpath(regs);
+ instrumentation_end();
+ exit_to_user_mode();
}
#endif
@@ -313,7 +379,7 @@ __visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
* extremely hot in workloads that use it, and it's usually called from
* do_fast_syscall_32, so forcibly inline it to improve performance.
*/
-static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
+static void do_syscall_32_irqs_on(struct pt_regs *regs)
{
struct thread_info *ti = current_thread_info();
unsigned int nr = (unsigned int)regs->orig_ax;
@@ -337,27 +403,62 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
regs->ax = ia32_sys_call_table[nr](regs);
}
- syscall_return_slowpath(regs);
+ __syscall_return_slowpath(regs);
}
/* Handles int $0x80 */
-__visible void do_int80_syscall_32(struct pt_regs *regs)
+__visible noinstr void do_int80_syscall_32(struct pt_regs *regs)
{
enter_from_user_mode();
+ instrumentation_begin();
+
local_irq_enable();
do_syscall_32_irqs_on(regs);
+
+ instrumentation_end();
+ exit_to_user_mode();
+}
+
+static bool __do_fast_syscall_32(struct pt_regs *regs)
+{
+ int res;
+
+ /* Fetch EBP from where the vDSO stashed it. */
+ if (IS_ENABLED(CONFIG_X86_64)) {
+ /*
+ * Micro-optimization: the pointer we're following is
+ * explicitly 32 bits, so it can't be out of range.
+ */
+ res = __get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ } else {
+ res = get_user(*(u32 *)&regs->bp,
+ (u32 __user __force *)(unsigned long)(u32)regs->sp);
+ }
+
+ if (res) {
+ /* User code screwed up. */
+ regs->ax = -EFAULT;
+ local_irq_disable();
+ __prepare_exit_to_usermode(regs);
+ return false;
+ }
+
+ /* Now this is just like a normal syscall. */
+ do_syscall_32_irqs_on(regs);
+ return true;
}
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
-__visible long do_fast_syscall_32(struct pt_regs *regs)
+__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
{
/*
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
* convention. Adjust regs so it looks like we entered using int80.
*/
-
unsigned long landing_pad = (unsigned long)current->mm->context.vdso +
- vdso_image_32.sym_int80_landing_pad;
+ vdso_image_32.sym_int80_landing_pad;
+ bool success;
/*
* SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward
@@ -367,33 +468,17 @@ __visible long do_fast_syscall_32(struct pt_regs *regs)
regs->ip = landing_pad;
enter_from_user_mode();
+ instrumentation_begin();
local_irq_enable();
+ success = __do_fast_syscall_32(regs);
- /* Fetch EBP from where the vDSO stashed it. */
- if (
-#ifdef CONFIG_X86_64
- /*
- * Micro-optimization: the pointer we're following is explicitly
- * 32 bits, so it can't be out of range.
- */
- __get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
-#else
- get_user(*(u32 *)&regs->bp,
- (u32 __user __force *)(unsigned long)(u32)regs->sp)
-#endif
- ) {
-
- /* User code screwed up. */
- local_irq_disable();
- regs->ax = -EFAULT;
- prepare_exit_to_usermode(regs);
- return 0; /* Keep it simple: use IRET. */
- }
+ instrumentation_end();
+ exit_to_user_mode();
- /* Now this is just like a normal syscall. */
- do_syscall_32_irqs_on(regs);
+ /* If it failed, keep it simple: use IRET. */
+ if (!success)
+ return 0;
#ifdef CONFIG_X86_64
/*
@@ -431,3 +516,266 @@ SYSCALL_DEFINE0(ni_syscall)
{
return -ENOSYS;
}
+
+/**
+ * idtentry_enter_cond_rcu - Handle state tracking on idtentry with conditional
+ * RCU handling
+ * @regs: Pointer to pt_regs of interrupted context
+ *
+ * Invokes:
+ * - lockdep irqflag state tracking as low level ASM entry disabled
+ * interrupts.
+ *
+ * - Context tracking if the exception hit user mode.
+ *
+ * - The hardirq tracer to keep the state consistent as low level ASM
+ * entry disabled interrupts.
+ *
+ * For kernel mode entries RCU handling is done conditional. If RCU is
+ * watching then the only RCU requirement is to check whether the tick has
+ * to be restarted. If RCU is not watching then rcu_irq_enter() has to be
+ * invoked on entry and rcu_irq_exit() on exit.
+ *
+ * Avoiding the rcu_irq_enter/exit() calls is an optimization but also
+ * solves the problem of kernel mode pagefaults which can schedule, which
+ * is not possible after invoking rcu_irq_enter() without undoing it.
+ *
+ * For user mode entries enter_from_user_mode() must be invoked to
+ * establish the proper context for NOHZ_FULL. Otherwise scheduling on exit
+ * would not be possible.
+ *
+ * Returns: True if RCU has been adjusted on a kernel entry
+ * False otherwise
+ *
+ * The return value must be fed into the rcu_exit argument of
+ * idtentry_exit_cond_rcu().
+ */
+bool noinstr idtentry_enter_cond_rcu(struct pt_regs *regs)
+{
+ if (user_mode(regs)) {
+ enter_from_user_mode();
+ return false;
+ }
+
+ /*
+ * If this entry hit the idle task invoke rcu_irq_enter() whether
+ * RCU is watching or not.
+ *
+ * Interrupts can nest when the first interrupt invokes softirq
+ * processing on return which enables interrupts.
+ *
+ * Scheduler ticks in the idle task can mark quiescent state and
+ * terminate a grace period, if and only if the timer interrupt is
+ * not nested into another interrupt.
+ *
+ * Checking for __rcu_is_watching() here would prevent the nesting
+ * interrupt to invoke rcu_irq_enter(). If that nested interrupt is
+ * the tick then rcu_flavor_sched_clock_irq() would wrongfully
+ * assume that it is the first interrupt and eventually claim
+ * quiescent state and end grace periods prematurely.
+ *
+ * Unconditionally invoke rcu_irq_enter() so RCU state stays
+ * consistent.
+ *
+ * TINY_RCU does not support EQS, so let the compiler eliminate
+ * this part when enabled.
+ */
+ if (!IS_ENABLED(CONFIG_TINY_RCU) && is_idle_task(current)) {
+ /*
+ * If RCU is not watching then the same careful
+ * sequence vs. lockdep and tracing is required
+ * as in enter_from_user_mode().
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ rcu_irq_enter();
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+
+ return true;
+ }
+
+ /*
+ * If RCU is watching then RCU only wants to check whether it needs
+ * to restart the tick in NOHZ mode. rcu_irq_enter_check_tick()
+ * already contains a warning when RCU is not watching, so no point
+ * in having another one here.
+ */
+ instrumentation_begin();
+ rcu_irq_enter_check_tick();
+ /* Use the combo lockdep/tracing function */
+ trace_hardirqs_off();
+ instrumentation_end();
+
+ return false;
+}
+
+static void idtentry_exit_cond_resched(struct pt_regs *regs, bool may_sched)
+{
+ if (may_sched && !preempt_count()) {
+ /* Sanity check RCU and thread stack */
+ rcu_irq_exit_check_preempt();
+ if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
+ WARN_ON_ONCE(!on_thread_stack());
+ if (need_resched())
+ preempt_schedule_irq();
+ }
+ /* Covers both tracing and lockdep */
+ trace_hardirqs_on();
+}
+
+/**
+ * idtentry_exit_cond_rcu - Handle return from exception with conditional RCU
+ * handling
+ * @regs: Pointer to pt_regs (exception entry regs)
+ * @rcu_exit: Invoke rcu_irq_exit() if true
+ *
+ * Depending on the return target (kernel/user) this runs the necessary
+ * preemption and work checks if possible and required and returns to
+ * the caller with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to idtentry_enter_cond_rcu(). The return value of the entry
+ * function must be fed into the @rcu_exit argument.
+ */
+void noinstr idtentry_exit_cond_rcu(struct pt_regs *regs, bool rcu_exit)
+{
+ lockdep_assert_irqs_disabled();
+
+ /* Check whether this returns to user mode */
+ if (user_mode(regs)) {
+ prepare_exit_to_usermode(regs);
+ } else if (regs->flags & X86_EFLAGS_IF) {
+ /*
+ * If RCU was not watching on entry this needs to be done
+ * carefully and needs the same ordering of lockdep/tracing
+ * and RCU as the return to user mode path.
+ */
+ if (rcu_exit) {
+ instrumentation_begin();
+ /* Tell the tracer that IRET will enable interrupts */
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+ rcu_irq_exit();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+ return;
+ }
+
+ instrumentation_begin();
+ idtentry_exit_cond_resched(regs, IS_ENABLED(CONFIG_PREEMPTION));
+ instrumentation_end();
+ } else {
+ /*
+ * IRQ flags state is correct already. Just tell RCU if it
+ * was not watching on entry.
+ */
+ if (rcu_exit)
+ rcu_irq_exit();
+ }
+}
+
+/**
+ * idtentry_enter_user - Handle state tracking on idtentry from user mode
+ * @regs: Pointer to pt_regs of interrupted context
+ *
+ * Invokes enter_from_user_mode() to establish the proper context for
+ * NOHZ_FULL. Otherwise scheduling on exit would not be possible.
+ */
+void noinstr idtentry_enter_user(struct pt_regs *regs)
+{
+ enter_from_user_mode();
+}
+
+/**
+ * idtentry_exit_user - Handle return from exception to user mode
+ * @regs: Pointer to pt_regs (exception entry regs)
+ *
+ * Runs the necessary preemption and work checks and returns to the caller
+ * with interrupts disabled and no further work pending.
+ *
+ * This is the last action before returning to the low level ASM code which
+ * just needs to return to the appropriate context.
+ *
+ * Counterpart to idtentry_enter_user().
+ */
+void noinstr idtentry_exit_user(struct pt_regs *regs)
+{
+ lockdep_assert_irqs_disabled();
+
+ prepare_exit_to_usermode(regs);
+}
+
+#ifdef CONFIG_XEN_PV
+#ifndef CONFIG_PREEMPTION
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+/*
+ * In case of scheduling the flag must be cleared and restored after
+ * returning from schedule as the task might move to a different CPU.
+ */
+static __always_inline bool get_and_clear_inhcall(void)
+{
+ bool inhcall = __this_cpu_read(xen_in_preemptible_hcall);
+
+ __this_cpu_write(xen_in_preemptible_hcall, false);
+ return inhcall;
+}
+
+static __always_inline void restore_inhcall(bool inhcall)
+{
+ __this_cpu_write(xen_in_preemptible_hcall, inhcall);
+}
+#else
+static __always_inline bool get_and_clear_inhcall(void) { return false; }
+static __always_inline void restore_inhcall(bool inhcall) { }
+#endif
+
+static void __xen_pv_evtchn_do_upcall(void)
+{
+ irq_enter_rcu();
+ inc_irq_stat(irq_hv_callback_count);
+
+ xen_hvm_evtchn_do_upcall();
+
+ irq_exit_rcu();
+}
+
+__visible noinstr void xen_pv_evtchn_do_upcall(struct pt_regs *regs)
+{
+ struct pt_regs *old_regs;
+ bool inhcall, rcu_exit;
+
+ rcu_exit = idtentry_enter_cond_rcu(regs);
+ old_regs = set_irq_regs(regs);
+
+ instrumentation_begin();
+ run_on_irqstack_cond(__xen_pv_evtchn_do_upcall, NULL, regs);
+ instrumentation_end(); /* closes the instrumentation_begin() opened before the upcall */
+
+ set_irq_regs(old_regs);
+
+ inhcall = get_and_clear_inhcall();
+ if (inhcall && !WARN_ON_ONCE(rcu_exit)) {
+ instrumentation_begin();
+ idtentry_exit_cond_resched(regs, true);
+ instrumentation_end();
+ restore_inhcall(inhcall);
+ } else {
+ idtentry_exit_cond_rcu(regs, rcu_exit);
+ }
+}
+#endif /* CONFIG_XEN_PV */
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a5eed844e948..024d7d276cd4 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -44,40 +44,13 @@
#include <asm/asm.h>
#include <asm/smap.h>
#include <asm/frame.h>
+#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include "calling.h"
.section .entry.text, "ax"
-/*
- * We use macros for low-level operations which need to be overridden
- * for paravirtualization. The following will never clobber any registers:
- * INTERRUPT_RETURN (aka. "iret")
- * GET_CR0_INTO_EAX (aka. "movl %cr0, %eax")
- * ENABLE_INTERRUPTS_SYSEXIT (aka "sti; sysexit").
- *
- * For DISABLE_INTERRUPTS/ENABLE_INTERRUPTS (aka "cli"/"sti"), you must
- * specify what registers can be overwritten (CLBR_NONE, CLBR_EAX/EDX/ECX/ANY).
- * Allowing a register to be clobbered can shrink the paravirt replacement
- * enough to patch inline, increasing performance.
- */
-
-#ifdef CONFIG_PREEMPTION
-# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
-#else
-# define preempt_stop(clobbers)
-#endif
-
-.macro TRACE_IRQS_IRET
-#ifdef CONFIG_TRACE_IRQFLAGS
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off?
- jz 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
#define PTI_SWITCH_MASK (1 << PAGE_SHIFT)
/*
@@ -726,10 +699,68 @@
.Lend_\@:
.endm
+
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ ASM_CLAC
+ cld
+
+ .if \has_error_code == 0
+ pushl $0 /* Clear the error code */
+ .endif
+
+ /* Push the C-function address into the GS slot */
+ pushl $\cfunc
+ /* Invoke the common exception entry */
+ jmp handle_exception
+SYM_CODE_END(\asmsym)
+.endm
+
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+SYM_CODE_START_LOCAL(asm_\cfunc)
+ ASM_CLAC
+ SAVE_ALL switch_stacks=1
+ ENCODE_FRAME_POINTER
+ movl %esp, %eax
+ movl PT_ORIG_EAX(%esp), %edx /* get the vector from stack */
+ movl $-1, PT_ORIG_EAX(%esp) /* no syscall to restart */
+ call \cfunc
+ jmp handle_exception_return
+SYM_CODE_END(asm_\cfunc)
+.endm
+
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/*
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
+ */
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
+
+#include <asm/idtentry.h>
+
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+
/*
* %eax: prev task
* %edx: next task
*/
+.pushsection .text, "ax"
SYM_CODE_START(__switch_to_asm)
/*
* Save callee-saved registers
@@ -776,6 +807,7 @@ SYM_CODE_START(__switch_to_asm)
jmp __switch_to
SYM_CODE_END(__switch_to_asm)
+.popsection
/*
* The unwinder expects the last frame on the stack to always be at the same
@@ -784,6 +816,7 @@ SYM_CODE_END(__switch_to_asm)
* asmlinkage function so its argument has to be pushed on the stack. This
* wrapper creates a proper "end of stack" frame header before the call.
*/
+.pushsection .text, "ax"
SYM_FUNC_START(schedule_tail_wrapper)
FRAME_BEGIN
@@ -794,6 +827,8 @@ SYM_FUNC_START(schedule_tail_wrapper)
FRAME_END
ret
SYM_FUNC_END(schedule_tail_wrapper)
+.popsection
+
/*
* A newly forked process directly context switches into this address.
*
@@ -801,6 +836,7 @@ SYM_FUNC_END(schedule_tail_wrapper)
* ebx: kernel thread func (NULL for user thread)
* edi: kernel thread arg
*/
+.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
call schedule_tail_wrapper
@@ -811,8 +847,7 @@ SYM_CODE_START(ret_from_fork)
/* When we fork, we trace the syscall return in the child, too. */
movl %esp, %eax
call syscall_return_slowpath
- STACKLEAK_ERASE
- jmp restore_all
+ jmp .Lsyscall_32_done
/* kernel thread */
1: movl %edi, %eax
@@ -825,38 +860,7 @@ SYM_CODE_START(ret_from_fork)
movl $0, PT_EAX(%esp)
jmp 2b
SYM_CODE_END(ret_from_fork)
-
-/*
- * Return to user mode is not as complex as all this looks,
- * but we want the default path for a system call return to
- * go as quickly as possible which is why some of this is
- * less clear than it otherwise should be.
- */
-
- # userspace resumption stub bypassing syscall exit tracing
-SYM_CODE_START_LOCAL(ret_from_exception)
- preempt_stop(CLBR_ANY)
-ret_from_intr:
-#ifdef CONFIG_VM86
- movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
- movb PT_CS(%esp), %al
- andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
-#else
- /*
- * We can be coming here from child spawned by kernel_thread().
- */
- movl PT_CS(%esp), %eax
- andl $SEGMENT_RPL_MASK, %eax
-#endif
- cmpl $USER_RPL, %eax
- jb restore_all_kernel # not returning to v8086 or userspace
-
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
- movl %esp, %eax
- call prepare_exit_to_usermode
- jmp restore_all
-SYM_CODE_END(ret_from_exception)
+.popsection
SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE)
/*
@@ -960,12 +964,6 @@ SYM_FUNC_START(entry_SYSENTER_32)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movl %esp, %eax
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
@@ -974,8 +972,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
STACKLEAK_ERASE
-/* Opportunistic SYSEXIT */
- TRACE_IRQS_ON /* User mode traces as IRQs on. */
+ /* Opportunistic SYSEXIT */
/*
* Setup entry stack - we keep the pointer in %eax and do the
@@ -1075,20 +1072,12 @@ SYM_FUNC_START(entry_INT80_32)
SAVE_ALL pt_regs_ax=$-ENOSYS switch_stacks=1 /* save rest */
- /*
- * User mode is traced as though IRQs are on, and the interrupt gate
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movl %esp, %eax
call do_int80_syscall_32
.Lsyscall_32_done:
-
STACKLEAK_ERASE
-restore_all:
- TRACE_IRQS_ON
+restore_all_switch_stack:
SWITCH_TO_ENTRY_STACK
CHECK_AND_APPLY_ESPFIX
@@ -1107,26 +1096,10 @@ restore_all:
*/
INTERRUPT_RETURN
-restore_all_kernel:
-#ifdef CONFIG_PREEMPTION
- DISABLE_INTERRUPTS(CLBR_ANY)
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz .Lno_preempt
- testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ?
- jz .Lno_preempt
- call preempt_schedule_irq
-.Lno_preempt:
-#endif
- TRACE_IRQS_IRET
- PARANOID_EXIT_TO_KERNEL_MODE
- BUG_IF_WRONG_CR3
- RESTORE_REGS 4
- jmp .Lirq_return
-
.section .fixup, "ax"
-SYM_CODE_START(iret_exc)
+SYM_CODE_START(asm_iret_error)
pushl $0 # no error code
- pushl $do_iret_error
+ pushl $iret_error
#ifdef CONFIG_DEBUG_ENTRY
/*
@@ -1140,10 +1113,10 @@ SYM_CODE_START(iret_exc)
popl %eax
#endif
- jmp common_exception
-SYM_CODE_END(iret_exc)
+ jmp handle_exception
+SYM_CODE_END(asm_iret_error)
.previous
- _ASM_EXTABLE(.Lirq_return, iret_exc)
+ _ASM_EXTABLE(.Lirq_return, asm_iret_error)
SYM_FUNC_END(entry_INT80_32)
.macro FIXUP_ESPFIX_STACK
@@ -1193,192 +1166,21 @@ SYM_FUNC_END(entry_INT80_32)
#endif
.endm
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_interrupt
- .align 8
- .endr
-SYM_CODE_END(irq_entries_start)
-
-#ifdef CONFIG_X86_LOCAL_APIC
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- pushl $(~vector+0x80) /* Note: always in signed byte range */
- vector=vector+1
- jmp common_spurious
- .align 8
- .endr
-SYM_CODE_END(spurious_entries_start)
-
-SYM_CODE_START_LOCAL(common_spurious)
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
- SAVE_ALL switch_stacks=1
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call smp_spurious_interrupt
- jmp ret_from_intr
-SYM_CODE_END(common_spurious)
-#endif
-
-/*
- * the CPU automatically disables interrupts when executing an IRQ vector,
- * so IRQ-flags tracing has to follow that:
- */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-SYM_CODE_START_LOCAL(common_interrupt)
- ASM_CLAC
- addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */
-
- SAVE_ALL switch_stacks=1
- ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
- movl %esp, %eax
- call do_IRQ
- jmp ret_from_intr
-SYM_CODE_END(common_interrupt)
-
-#define BUILD_INTERRUPT3(name, nr, fn) \
-SYM_FUNC_START(name) \
- ASM_CLAC; \
- pushl $~(nr); \
- SAVE_ALL switch_stacks=1; \
- ENCODE_FRAME_POINTER; \
- TRACE_IRQS_OFF \
- movl %esp, %eax; \
- call fn; \
- jmp ret_from_intr; \
-SYM_FUNC_END(name)
-
-#define BUILD_INTERRUPT(name, nr) \
- BUILD_INTERRUPT3(name, nr, smp_##name); \
-
-/* The include is where all of the SMP etc. interrupts come from */
-#include <asm/entry_arch.h>
-
-SYM_CODE_START(coprocessor_error)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_error
- jmp common_exception
-SYM_CODE_END(coprocessor_error)
-
-SYM_CODE_START(simd_coprocessor_error)
- ASM_CLAC
- pushl $0
-#ifdef CONFIG_X86_INVD_BUG
- /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
- ALTERNATIVE "pushl $do_general_protection", \
- "pushl $do_simd_coprocessor_error", \
- X86_FEATURE_XMM
-#else
- pushl $do_simd_coprocessor_error
-#endif
- jmp common_exception
-SYM_CODE_END(simd_coprocessor_error)
-
-SYM_CODE_START(device_not_available)
- ASM_CLAC
- pushl $0
- pushl $do_device_not_available
- jmp common_exception
-SYM_CODE_END(device_not_available)
-
#ifdef CONFIG_PARAVIRT
SYM_CODE_START(native_iret)
iret
- _ASM_EXTABLE(native_iret, iret_exc)
+ _ASM_EXTABLE(native_iret, asm_iret_error)
SYM_CODE_END(native_iret)
#endif
-SYM_CODE_START(overflow)
- ASM_CLAC
- pushl $0
- pushl $do_overflow
- jmp common_exception
-SYM_CODE_END(overflow)
-
-SYM_CODE_START(bounds)
- ASM_CLAC
- pushl $0
- pushl $do_bounds
- jmp common_exception
-SYM_CODE_END(bounds)
-
-SYM_CODE_START(invalid_op)
- ASM_CLAC
- pushl $0
- pushl $do_invalid_op
- jmp common_exception
-SYM_CODE_END(invalid_op)
-
-SYM_CODE_START(coprocessor_segment_overrun)
- ASM_CLAC
- pushl $0
- pushl $do_coprocessor_segment_overrun
- jmp common_exception
-SYM_CODE_END(coprocessor_segment_overrun)
-
-SYM_CODE_START(invalid_TSS)
- ASM_CLAC
- pushl $do_invalid_TSS
- jmp common_exception
-SYM_CODE_END(invalid_TSS)
-
-SYM_CODE_START(segment_not_present)
- ASM_CLAC
- pushl $do_segment_not_present
- jmp common_exception
-SYM_CODE_END(segment_not_present)
-
-SYM_CODE_START(stack_segment)
- ASM_CLAC
- pushl $do_stack_segment
- jmp common_exception
-SYM_CODE_END(stack_segment)
-
-SYM_CODE_START(alignment_check)
- ASM_CLAC
- pushl $do_alignment_check
- jmp common_exception
-SYM_CODE_END(alignment_check)
-
-SYM_CODE_START(divide_error)
- ASM_CLAC
- pushl $0 # no error code
- pushl $do_divide_error
- jmp common_exception
-SYM_CODE_END(divide_error)
-
-#ifdef CONFIG_X86_MCE
-SYM_CODE_START(machine_check)
- ASM_CLAC
- pushl $0
- pushl $do_mce
- jmp common_exception
-SYM_CODE_END(machine_check)
-#endif
-
-SYM_CODE_START(spurious_interrupt_bug)
- ASM_CLAC
- pushl $0
- pushl $do_spurious_interrupt_bug
- jmp common_exception
-SYM_CODE_END(spurious_interrupt_bug)
-
#ifdef CONFIG_XEN_PV
-SYM_FUNC_START(xen_hypervisor_callback)
+/*
+ * See comment in entry_64.S for further explanation
+ *
+ * Note: This is not an actual IDT entry point. It's a XEN specific entry
+ * point and therefore named to match the 64-bit trampoline counterpart.
+ */
+SYM_FUNC_START(xen_asm_exc_xen_hypervisor_callback)
/*
* Check to see if we got the event in the critical
* region in xen_iret_direct, after we've reenabled
@@ -1395,14 +1197,11 @@ SYM_FUNC_START(xen_hypervisor_callback)
pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
- TRACE_IRQS_OFF
+
mov %esp, %eax
- call xen_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp ret_from_intr
-SYM_FUNC_END(xen_hypervisor_callback)
+ call xen_pv_evtchn_do_upcall
+ jmp handle_exception_return
+SYM_FUNC_END(xen_asm_exc_xen_hypervisor_callback)
/*
* Hypervisor uses this for application faults while it executes.
@@ -1429,11 +1228,11 @@ SYM_FUNC_START(xen_failsafe_callback)
popl %eax
lea 16(%esp), %esp
jz 5f
- jmp iret_exc
+ jmp asm_iret_error
5: pushl $-1 /* orig_ax = -1 => not a system call */
SAVE_ALL
ENCODE_FRAME_POINTER
- jmp ret_from_exception
+ jmp handle_exception_return
.section .fixup, "ax"
6: xorl %eax, %eax
@@ -1456,56 +1255,7 @@ SYM_FUNC_START(xen_failsafe_callback)
SYM_FUNC_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */
-#ifdef CONFIG_XEN_PVHVM
-BUILD_INTERRUPT3(xen_hvm_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- xen_evtchn_do_upcall)
-#endif
-
-
-#if IS_ENABLED(CONFIG_HYPERV)
-
-BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR,
- hyperv_vector_handler)
-
-BUILD_INTERRUPT3(hyperv_reenlightenment_vector, HYPERV_REENLIGHTENMENT_VECTOR,
- hyperv_reenlightenment_intr)
-
-BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR,
- hv_stimer0_vector_handler)
-
-#endif /* CONFIG_HYPERV */
-
-SYM_CODE_START(page_fault)
- ASM_CLAC
- pushl $do_page_fault
- jmp common_exception_read_cr2
-SYM_CODE_END(page_fault)
-
-SYM_CODE_START_LOCAL_NOALIGN(common_exception_read_cr2)
- /* the function address is in %gs's slot on the stack */
- SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
-
- ENCODE_FRAME_POINTER
-
- /* fixup %gs */
- GS_TO_REG %ecx
- movl PT_GS(%esp), %edi
- REG_TO_PTGS %ecx
- SET_KERNEL_GS %ecx
-
- GET_CR2_INTO(%ecx) # might clobber %eax
-
- /* fixup orig %eax */
- movl PT_ORIG_EAX(%esp), %edx # get the error code
- movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
-
- TRACE_IRQS_OFF
- movl %esp, %eax # pt_regs pointer
- CALL_NOSPEC edi
- jmp ret_from_exception
-SYM_CODE_END(common_exception_read_cr2)
-
-SYM_CODE_START_LOCAL_NOALIGN(common_exception)
+SYM_CODE_START_LOCAL_NOALIGN(handle_exception)
/* the function address is in %gs's slot on the stack */
SAVE_ALL switch_stacks=1 skip_gs=1 unwind_espfix=1
ENCODE_FRAME_POINTER
@@ -1520,23 +1270,35 @@ SYM_CODE_START_LOCAL_NOALIGN(common_exception)
movl PT_ORIG_EAX(%esp), %edx # get the error code
movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
- TRACE_IRQS_OFF
movl %esp, %eax # pt_regs pointer
CALL_NOSPEC edi
- jmp ret_from_exception
-SYM_CODE_END(common_exception)
-SYM_CODE_START(debug)
+handle_exception_return:
+#ifdef CONFIG_VM86
+ movl PT_EFLAGS(%esp), %eax # mix EFLAGS and CS
+ movb PT_CS(%esp), %al
+ andl $(X86_EFLAGS_VM | SEGMENT_RPL_MASK), %eax
+#else
/*
- * Entry from sysenter is now handled in common_exception
+ * We can be coming here from a child spawned by kernel_thread().
*/
- ASM_CLAC
- pushl $0
- pushl $do_debug
- jmp common_exception
-SYM_CODE_END(debug)
+ movl PT_CS(%esp), %eax
+ andl $SEGMENT_RPL_MASK, %eax
+#endif
+ cmpl $USER_RPL, %eax # returning to v8086 or userspace ?
+ jnb ret_to_user
-SYM_CODE_START(double_fault)
+ PARANOID_EXIT_TO_KERNEL_MODE
+ BUG_IF_WRONG_CR3
+ RESTORE_REGS 4
+ jmp .Lirq_return
+
+ret_to_user:
+ movl %esp, %eax
+ jmp restore_all_switch_stack
+SYM_CODE_END(handle_exception)
+
+SYM_CODE_START(asm_exc_double_fault)
1:
/*
* This is a task gate handler, not an interrupt gate handler.
@@ -1574,7 +1336,7 @@ SYM_CODE_START(double_fault)
1:
hlt
jmp 1b
-SYM_CODE_END(double_fault)
+SYM_CODE_END(asm_exc_double_fault)
/*
* NMI is doubly nasty. It can happen on the first instruction of
@@ -1583,7 +1345,7 @@ SYM_CODE_END(double_fault)
* switched stacks. We handle both conditions by simply checking whether we
* interrupted kernel code running on the SYSENTER stack.
*/
-SYM_CODE_START(nmi)
+SYM_CODE_START(asm_exc_nmi)
ASM_CLAC
#ifdef CONFIG_X86_ESPFIX32
@@ -1612,7 +1374,7 @@ SYM_CODE_START(nmi)
jb .Lnmi_from_sysenter_stack
/* Not on SYSENTER stack. */
- call do_nmi
+ call exc_nmi
jmp .Lnmi_return
.Lnmi_from_sysenter_stack:
@@ -1622,7 +1384,7 @@ SYM_CODE_START(nmi)
*/
movl %esp, %ebx
movl PER_CPU_VAR(cpu_current_top_of_stack), %esp
- call do_nmi
+ call exc_nmi
movl %ebx, %esp
.Lnmi_return:
@@ -1676,21 +1438,9 @@ SYM_CODE_START(nmi)
lss (1+5+6)*4(%esp), %esp # back to espfix stack
jmp .Lirq_return
#endif
-SYM_CODE_END(nmi)
-
-SYM_CODE_START(int3)
- ASM_CLAC
- pushl $0
- pushl $do_int3
- jmp common_exception
-SYM_CODE_END(int3)
-
-SYM_CODE_START(general_protection)
- ASM_CLAC
- pushl $do_general_protection
- jmp common_exception
-SYM_CODE_END(general_protection)
+SYM_CODE_END(asm_exc_nmi)
+.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
/* Prevent any naive code from trying to unwind to our caller. */
xorl %ebp, %ebp
@@ -1701,3 +1451,4 @@ SYM_CODE_START(rewind_stack_do_exit)
call do_exit
1: jmp 1b
SYM_CODE_END(rewind_stack_do_exit)
+.popsection
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index eead1e2bebd5..d2a00c97e53f 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -16,7 +16,6 @@
*
* Some macro usage:
* - SYM_FUNC_START/END:Define functions in the symbol table.
- * - TRACE_IRQ_*: Trace hardirq state for lock debugging.
* - idtentry: Define exception entry points.
*/
#include <linux/linkage.h>
@@ -37,6 +36,7 @@
#include <asm/pgtable_types.h>
#include <asm/export.h>
#include <asm/frame.h>
+#include <asm/trapnr.h>
#include <asm/nospec-branch.h>
#include <linux/err.h>
@@ -53,57 +53,6 @@ SYM_CODE_START(native_usergs_sysret64)
SYM_CODE_END(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_FLAGS flags:req
-#ifdef CONFIG_TRACE_IRQFLAGS
- btl $9, \flags /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON
-1:
-#endif
-.endm
-
-.macro TRACE_IRQS_IRETQ
- TRACE_IRQS_FLAGS EFLAGS(%rsp)
-.endm
-
-/*
- * When dynamic function tracer is enabled it will add a breakpoint
- * to all locations that it is about to modify, sync CPUs, update
- * all the code, sync CPUs, then remove the breakpoints. In this time
- * if lockdep is enabled, it might jump back into the debug handler
- * outside the updating of the IST protection. (TRACE_IRQS_ON/OFF).
- *
- * We need to change the IDT table before calling TRACE_IRQS_ON/OFF to
- * make sure the stack pointer does not get reset back to the top
- * of the debug stack, and instead just reuses the current stack.
- */
-#if defined(CONFIG_DYNAMIC_FTRACE) && defined(CONFIG_TRACE_IRQFLAGS)
-
-.macro TRACE_IRQS_OFF_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_OFF
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_ON_DEBUG
- call debug_stack_set_zero
- TRACE_IRQS_ON
- call debug_stack_reset
-.endm
-
-.macro TRACE_IRQS_IRETQ_DEBUG
- btl $9, EFLAGS(%rsp) /* interrupts off? */
- jnc 1f
- TRACE_IRQS_ON_DEBUG
-1:
-.endm
-
-#else
-# define TRACE_IRQS_OFF_DEBUG TRACE_IRQS_OFF
-# define TRACE_IRQS_ON_DEBUG TRACE_IRQS_ON
-# define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ
-#endif
-
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
@@ -144,11 +93,6 @@ SYM_CODE_END(native_usergs_sysret64)
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
- /*
- * Interrupts are off on entry.
- * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
- * it is too small to ever cause noticeable irq latency.
- */
swapgs
/* tss.sp2 is scratch space. */
@@ -167,15 +111,11 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
- TRACE_IRQS_OFF
-
/* IRQs are off. */
movq %rax, %rdi
movq %rsp, %rsi
call do_syscall_64 /* returns with IRQs disabled */
- TRACE_IRQS_ON /* return enables interrupts */
-
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
@@ -279,6 +219,7 @@ SYM_CODE_END(entry_SYSCALL_64)
* %rdi: prev task
* %rsi: next task
*/
+.pushsection .text, "ax"
SYM_FUNC_START(__switch_to_asm)
/*
* Save callee-saved registers
@@ -321,6 +262,7 @@ SYM_FUNC_START(__switch_to_asm)
jmp __switch_to
SYM_FUNC_END(__switch_to_asm)
+.popsection
/*
* A newly forked process directly context switches into this address.
@@ -329,6 +271,7 @@ SYM_FUNC_END(__switch_to_asm)
* rbx: kernel thread func (NULL for user thread)
* r12: kernel thread arg
*/
+.pushsection .text, "ax"
SYM_CODE_START(ret_from_fork)
UNWIND_HINT_EMPTY
movq %rax, %rdi
@@ -341,7 +284,6 @@ SYM_CODE_START(ret_from_fork)
UNWIND_HINT_REGS
movq %rsp, %rdi
call syscall_return_slowpath /* returns with IRQs disabled */
- TRACE_IRQS_ON /* user mode is traced as IRQS on */
jmp swapgs_restore_regs_and_return_to_usermode
1:
@@ -357,34 +299,7 @@ SYM_CODE_START(ret_from_fork)
movq $0, RAX(%rsp)
jmp 2b
SYM_CODE_END(ret_from_fork)
-
-/*
- * Build the entry stubs with some assembler magic.
- * We pack 1 stub into every 8-byte block.
- */
- .align 8
-SYM_CODE_START(irq_entries_start)
- vector=FIRST_EXTERNAL_VECTOR
- .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_interrupt
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(irq_entries_start)
-
- .align 8
-SYM_CODE_START(spurious_entries_start)
- vector=FIRST_SYSTEM_VECTOR
- .rept (NR_VECTORS - FIRST_SYSTEM_VECTOR)
- UNWIND_HINT_IRET_REGS
- pushq $(~vector+0x80) /* Note: always in signed byte range */
- jmp common_spurious
- .align 8
- vector=vector+1
- .endr
-SYM_CODE_END(spurious_entries_start)
+.popsection
.macro DEBUG_ENTRY_ASSERT_IRQS_OFF
#ifdef CONFIG_DEBUG_ENTRY
@@ -398,228 +313,185 @@ SYM_CODE_END(spurious_entries_start)
#endif
.endm
-/*
- * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers
- * flags and puts old RSP into old_rsp, and leaves all other GPRs alone.
- * Requires kernel GSBASE.
- *
- * The invariant is that, if irq_count != -1, then the IRQ stack is in use.
+/**
+ * idtentry_body - Macro to emit code calling the C function
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
*/
-.macro ENTER_IRQ_STACK regs=1 old_rsp save_ret=0
- DEBUG_ENTRY_ASSERT_IRQS_OFF
+.macro idtentry_body cfunc has_error_code:req
- .if \save_ret
- /*
- * If save_ret is set, the original stack contains one additional
- * entry -- the return address. Therefore, move the address one
- * entry below %rsp to \old_rsp.
- */
- leaq 8(%rsp), \old_rsp
- .else
- movq %rsp, \old_rsp
- .endif
-
- .if \regs
- UNWIND_HINT_REGS base=\old_rsp
- .endif
+ call error_entry
+ UNWIND_HINT_REGS
- incl PER_CPU_VAR(irq_count)
- jnz .Lirq_stack_push_old_rsp_\@
+ movq %rsp, %rdi /* pt_regs pointer into 1st argument*/
- /*
- * Right now, if we just incremented irq_count to zero, we've
- * claimed the IRQ stack but we haven't switched to it yet.
- *
- * If anything is added that can interrupt us here without using IST,
- * it must be *extremely* careful to limit its stack usage. This
- * could include kprobes and a hypothetical future IST-less #DB
- * handler.
- *
- * The OOPS unwinder relies on the word at the top of the IRQ
- * stack linking back to the previous RSP for the entire time we're
- * on the IRQ stack. For this to work reliably, we need to write
- * it before we actually move ourselves to the IRQ stack.
- */
+ .if \has_error_code == 1
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ .endif
- movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8)
- movq PER_CPU_VAR(hardirq_stack_ptr), %rsp
+ call \cfunc
-#ifdef CONFIG_DEBUG_ENTRY
- /*
- * If the first movq above becomes wrong due to IRQ stack layout
- * changes, the only way we'll notice is if we try to unwind right
- * here. Assert that we set up the stack right to catch this type
- * of bug quickly.
- */
- cmpq -8(%rsp), \old_rsp
- je .Lirq_stack_okay\@
- ud2
- .Lirq_stack_okay\@:
-#endif
+ jmp error_return
+.endm
-.Lirq_stack_push_old_rsp_\@:
- pushq \old_rsp
+/**
+ * idtentry - Macro to generate entry stubs for simple IDT entries
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ * @has_error_code: Hardware pushed error code on stack
+ *
+ * The macro emits code to set up the kernel context for straightforward
+ * and simple IDT entries. No IST stack, no paranoid entry checks.
+ */
+.macro idtentry vector asmsym cfunc has_error_code:req
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=\has_error_code*8
+ ASM_CLAC
- .if \regs
- UNWIND_HINT_REGS indirect=1
+ .if \has_error_code == 0
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
.endif
- .if \save_ret
- /*
- * Push the return address to the stack. This return address can
- * be found at the "real" original RSP, which was offset by 8 at
- * the beginning of this macro.
- */
- pushq -8(\old_rsp)
+ .if \vector == X86_TRAP_BP
+ /*
+ * If coming from kernel space, create a 6-word gap to allow the
+ * int3 handler to emulate a call instruction.
+ */
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_no_gap_\@
+ .rept 6
+ pushq 5*8(%rsp)
+ .endr
+ UNWIND_HINT_IRET_REGS offset=8
+.Lfrom_usermode_no_gap_\@:
.endif
+
+ idtentry_body \cfunc \has_error_code
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
.endm
/*
- * Undoes ENTER_IRQ_STACK.
+ * Interrupt entry/exit.
+ *
+ * The interrupt stubs push (vector) onto the stack, which is the error_code
+ * position of idtentry exceptions, and jump to one of the two idtentry points
+ * (common/spurious).
+ *
+ * common_interrupt is a hotpath, align it to a cache line
*/
-.macro LEAVE_IRQ_STACK regs=1
- DEBUG_ENTRY_ASSERT_IRQS_OFF
- /* We need to be off the IRQ stack before decrementing irq_count. */
- popq %rsp
-
- .if \regs
- UNWIND_HINT_REGS
- .endif
-
- /*
- * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming
- * the irq stack but we're not on it.
- */
-
- decl PER_CPU_VAR(irq_count)
+.macro idtentry_irq vector cfunc
+ .p2align CONFIG_X86_L1_CACHE_SHIFT
+ idtentry \vector asm_\cfunc \cfunc has_error_code=1
.endm
/*
- * Interrupt entry helper function.
+ * System vectors which invoke their handlers directly and are not
+ * going through the regular common device interrupt handling code.
+ */
+.macro idtentry_sysvec vector cfunc
+ idtentry \vector asm_\cfunc \cfunc has_error_code=0
+.endm
+
+/**
+ * idtentry_mce_db - Macro to generate entry stubs for #MC and #DB
+ * @vector: Vector number
+ * @asmsym: ASM symbol for the entry point
+ * @cfunc: C function to be called
+ *
+ * The macro emits code to set up the kernel context for #MC and #DB
+ *
+ * If the entry comes from user space it uses the normal entry path
+ * including the return to user space work and preemption checks on
+ * exit.
*
- * Entry runs with interrupts off. Stack layout at entry:
- * +----------------------------------------------------+
- * | regs->ss |
- * | regs->rsp |
- * | regs->eflags |
- * | regs->cs |
- * | regs->ip |
- * +----------------------------------------------------+
- * | regs->orig_ax = ~(interrupt number) |
- * +----------------------------------------------------+
- * | return address |
- * +----------------------------------------------------+
+ * If it hits in kernel mode then it needs to go through the paranoid
+ * entry as the exception can hit any random state. No preemption
+ * check on exit to keep the paranoid path simple.
*/
-SYM_CODE_START(interrupt_entry)
- UNWIND_HINT_IRET_REGS offset=16
+.macro idtentry_mce_db vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS
ASM_CLAC
- cld
- testb $3, CS-ORIG_RAX+8(%rsp)
- jz 1f
- SWAPGS
- FENCE_SWAPGS_USER_ENTRY
+ pushq $-1 /* ORIG_RAX: no syscall to restart */
+
/*
- * Switch to the thread stack. The IRET frame and orig_ax are
- * on the stack, as well as the return address. RDI..R12 are
- * not (yet) on the stack and space has not (yet) been
- * allocated for them.
+ * If the entry is from userspace, switch stacks and treat it as
+ * a normal entry.
*/
- pushq %rdi
+ testb $3, CS-ORIG_RAX(%rsp)
+ jnz .Lfrom_usermode_switch_stack_\@
- /* Need to switch before accessing the thread stack. */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
- movq %rsp, %rdi
- movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+ /*
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
+ */
+ call paranoid_entry
- /*
- * We have RDI, return address, and orig_ax on the stack on
- * top of the IRET frame. That means offset=24
- */
- UNWIND_HINT_IRET_REGS base=%rdi offset=24
-
- pushq 7*8(%rdi) /* regs->ss */
- pushq 6*8(%rdi) /* regs->rsp */
- pushq 5*8(%rdi) /* regs->eflags */
- pushq 4*8(%rdi) /* regs->cs */
- pushq 3*8(%rdi) /* regs->ip */
- UNWIND_HINT_IRET_REGS
- pushq 2*8(%rdi) /* regs->orig_ax */
- pushq 8(%rdi) /* return address */
+ UNWIND_HINT_REGS
- movq (%rdi), %rdi
- jmp 2f
-1:
- FENCE_SWAPGS_KERNEL_ENTRY
-2:
- PUSH_AND_CLEAR_REGS save_ret=1
- ENCODE_FRAME_POINTER 8
+ movq %rsp, %rdi /* pt_regs pointer */
- testb $3, CS+8(%rsp)
- jz 1f
+ call \cfunc
- /*
- * IRQ from user mode.
- *
- * We need to tell lockdep that IRQs are off. We can't do this until
- * we fix gsbase, and we should do it before enter_from_user_mode
- * (which can take locks). Since TRACE_IRQS_OFF is idempotent,
- * the simplest way to handle it is to just call it twice if
- * we enter from user mode. There's no reason to optimize this since
- * TRACE_IRQS_OFF is a no-op if lockdep is off.
- */
- TRACE_IRQS_OFF
+ jmp paranoid_exit
- CALL_enter_from_user_mode
+ /* Switch to the regular task stack and use the noist entry point */
+.Lfrom_usermode_switch_stack_\@:
+ idtentry_body noist_\cfunc, has_error_code=0
-1:
- ENTER_IRQ_STACK old_rsp=%rdi save_ret=1
- /* We entered an interrupt context - irqs are off: */
- TRACE_IRQS_OFF
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
- ret
-SYM_CODE_END(interrupt_entry)
-_ASM_NOKPROBE(interrupt_entry)
+/*
+ * Double fault entry. Straight paranoid. No checks from which context
+ * this comes because for the espfix induced #DF this would do the wrong
+ * thing.
+ */
+.macro idtentry_df vector asmsym cfunc
+SYM_CODE_START(\asmsym)
+ UNWIND_HINT_IRET_REGS offset=8
+ ASM_CLAC
+ /*
+ * paranoid_entry returns SWAPGS flag for paranoid_exit in EBX.
+ * EBX == 0 -> SWAPGS, EBX == 1 -> no SWAPGS
+ */
+ call paranoid_entry
+ UNWIND_HINT_REGS
-/* Interrupt entry/exit. */
+ movq %rsp, %rdi /* pt_regs pointer into first argument */
+ movq ORIG_RAX(%rsp), %rsi /* get error code into 2nd argument*/
+ movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
+ call \cfunc
+
+ jmp paranoid_exit
+
+_ASM_NOKPROBE(\asmsym)
+SYM_CODE_END(\asmsym)
+.endm
/*
- * The interrupt stubs push (~vector+0x80) onto the stack and
- * then jump to common_spurious/interrupt.
+ * Include the defines which emit the idt entries which are shared
+ * between 32 and 64 bit and emit the __irqentry_text_* markers
+ * so the stacktrace boundary checks work.
*/
-SYM_CODE_START_LOCAL(common_spurious)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call smp_spurious_interrupt /* rdi points to pt_regs */
- jmp ret_from_intr
-SYM_CODE_END(common_spurious)
-_ASM_NOKPROBE(common_spurious)
-
-/* common_interrupt is a hotpath. Align it */
- .p2align CONFIG_X86_L1_CACHE_SHIFT
-SYM_CODE_START_LOCAL(common_interrupt)
- addq $-0x80, (%rsp) /* Adjust vector to [-256, -1] range */
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call do_IRQ /* rdi points to pt_regs */
- /* 0(%rsp): old RSP */
-ret_from_intr:
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
-
- LEAVE_IRQ_STACK
+ .align 16
+ .globl __irqentry_text_start
+__irqentry_text_start:
- testb $3, CS(%rsp)
- jz retint_kernel
+#include <asm/idtentry.h>
- /* Interrupt came from user space */
-.Lretint_user:
- mov %rsp,%rdi
- call prepare_exit_to_usermode
- TRACE_IRQS_ON
+ .align 16
+ .globl __irqentry_text_end
+__irqentry_text_end:
+SYM_CODE_START_LOCAL(common_interrupt_return)
SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates user mode. */
@@ -662,23 +534,6 @@ SYM_INNER_LABEL(swapgs_restore_regs_and_return_to_usermode, SYM_L_GLOBAL)
INTERRUPT_RETURN
-/* Returning to kernel space */
-retint_kernel:
-#ifdef CONFIG_PREEMPTION
- /* Interrupts are off */
- /* Check if we need preemption */
- btl $9, EFLAGS(%rsp) /* were interrupts off? */
- jnc 1f
- cmpl $0, PER_CPU_VAR(__preempt_count)
- jnz 1f
- call preempt_schedule_irq
-1:
-#endif
- /*
- * The iretq could re-enable interrupts:
- */
- TRACE_IRQS_IRETQ
-
SYM_INNER_LABEL(restore_regs_and_return_to_kernel, SYM_L_GLOBAL)
#ifdef CONFIG_DEBUG_ENTRY
/* Assert that pt_regs indicates kernel mode. */
@@ -710,7 +565,7 @@ SYM_INNER_LABEL(native_irq_return_iret, SYM_L_GLOBAL)
/*
* This may fault. Non-paranoid faults on return to userspace are
* handled by fixup_bad_iret. These include #SS, #GP, and #NP.
- * Double-faults due to espfix64 are handled in do_double_fault.
+ * Double-faults due to espfix64 are handled in exc_double_fault.
* Other faults here are fatal.
*/
iretq
@@ -788,280 +643,32 @@ native_irq_return_ldt:
*/
jmp native_irq_return_iret
#endif
-SYM_CODE_END(common_interrupt)
-_ASM_NOKPROBE(common_interrupt)
-
-/*
- * APIC interrupts.
- */
-.macro apicinterrupt3 num sym do_sym
-SYM_CODE_START(\sym)
- UNWIND_HINT_IRET_REGS
- pushq $~(\num)
-.Lcommon_\sym:
- call interrupt_entry
- UNWIND_HINT_REGS indirect=1
- call \do_sym /* rdi points to pt_regs */
- jmp ret_from_intr
-SYM_CODE_END(\sym)
-_ASM_NOKPROBE(\sym)
-.endm
-
-/* Make sure APIC interrupt handlers end up in the irqentry section: */
-#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax"
-#define POP_SECTION_IRQENTRY .popsection
-
-.macro apicinterrupt num sym do_sym
-PUSH_SECTION_IRQENTRY
-apicinterrupt3 \num \sym \do_sym
-POP_SECTION_IRQENTRY
-.endm
-
-#ifdef CONFIG_SMP
-apicinterrupt3 IRQ_MOVE_CLEANUP_VECTOR irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
-apicinterrupt3 REBOOT_VECTOR reboot_interrupt smp_reboot_interrupt
-#endif
-
-#ifdef CONFIG_X86_UV
-apicinterrupt3 UV_BAU_MESSAGE uv_bau_message_intr1 uv_bau_message_interrupt
-#endif
-
-apicinterrupt LOCAL_TIMER_VECTOR apic_timer_interrupt smp_apic_timer_interrupt
-apicinterrupt X86_PLATFORM_IPI_VECTOR x86_platform_ipi smp_x86_platform_ipi
-
-#ifdef CONFIG_HAVE_KVM
-apicinterrupt3 POSTED_INTR_VECTOR kvm_posted_intr_ipi smp_kvm_posted_intr_ipi
-apicinterrupt3 POSTED_INTR_WAKEUP_VECTOR kvm_posted_intr_wakeup_ipi smp_kvm_posted_intr_wakeup_ipi
-apicinterrupt3 POSTED_INTR_NESTED_VECTOR kvm_posted_intr_nested_ipi smp_kvm_posted_intr_nested_ipi
-#endif
-
-#ifdef CONFIG_X86_MCE_THRESHOLD
-apicinterrupt THRESHOLD_APIC_VECTOR threshold_interrupt smp_threshold_interrupt
-#endif
-
-#ifdef CONFIG_X86_MCE_AMD
-apicinterrupt DEFERRED_ERROR_VECTOR deferred_error_interrupt smp_deferred_error_interrupt
-#endif
-
-#ifdef CONFIG_X86_THERMAL_VECTOR
-apicinterrupt THERMAL_APIC_VECTOR thermal_interrupt smp_thermal_interrupt
-#endif
-
-#ifdef CONFIG_SMP
-apicinterrupt CALL_FUNCTION_SINGLE_VECTOR call_function_single_interrupt smp_call_function_single_interrupt
-apicinterrupt CALL_FUNCTION_VECTOR call_function_interrupt smp_call_function_interrupt
-apicinterrupt RESCHEDULE_VECTOR reschedule_interrupt smp_reschedule_interrupt
-#endif
-
-apicinterrupt ERROR_APIC_VECTOR error_interrupt smp_error_interrupt
-apicinterrupt SPURIOUS_APIC_VECTOR spurious_interrupt smp_spurious_interrupt
-
-#ifdef CONFIG_IRQ_WORK
-apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt
-#endif
+SYM_CODE_END(common_interrupt_return)
+_ASM_NOKPROBE(common_interrupt_return)
/*
- * Exception entry points.
- */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
-
-.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0
-
- .if \paranoid
- call paranoid_entry
- /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
- .else
- call error_entry
- .endif
- UNWIND_HINT_REGS
-
- .if \read_cr2
- /*
- * Store CR2 early so subsequent faults cannot clobber it. Use R12 as
- * intermediate storage as RDX can be clobbered in enter_from_user_mode().
- * GET_CR2_INTO can clobber RAX.
- */
- GET_CR2_INTO(%r12);
- .endif
-
- .if \shift_ist != -1
- TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
- .else
- TRACE_IRQS_OFF
- .endif
-
- .if \paranoid == 0
- testb $3, CS(%rsp)
- jz .Lfrom_kernel_no_context_tracking_\@
- CALL_enter_from_user_mode
-.Lfrom_kernel_no_context_tracking_\@:
- .endif
-
- movq %rsp, %rdi /* pt_regs pointer */
-
- .if \has_error_code
- movq ORIG_RAX(%rsp), %rsi /* get error code */
- movq $-1, ORIG_RAX(%rsp) /* no syscall to restart */
- .else
- xorl %esi, %esi /* no error code */
- .endif
-
- .if \shift_ist != -1
- subq $\ist_offset, CPU_TSS_IST(\shift_ist)
- .endif
-
- .if \read_cr2
- movq %r12, %rdx /* Move CR2 into 3rd argument */
- .endif
-
- call \do_sym
-
- .if \shift_ist != -1
- addq $\ist_offset, CPU_TSS_IST(\shift_ist)
- .endif
-
- .if \paranoid
- /* this procedure expect "no swapgs" flag in ebx */
- jmp paranoid_exit
- .else
- jmp error_exit
- .endif
-
-.endm
-
-/**
- * idtentry - Generate an IDT entry stub
- * @sym: Name of the generated entry point
- * @do_sym: C function to be called
- * @has_error_code: True if this IDT vector has an error code on the stack
- * @paranoid: non-zero means that this vector may be invoked from
- * kernel mode with user GSBASE and/or user CR3.
- * 2 is special -- see below.
- * @shift_ist: Set to an IST index if entries from kernel mode should
- * decrement the IST stack so that nested entries get a
- * fresh stack. (This is for #DB, which has a nasty habit
- * of recursing.)
- * @create_gap: create a 6-word stack gap when coming from kernel mode.
- * @read_cr2: load CR2 into the 3rd argument; done before calling any C code
- *
- * idtentry generates an IDT stub that sets up a usable kernel context,
- * creates struct pt_regs, and calls @do_sym. The stub has the following
- * special behaviors:
- *
- * On an entry from user mode, the stub switches from the trampoline or
- * IST stack to the normal thread stack. On an exit to user mode, the
- * normal exit-to-usermode path is invoked.
- *
- * On an exit to kernel mode, if @paranoid == 0, we check for preemption,
- * whereas we omit the preemption check if @paranoid != 0. This is purely
- * because the implementation is simpler this way. The kernel only needs
- * to check for asynchronous kernel preemption when IRQ handlers return.
- *
- * If @paranoid == 0, then the stub will handle IRET faults by pretending
- * that the fault came from user mode. It will handle gs_change faults by
- * pretending that the fault happened with kernel GSBASE. Since this handling
- * is omitted for @paranoid != 0, the #GP, #SS, and #NP stubs must have
- * @paranoid == 0. This special handling will do the wrong thing for
- * espfix-induced #DF on IRET, so #DF must not use @paranoid == 0.
+ * Reload gs selector with exception handling
+ * edi: new selector
*
- * @paranoid == 2 is special: the stub will never switch stacks. This is for
- * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
+ * Is in entry.text as it shouldn't be instrumented.
*/
-.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
-SYM_CODE_START(\sym)
- UNWIND_HINT_IRET_REGS offset=\has_error_code*8
-
- /* Sanity check */
- .if \shift_ist != -1 && \paranoid != 1
- .error "using shift_ist requires paranoid=1"
- .endif
-
- .if \create_gap && \paranoid
- .error "using create_gap requires paranoid=0"
- .endif
-
- ASM_CLAC
-
- .if \has_error_code == 0
- pushq $-1 /* ORIG_RAX: no syscall to restart */
- .endif
-
- .if \paranoid == 1
- testb $3, CS-ORIG_RAX(%rsp) /* If coming from userspace, switch stacks */
- jnz .Lfrom_usermode_switch_stack_\@
- .endif
-
- .if \create_gap == 1
- /*
- * If coming from kernel space, create a 6-word gap to allow the
- * int3 handler to emulate a call instruction.
- */
- testb $3, CS-ORIG_RAX(%rsp)
- jnz .Lfrom_usermode_no_gap_\@
- .rept 6
- pushq 5*8(%rsp)
- .endr
- UNWIND_HINT_IRET_REGS offset=8
-.Lfrom_usermode_no_gap_\@:
- .endif
-
- idtentry_part \do_sym, \has_error_code, \read_cr2, \paranoid, \shift_ist, \ist_offset
-
- .if \paranoid == 1
- /*
- * Entry from userspace. Switch stacks and treat it
- * as a normal entry. This means that paranoid handlers
- * run in real process context if user_mode(regs).
- */
-.Lfrom_usermode_switch_stack_\@:
- idtentry_part \do_sym, \has_error_code, \read_cr2, paranoid=0
- .endif
-
-_ASM_NOKPROBE(\sym)
-SYM_CODE_END(\sym)
-.endm
-
-idtentry divide_error do_divide_error has_error_code=0
-idtentry overflow do_overflow has_error_code=0
-idtentry bounds do_bounds has_error_code=0
-idtentry invalid_op do_invalid_op has_error_code=0
-idtentry device_not_available do_device_not_available has_error_code=0
-idtentry double_fault do_double_fault has_error_code=1 paranoid=2 read_cr2=1
-idtentry coprocessor_segment_overrun do_coprocessor_segment_overrun has_error_code=0
-idtentry invalid_TSS do_invalid_TSS has_error_code=1
-idtentry segment_not_present do_segment_not_present has_error_code=1
-idtentry spurious_interrupt_bug do_spurious_interrupt_bug has_error_code=0
-idtentry coprocessor_error do_coprocessor_error has_error_code=0
-idtentry alignment_check do_alignment_check has_error_code=1
-idtentry simd_coprocessor_error do_simd_coprocessor_error has_error_code=0
-
-
- /*
- * Reload gs selector with exception handling
- * edi: new selector
- */
-SYM_FUNC_START(native_load_gs_index)
+SYM_FUNC_START(asm_load_gs_index)
FRAME_BEGIN
- pushfq
- DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
- TRACE_IRQS_OFF
- SWAPGS
+ swapgs
.Lgs_change:
movl %edi, %gs
2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE
- SWAPGS
- TRACE_IRQS_FLAGS (%rsp)
- popfq
+ swapgs
FRAME_END
ret
-SYM_FUNC_END(native_load_gs_index)
-EXPORT_SYMBOL(native_load_gs_index)
+SYM_FUNC_END(asm_load_gs_index)
+EXPORT_SYMBOL(asm_load_gs_index)
_ASM_EXTABLE(.Lgs_change, .Lbad_gs)
.section .fixup, "ax"
/* running with kernelgs */
SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
- SWAPGS /* switch back to user gs */
+ swapgs /* switch back to user gs */
.macro ZAP_GS
/* This can't be a string because the preprocessor needs to see it. */
movl $__USER_DS, %eax
@@ -1074,20 +681,46 @@ SYM_CODE_START_LOCAL_NOALIGN(.Lbad_gs)
SYM_CODE_END(.Lbad_gs)
.previous
-/* Call softirq on interrupt stack. Interrupts are off. */
-SYM_FUNC_START(do_softirq_own_stack)
- pushq %rbp
- mov %rsp, %rbp
- ENTER_IRQ_STACK regs=0 old_rsp=%r11
- call __do_softirq
- LEAVE_IRQ_STACK regs=0
+/*
+ * rdi: New stack pointer points to the top word of the stack
+ * rsi: Function pointer
+ * rdx: Function argument (can be NULL if none)
+ */
+SYM_FUNC_START(asm_call_on_stack)
+ /*
+ * Save the frame pointer unconditionally. This allows the ORC
+ * unwinder to handle the stack switch.
+ */
+ pushq %rbp
+ mov %rsp, %rbp
+
+ /*
+ * The unwinder relies on the word at the top of the new stack
+ * page linking back to the previous RSP.
+ */
+ mov %rsp, (%rdi)
+ mov %rdi, %rsp
+ /* Move the argument to the right place */
+ mov %rdx, %rdi
+
+1:
+ .pushsection .discard.instr_begin
+ .long 1b - .
+ .popsection
+
+ CALL_NOSPEC rsi
+
+2:
+ .pushsection .discard.instr_end
+ .long 2b - .
+ .popsection
+
+ /* Restore the previous stack pointer from RBP. */
leaveq
ret
-SYM_FUNC_END(do_softirq_own_stack)
+SYM_FUNC_END(asm_call_on_stack)
#ifdef CONFIG_XEN_PV
-idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
-
/*
* A note on the "critical region" in our callback handler.
* We want to avoid stacking callback handlers due to events occurring
@@ -1100,9 +733,10 @@ idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0
* So, on entry to the handler we detect whether we interrupted an
* existing activation in its critical region -- if so, we pop the current
* activation and restart the handler using the previous one.
+ *
+ * C calling convention: exc_xen_hypervisor_callback(struct pt_regs *)
*/
-/* do_hypervisor_callback(struct *pt_regs) */
-SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
+SYM_CODE_START_LOCAL(exc_xen_hypervisor_callback)
/*
* Since we don't modify %rdi, evtchn_do_upcall(struct *pt_regs) will
@@ -1112,15 +746,10 @@ SYM_CODE_START_LOCAL(xen_do_hypervisor_callback)
movq %rdi, %rsp /* we don't return, adjust the stack frame */
UNWIND_HINT_REGS
- ENTER_IRQ_STACK old_rsp=%r10
- call xen_evtchn_do_upcall
- LEAVE_IRQ_STACK
+ call xen_pv_evtchn_do_upcall
-#ifndef CONFIG_PREEMPTION
- call xen_maybe_preempt_hcall
-#endif
- jmp error_exit
-SYM_CODE_END(xen_do_hypervisor_callback)
+ jmp error_return
+SYM_CODE_END(exc_xen_hypervisor_callback)
/*
* Hypervisor uses this for application faults while it executes.
@@ -1155,7 +784,7 @@ SYM_CODE_START(xen_failsafe_callback)
addq $0x30, %rsp
pushq $0 /* RIP */
UNWIND_HINT_IRET_REGS offset=8
- jmp general_protection
+ jmp asm_exc_general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp), %rcx
movq 8(%rsp), %r11
@@ -1164,48 +793,10 @@ SYM_CODE_START(xen_failsafe_callback)
pushq $-1 /* orig_ax = -1 => not a system call */
PUSH_AND_CLEAR_REGS
ENCODE_FRAME_POINTER
- jmp error_exit
+ jmp error_return
SYM_CODE_END(xen_failsafe_callback)
#endif /* CONFIG_XEN_PV */
-#ifdef CONFIG_XEN_PVHVM
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- xen_hvm_callback_vector xen_evtchn_do_upcall
-#endif
-
-
-#if IS_ENABLED(CONFIG_HYPERV)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- hyperv_callback_vector hyperv_vector_handler
-
-apicinterrupt3 HYPERV_REENLIGHTENMENT_VECTOR \
- hyperv_reenlightenment_vector hyperv_reenlightenment_intr
-
-apicinterrupt3 HYPERV_STIMER0_VECTOR \
- hv_stimer0_callback_vector hv_stimer0_vector_handler
-#endif /* CONFIG_HYPERV */
-
-#if IS_ENABLED(CONFIG_ACRN_GUEST)
-apicinterrupt3 HYPERVISOR_CALLBACK_VECTOR \
- acrn_hv_callback_vector acrn_hv_vector_handler
-#endif
-
-idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET
-idtentry int3 do_int3 has_error_code=0 create_gap=1
-idtentry stack_segment do_stack_segment has_error_code=1
-
-#ifdef CONFIG_XEN_PV
-idtentry xennmi do_nmi has_error_code=0
-idtentry xendebug do_debug has_error_code=0
-#endif
-
-idtentry general_protection do_general_protection has_error_code=1
-idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
-
-#ifdef CONFIG_X86_MCE
-idtentry machine_check do_mce has_error_code=0 paranoid=1
-#endif
-
/*
* Save all registers in pt_regs, and switch gs if needed.
* Use slow, but surefire "are we in kernel?" check.
@@ -1261,17 +852,13 @@ SYM_CODE_END(paranoid_entry)
*/
SYM_CODE_START_LOCAL(paranoid_exit)
UNWIND_HINT_REGS
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF_DEBUG
testl %ebx, %ebx /* swapgs needed? */
jnz .Lparanoid_exit_no_swapgs
- TRACE_IRQS_IRETQ
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
SWAPGS_UNSAFE_STACK
jmp restore_regs_and_return_to_kernel
.Lparanoid_exit_no_swapgs:
- TRACE_IRQS_IRETQ_DEBUG
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%rbx save_reg=%r14
jmp restore_regs_and_return_to_kernel
@@ -1335,7 +922,6 @@ SYM_CODE_START_LOCAL(error_entry)
*/
SWAPGS
FENCE_SWAPGS_USER_ENTRY
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
jmp .Lerror_entry_done
.Lbstep_iret:
@@ -1362,14 +948,13 @@ SYM_CODE_START_LOCAL(error_entry)
jmp .Lerror_entry_from_usermode_after_swapgs
SYM_CODE_END(error_entry)
-SYM_CODE_START_LOCAL(error_exit)
+SYM_CODE_START_LOCAL(error_return)
UNWIND_HINT_REGS
- DISABLE_INTERRUPTS(CLBR_ANY)
- TRACE_IRQS_OFF
+ DEBUG_ENTRY_ASSERT_IRQS_OFF
testb $3, CS(%rsp)
- jz retint_kernel
- jmp .Lretint_user
-SYM_CODE_END(error_exit)
+ jz restore_regs_and_return_to_kernel
+ jmp swapgs_restore_regs_and_return_to_usermode
+SYM_CODE_END(error_return)
/*
* Runs on exception stack. Xen PV does not go through this path at all,
@@ -1379,7 +964,7 @@ SYM_CODE_END(error_exit)
* %r14: Used to save/restore the CR3 of the interrupted context
* when PAGE_TABLE_ISOLATION is in use. Do not clobber.
*/
-SYM_CODE_START(nmi)
+SYM_CODE_START(asm_exc_nmi)
UNWIND_HINT_IRET_REGS
/*
@@ -1464,7 +1049,7 @@ SYM_CODE_START(nmi)
movq %rsp, %rdi
movq $-1, %rsi
- call do_nmi
+ call exc_nmi
/*
* Return back to user mode. We must *not* do the normal exit
@@ -1521,7 +1106,7 @@ SYM_CODE_START(nmi)
* end_repeat_nmi, then we are a nested NMI. We must not
* modify the "iret" frame because it's being written by
* the outer NMI. That's okay; the outer NMI handler is
- * about to about to call do_nmi anyway, so we can just
+ * about to call exc_nmi() anyway, so we can just
* resume the outer NMI.
*/
@@ -1640,7 +1225,7 @@ repeat_nmi:
* RSP is pointing to "outermost RIP". gsbase is unknown, but, if
* we're repeating an NMI, gsbase has the same value that it had on
* the first iteration. paranoid_entry will load the kernel
- * gsbase if needed before we call do_nmi. "NMI executing"
+ * gsbase if needed before we call exc_nmi(). "NMI executing"
* is zero.
*/
movq $1, 10*8(%rsp) /* Set "NMI executing". */
@@ -1674,10 +1259,9 @@ end_repeat_nmi:
call paranoid_entry
UNWIND_HINT_REGS
- /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
movq %rsp, %rdi
movq $-1, %rsi
- call do_nmi
+ call exc_nmi
/* Always restore stashed CR3 value (see paranoid_entry) */
RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
@@ -1714,7 +1298,7 @@ nmi_restore:
* about espfix64 on the way back to kernel mode.
*/
iretq
-SYM_CODE_END(nmi)
+SYM_CODE_END(asm_exc_nmi)
#ifndef CONFIG_IA32_EMULATION
/*
@@ -1728,6 +1312,7 @@ SYM_CODE_START(ignore_sysret)
SYM_CODE_END(ignore_sysret)
#endif
+.pushsection .text, "ax"
SYM_CODE_START(rewind_stack_do_exit)
UNWIND_HINT_FUNC
/* Prevent any naive code from trying to unwind to our caller. */
@@ -1739,3 +1324,4 @@ SYM_CODE_START(rewind_stack_do_exit)
call do_exit
SYM_CODE_END(rewind_stack_do_exit)
+.popsection
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index f1d3ccae5dd5..0f974ae01e62 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -46,12 +46,14 @@
* ebp user stack
* 0(%ebp) arg6
*/
-SYM_FUNC_START(entry_SYSENTER_compat)
+SYM_CODE_START(entry_SYSENTER_compat)
+ UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
SWAPGS
- /* We are about to clobber %rsp anyway, clobbering here is OK */
- SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+ pushq %rax
+ SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
+ popq %rax
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -104,6 +106,9 @@ SYM_FUNC_START(entry_SYSENTER_compat)
xorl %r14d, %r14d /* nospec r14 */
pushq $0 /* pt_regs->r15 = 0 */
xorl %r15d, %r15d /* nospec r15 */
+
+ UNWIND_HINT_REGS
+
cld
/*
@@ -129,17 +134,11 @@ SYM_FUNC_START(entry_SYSENTER_compat)
jnz .Lsysenter_fix_flags
.Lsysenter_flags_fixed:
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
-
movq %rsp, %rdi
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
jmp sysret32_from_system_call
.Lsysenter_fix_flags:
@@ -147,7 +146,7 @@ SYM_FUNC_START(entry_SYSENTER_compat)
popfq
jmp .Lsysenter_flags_fixed
SYM_INNER_LABEL(__end_entry_SYSENTER_compat, SYM_L_GLOBAL)
-SYM_FUNC_END(entry_SYSENTER_compat)
+SYM_CODE_END(entry_SYSENTER_compat)
/*
* 32-bit SYSCALL entry.
@@ -197,6 +196,7 @@ SYM_FUNC_END(entry_SYSENTER_compat)
* 0(%esp) arg6
*/
SYM_CODE_START(entry_SYSCALL_compat)
+ UNWIND_HINT_EMPTY
/* Interrupts are off on entry. */
swapgs
@@ -247,17 +247,13 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
pushq $0 /* pt_regs->r15 = 0 */
xorl %r15d, %r15d /* nospec r15 */
- /*
- * User mode is traced as though IRQs are on, and SYSENTER
- * turned them off.
- */
- TRACE_IRQS_OFF
+ UNWIND_HINT_REGS
movq %rsp, %rdi
call do_fast_syscall_32
/* XEN PV guests always use IRET path */
- ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \
- "jmp .Lsyscall_32_done", X86_FEATURE_XENPV
+ ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/* Opportunistic SYSRET */
sysret32_from_system_call:
@@ -266,7 +262,7 @@ sysret32_from_system_call:
* stack. So let's erase the thread stack right now.
*/
STACKLEAK_ERASE
- TRACE_IRQS_ON /* User mode traces as IRQs on. */
+
movq RBX(%rsp), %rbx /* pt_regs->rbx */
movq RBP(%rsp), %rbp /* pt_regs->rbp */
movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */
@@ -340,6 +336,7 @@ SYM_CODE_END(entry_SYSCALL_compat)
* ebp arg6
*/
SYM_CODE_START(entry_INT80_compat)
+ UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
*/
@@ -361,8 +358,11 @@ SYM_CODE_START(entry_INT80_compat)
/* Need to switch before accessing the thread stack. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
/* In the Xen PV case we already run on the thread stack. */
- ALTERNATIVE "movq %rsp, %rdi", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
+ ALTERNATIVE "", "jmp .Lint80_keep_stack", X86_FEATURE_XENPV
+
+ movq %rsp, %rdi
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
pushq 6*8(%rdi) /* regs->ss */
@@ -401,19 +401,12 @@ SYM_CODE_START(entry_INT80_compat)
xorl %r14d, %r14d /* nospec r14 */
pushq %r15 /* pt_regs->r15 */
xorl %r15d, %r15d /* nospec r15 */
- cld
- /*
- * User mode is traced as though IRQs are on, and the interrupt
- * gate turned them off.
- */
- TRACE_IRQS_OFF
+ UNWIND_HINT_REGS
+
+ cld
movq %rsp, %rdi
call do_int80_syscall_32
-.Lsyscall_32_done:
-
- /* Go back to user mode. */
- TRACE_IRQS_ON
jmp swapgs_restore_regs_and_return_to_usermode
SYM_CODE_END(entry_INT80_compat)
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index dbe4493b534e..ccd32877a3c4 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -3,7 +3,6 @@
* Save registers before calling assembly functions. This avoids
* disturbance of register allocation in some inline assembly constructs.
* Copyright 2001,2002 by Andi Kleen, SuSE Labs.
- * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc.
*/
#include <linux/linkage.h>
#include "calling.h"
@@ -37,15 +36,6 @@ SYM_FUNC_END(\name)
_ASM_NOKPROBE(\name)
.endm
-#ifdef CONFIG_TRACE_IRQFLAGS
- THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1
- THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
-#endif
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
- THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
-#endif
-
#ifdef CONFIG_PREEMPTION
THUNK preempt_schedule_thunk, preempt_schedule
THUNK preempt_schedule_notrace_thunk, preempt_schedule_notrace
@@ -53,9 +43,7 @@ SYM_FUNC_END(\name)
EXPORT_SYMBOL(preempt_schedule_notrace_thunk)
#endif
-#if defined(CONFIG_TRACE_IRQFLAGS) \
- || defined(CONFIG_DEBUG_LOCK_ALLOC) \
- || defined(CONFIG_PREEMPTION)
+#ifdef CONFIG_PREEMPTION
SYM_CODE_START_LOCAL_NOALIGN(.L_restore)
popq %r11
popq %r10