From 905a36a2851838bca5a424fb758e201990234e6e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 13:37:36 +0200 Subject: x86/asm/entry: Move entry_64.S and entry_32.S to arch/x86/entry/ Create a new directory hierarchy for the low level x86 entry code: arch/x86/entry/* This will host all the low level glue that is currently scattered all across arch/x86/. Start with entry_64.S and entry_32.S. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 arch/x86/entry/Makefile (limited to 'arch/x86/entry/Makefile') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile new file mode 100644 index 000000000000..fa7e0cf6d3c4 --- /dev/null +++ b/arch/x86/entry/Makefile @@ -0,0 +1,4 @@ +# +# Makefile for the x86 low level entry code +# +obj-y := entry_$(BITS).o -- cgit v1.2.3 From 19a433f45135656d065bdf1bbe543796b194ba3a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:00:59 +0200 Subject: x86/asm/entry: Move the compat syscall entry code to arch/x86/entry/ Move the ia32entry.S file over into arch/x86/entry/. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 5 +- arch/x86/entry/ia32entry.S | 522 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/ia32/Makefile | 2 +- arch/x86/ia32/ia32entry.S | 522 --------------------------------------------- 4 files changed, 527 insertions(+), 524 deletions(-) create mode 100644 arch/x86/entry/ia32entry.S delete mode 100644 arch/x86/ia32/ia32entry.S (limited to 'arch/x86/entry/Makefile') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index fa7e0cf6d3c4..4a626594fa79 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,4 +1,7 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o +obj-y := entry_$(BITS).o + +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o + diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S new file mode 100644 index 000000000000..2be23c734db5 --- /dev/null +++ b/arch/x86/entry/ia32entry.S @@ -0,0 +1,522 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysexit_audit ia32_ret_from_sys_call +#define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + + /* clobbers %rax */ + .macro CLEAR_RREGS _r9=rax + xorl %eax,%eax + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %\_r9,R9(%rsp) + movq %rax,R8(%rsp) + .endm + + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. Instead, we just truncate that + * value to 32 bits again as we did on entry from user mode. + * If it's a new value set by user_regset during entry tracing, + * this matches the normal truncation of the user-mode value. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + .macro LOAD_ARGS32 _r9=0 + .if \_r9 + movl R9(%rsp),%r9d + .endif + movl RCX(%rsp),%ecx + movl RDX(%rsp),%edx + movl RSI(%rsp),%esi + movl RDI(%rsp),%edi + movl %eax,%eax /* zero extension */ + .endm + + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%rbp),%ebp + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT,EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp),%ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx,%edx /* avoid info leaks */ + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + movl EFLAGS(%rsp),%r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp),%esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r8d /* 5th arg: 4th syscall arg */ + movl %ecx,%r9d /*swap with edx*/ + movl %edx,%ecx /* 4th arg: 3rd syscall arg */ + movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ + movl %ebx,%esi /* 2nd arg: 1st syscall arg */ + movl %eax,%edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp),%eax /* reload syscall number */ + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI(%rsp),%r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax,%esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO,%eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp),%rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + CLEAR_RREGS + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + CLEAR_RREGS + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(ia32_sysenter_target) + +/* + * 32bit SYSCALL instruction entry. + * + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movl %esp,%r8d + movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp,%ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + /* + * no need to do an access_ok check here because r8 has been + * 32bit zero extended + */ + ASM_STAC +1: movl (%r8),%r9d + _ASM_EXTABLE(1b,ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + /* r9 already loaded */ /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp),%ecx + movl EFLAGS(%rsp),%r11d + xorq %r10,%r10 + xorq %r9,%r9 + xorq %r8,%r8 + TRACE_IRQS_ON + movl RSP(%rsp),%esp + /* + * 64bit->32bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32bit->32bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + movl %r9d,R9(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9(%rsp),%r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + xchgl %r9d,%ebp + SAVE_EXTRA_REGS + CLEAR_RREGS r9 + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + xchgl %ebp,%r9d + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT,%rax + jmp ia32_sysret + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys +ia32_do_call: + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1),%rax + ja 1f + call *ia32_sys_call_table(,%rax,8) # xxx: rip relative +ia32_sysret: + movq %rax,RAX(%rsp) +1: +ia32_ret_from_sys_call: + CLEAR_RREGS + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + CLEAR_RREGS + movq %rsp,%rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(ia32_syscall) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip),%rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip),%rax + mov %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index bb635c641869..cd4339bae066 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -2,7 +2,7 @@ # Makefile for the ia32 kernel emulation subsystem. # -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o +obj-$(CONFIG_IA32_EMULATION) := sys_ia32.o ia32_signal.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S deleted file mode 100644 index 2be23c734db5..000000000000 --- a/arch/x86/ia32/ia32entry.S +++ /dev/null @@ -1,522 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -#define sysexit_audit ia32_ret_from_sys_call -#define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - - /* clobbers %rax */ - .macro CLEAR_RREGS _r9=rax - xorl %eax,%eax - movq %rax,R11(%rsp) - movq %rax,R10(%rsp) - movq %\_r9,R9(%rsp) - movq %rax,R8(%rsp) - .endm - - /* - * Reload arg registers from stack in case ptrace changed them. - * We don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. Instead, we just truncate that - * value to 32 bits again as we did on entry from user mode. - * If it's a new value set by user_regset during entry tracing, - * this matches the normal truncation of the user-mode value. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - .macro LOAD_ARGS32 _r9=0 - .if \_r9 - movl R9(%rsp),%r9d - .endif - movl RCX(%rsp),%ecx - movl RDX(%rsp),%edx - movl RSI(%rsp),%esi - movl RDI(%rsp),%edi - movl %eax,%eax /* zero extension */ - .endm - - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - -/* - * 32bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - - /* - * no need to do an access_ok check here because rbp has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%rbp),%ebp - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. - */ - testl $X86_EFLAGS_NT,EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp),%ecx /* User %eip */ - RESTORE_RSI_RDI - xorl %edx,%edx /* avoid info leaks */ - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - movl EFLAGS(%rsp),%r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp),%esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi,%r8d /* 5th arg: 4th syscall arg */ - movl %ecx,%r9d /*swap with edx*/ - movl %edx,%ecx /* 4th arg: 3rd syscall arg */ - movl %r9d,%edx /* 3rd arg: 2nd syscall arg */ - movl %ebx,%esi /* 2nd arg: 1st syscall arg */ - movl %eax,%edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp),%eax /* reload syscall number */ - movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI(%rsp),%r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO,%eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al,%edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp),%rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - CLEAR_RREGS - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp,%r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - CLEAR_RREGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) - -/* - * 32bit SYSCALL instruction entry. - * - * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movl %esp,%r8d - movq PER_CPU_VAR(cpu_current_top_of_stack),%rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %r8 /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp,%ecx - pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - - /* - * no need to do an access_ok check here because r8 has been - * 32bit zero extended - */ - ASM_STAC -1: movl (%r8),%r9d - _ASM_EXTABLE(1b,ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - RESTORE_RSI_RDI_RDX - movl RIP(%rsp),%ecx - movl EFLAGS(%rsp),%r11d - xorq %r10,%r10 - xorq %r9,%r9 - xorq %r8,%r8 - TRACE_IRQS_ON - movl RSP(%rsp),%esp - /* - * 64bit->32bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32bit->32bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - movl %r9d,R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp),%r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - xchgl %r9d,%ebp - SAVE_EXTRA_REGS - CLEAR_RREGS r9 - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - xchgl %ebp,%r9d - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT,%rax - jmp ia32_sysret - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax,%eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys -ia32_do_call: - /* 32bit syscall -> 64bit C ABI argument conversion */ - movl %edi,%r8d /* arg5 */ - movl %ebp,%r9d /* arg6 */ - xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ - movl %ebx,%edi /* arg1 */ - movl %edx,%edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1),%rax - ja 1f - call *ia32_sys_call_table(,%rax,8) # xxx: rip relative -ia32_sysret: - movq %rax,RAX(%rsp) -1: -ia32_ret_from_sys_call: - CLEAR_RREGS - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - CLEAR_RREGS - movq %rsp,%rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ - RESTORE_EXTRA_REGS - jmp ia32_do_call -END(ia32_syscall) - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip),%rax - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip),%rax - mov %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) -- cgit v1.2.3 From d603c8e184d882cc3f55c599f4185bad956ee0a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:05:44 +0200 Subject: x86/asm/entry, x86/vdso: Move the vDSO code to arch/x86/entry/vdso/ Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- MAINTAINERS | 2 +- arch/x86/Kbuild | 2 +- arch/x86/Makefile | 2 +- arch/x86/entry/Makefile | 2 + arch/x86/entry/vdso/.gitignore | 7 + arch/x86/entry/vdso/Makefile | 209 +++++++++++++++ arch/x86/entry/vdso/checkundef.sh | 10 + arch/x86/entry/vdso/vclock_gettime.c | 351 +++++++++++++++++++++++++ arch/x86/entry/vdso/vdso-layout.lds.S | 118 +++++++++ arch/x86/entry/vdso/vdso-note.S | 12 + arch/x86/entry/vdso/vdso.lds.S | 29 ++ arch/x86/entry/vdso/vdso2c.c | 253 ++++++++++++++++++ arch/x86/entry/vdso/vdso2c.h | 175 ++++++++++++ arch/x86/entry/vdso/vdso32-setup.c | 120 +++++++++ arch/x86/entry/vdso/vdso32/.gitignore | 1 + arch/x86/entry/vdso/vdso32/int80.S | 56 ++++ arch/x86/entry/vdso/vdso32/note.S | 44 ++++ arch/x86/entry/vdso/vdso32/sigreturn.S | 145 ++++++++++ arch/x86/entry/vdso/vdso32/syscall.S | 75 ++++++ arch/x86/entry/vdso/vdso32/sysenter.S | 116 ++++++++ arch/x86/entry/vdso/vdso32/vclock_gettime.c | 30 +++ arch/x86/entry/vdso/vdso32/vdso-fakesections.c | 1 + arch/x86/entry/vdso/vdso32/vdso32.lds.S | 37 +++ arch/x86/entry/vdso/vdsox32.lds.S | 25 ++ arch/x86/entry/vdso/vgetcpu.c | 28 ++ arch/x86/entry/vdso/vma.c | 300 +++++++++++++++++++++ arch/x86/vdso/.gitignore | 7 - arch/x86/vdso/Makefile | 209 --------------- arch/x86/vdso/checkundef.sh | 10 - arch/x86/vdso/vclock_gettime.c | 351 ------------------------- arch/x86/vdso/vdso-layout.lds.S | 118 --------- arch/x86/vdso/vdso-note.S | 12 - arch/x86/vdso/vdso.lds.S | 29 -- arch/x86/vdso/vdso2c.c | 253 ------------------ arch/x86/vdso/vdso2c.h | 175 ------------ arch/x86/vdso/vdso32-setup.c | 120 --------- arch/x86/vdso/vdso32/.gitignore | 1 - arch/x86/vdso/vdso32/int80.S | 56 ---- arch/x86/vdso/vdso32/note.S | 44 ---- arch/x86/vdso/vdso32/sigreturn.S | 145 ---------- arch/x86/vdso/vdso32/syscall.S | 75 ------ arch/x86/vdso/vdso32/sysenter.S | 116 -------- arch/x86/vdso/vdso32/vclock_gettime.c | 30 --- arch/x86/vdso/vdso32/vdso-fakesections.c | 1 - arch/x86/vdso/vdso32/vdso32.lds.S | 37 --- arch/x86/vdso/vdsox32.lds.S | 25 -- arch/x86/vdso/vgetcpu.c | 28 -- arch/x86/vdso/vma.c | 300 --------------------- 48 files changed, 2147 insertions(+), 2145 deletions(-) create mode 100644 arch/x86/entry/vdso/.gitignore create mode 100644 arch/x86/entry/vdso/Makefile create mode 100755 arch/x86/entry/vdso/checkundef.sh create mode 100644 arch/x86/entry/vdso/vclock_gettime.c create mode 100644 arch/x86/entry/vdso/vdso-layout.lds.S create mode 100644 arch/x86/entry/vdso/vdso-note.S create mode 100644 arch/x86/entry/vdso/vdso.lds.S create mode 100644 arch/x86/entry/vdso/vdso2c.c create mode 100644 arch/x86/entry/vdso/vdso2c.h create mode 100644 arch/x86/entry/vdso/vdso32-setup.c create mode 100644 arch/x86/entry/vdso/vdso32/.gitignore create mode 100644 arch/x86/entry/vdso/vdso32/int80.S create mode 100644 arch/x86/entry/vdso/vdso32/note.S create mode 100644 arch/x86/entry/vdso/vdso32/sigreturn.S create mode 100644 arch/x86/entry/vdso/vdso32/syscall.S create mode 100644 arch/x86/entry/vdso/vdso32/sysenter.S create mode 100644 arch/x86/entry/vdso/vdso32/vclock_gettime.c create mode 100644 arch/x86/entry/vdso/vdso32/vdso-fakesections.c create mode 100644 arch/x86/entry/vdso/vdso32/vdso32.lds.S create mode 100644 arch/x86/entry/vdso/vdsox32.lds.S create mode 100644 arch/x86/entry/vdso/vgetcpu.c create mode 100644 arch/x86/entry/vdso/vma.c delete mode 100644 arch/x86/vdso/.gitignore delete mode 100644 arch/x86/vdso/Makefile delete mode 100755 arch/x86/vdso/checkundef.sh delete mode 100644 arch/x86/vdso/vclock_gettime.c delete mode 100644 arch/x86/vdso/vdso-layout.lds.S delete mode 100644 arch/x86/vdso/vdso-note.S delete mode 100644 arch/x86/vdso/vdso.lds.S delete mode 100644 arch/x86/vdso/vdso2c.c delete mode 100644 arch/x86/vdso/vdso2c.h delete mode 100644 arch/x86/vdso/vdso32-setup.c delete mode 100644 arch/x86/vdso/vdso32/.gitignore delete mode 100644 arch/x86/vdso/vdso32/int80.S delete mode 100644 arch/x86/vdso/vdso32/note.S delete mode 100644 arch/x86/vdso/vdso32/sigreturn.S delete mode 100644 arch/x86/vdso/vdso32/syscall.S delete mode 100644 arch/x86/vdso/vdso32/sysenter.S delete mode 100644 arch/x86/vdso/vdso32/vclock_gettime.c delete mode 100644 arch/x86/vdso/vdso32/vdso-fakesections.c delete mode 100644 arch/x86/vdso/vdso32/vdso32.lds.S delete mode 100644 arch/x86/vdso/vdsox32.lds.S delete mode 100644 arch/x86/vdso/vgetcpu.c delete mode 100644 arch/x86/vdso/vma.c (limited to 'arch/x86/entry/Makefile') diff --git a/MAINTAINERS b/MAINTAINERS index f8e0afb708b4..68c0cc365432 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10892,7 +10892,7 @@ M: Andy Lutomirski L: linux-kernel@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git x86/vdso S: Maintained -F: arch/x86/vdso/ +F: arch/x86/entry/vdso/ XC2028/3028 TUNER DRIVER M: Mauro Carvalho Chehab diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index b9b816277e72..1538562cc720 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -14,7 +14,7 @@ obj-y += kernel/ obj-y += mm/ obj-y += crypto/ -obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 43e8328a23e4..90b1c3bfec46 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -244,7 +244,7 @@ install: PHONY += vdso_install vdso_install: - $(Q)$(MAKE) $(build)=arch/x86/vdso $@ + $(Q)$(MAKE) $(build)=arch/x86/entry/vdso $@ archclean: $(Q)rm -rf $(objtree)/arch/i386 diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 4a626594fa79..c97532492ef5 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -3,5 +3,7 @@ # obj-y := entry_$(BITS).o +obj-y += vdso/ + obj-$(CONFIG_IA32_EMULATION) += ia32entry.o diff --git a/arch/x86/entry/vdso/.gitignore b/arch/x86/entry/vdso/.gitignore new file mode 100644 index 000000000000..aae8ffdd5880 --- /dev/null +++ b/arch/x86/entry/vdso/.gitignore @@ -0,0 +1,7 @@ +vdso.lds +vdsox32.lds +vdso32-syscall-syms.lds +vdso32-sysenter-syms.lds +vdso32-int80-syms.lds +vdso-image-*.c +vdso2c diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile new file mode 100644 index 000000000000..e97032069f88 --- /dev/null +++ b/arch/x86/entry/vdso/Makefile @@ -0,0 +1,209 @@ +# +# Building vDSO images for x86. +# + +KBUILD_CFLAGS += $(DISABLE_LTO) +KASAN_SANITIZE := n + +VDSO64-$(CONFIG_X86_64) := y +VDSOX32-$(CONFIG_X86_X32_ABI) := y +VDSO32-$(CONFIG_X86_32) := y +VDSO32-$(CONFIG_COMPAT) := y + +# files to link into the vdso +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o + +# files to link into kernel +obj-y += vma.o + +# vDSO images to build +vdso_img-$(VDSO64-y) += 64 +vdso_img-$(VDSOX32-y) += x32 +vdso_img-$(VDSO32-y) += 32-int80 +vdso_img-$(CONFIG_COMPAT) += 32-syscall +vdso_img-$(VDSO32-y) += 32-sysenter + +obj-$(VDSO32-y) += vdso32-setup.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.lds $(vobjs-y) + +# Build the vDSO image C files and link them in. +vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) +vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) +vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) +obj-y += $(vdso_img_objs) +targets += $(vdso_img_cfiles) +targets += $(vdso_img_sodbg) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ + $(vdso_img-y:%=$(obj)/vdso%.so) + +export CPPFLAGS_vdso.lds += -P -C + +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,--no-undefined \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ + $(DISABLE_LTO) + +$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,vdso) + +HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi +hostprogs-y += vdso2c + +quiet_cmd_vdso2c = VDSO2C $@ +define cmd_vdso2c + $(obj)/vdso2c $< $(<:%.dbg=%) $@ +endef + +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE + $(call if_changed,vdso2c) + +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# +CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -DDISABLE_BRANCH_PROFILING + +$(vobjs): KBUILD_CFLAGS += $(CFL) + +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vvar.o = -pg + +# +# X32 processes use x32 vDSO to access 64bit kernel data. +# +# Build x32 vDSO image: +# 1. Compile x32 vDSO as 64bit. +# 2. Convert object files to x32. +# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes +# so that it can reach 64bit address space with 64bit pointers. +# + +CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ + -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 \ + -Wl,-z,common-page-size=4096 + +# 64-bit objects to re-brand as x32 +vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y)) + +# x32-rebranded versions +vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o) + +# same thing, but in the output directory +vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) + +# Convert 64bit object file to x32 for x32 vDSO. +quiet_cmd_x32 = X32 $@ + cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@ + +$(obj)/%-x32.o: $(obj)/%.o FORCE + $(call if_changed,x32) + +targets += vdsox32.lds $(vobjx32s-y) + +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg + $(call if_changed,objcopy) + +$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE + $(call if_changed,vdso) + +# +# Build multiple 32-bit vDSO images to choose from at boot time. +# +vdso32.so-$(VDSO32-y) += int80 +vdso32.so-$(CONFIG_COMPAT) += syscall +vdso32.so-$(VDSO32-y) += sysenter + +vdso32-images = $(vdso32.so-y:%=vdso32-%.so) + +CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) +VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 + +# This makes sure the $(obj) subdirectory exists even though vdso32/ +# is not a kbuild sub-make subdirectory. +override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ + +targets += vdso32/vdso32.lds +targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/vclock_gettime.o + +$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) +KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic +KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) +KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) +KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING +$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) + +$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/%.o + $(call if_changed,vdso) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ + $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) +GCOV_PROFILE := n + +# +# Install the unstripped copies of vdso*.so. If our toolchain supports +# build-id, install .build-id links as well. +# +quiet_cmd_vdso_install = INSTALL $(@:install_%=%) +define cmd_vdso_install + cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ + if readelf -n $< |grep -q 'Build ID'; then \ + buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ + first=`echo $$buildid | cut -b-2`; \ + last=`echo $$buildid | cut -b3-`; \ + mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ + ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ + fi +endef + +vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) + +$(MODLIB)/vdso: FORCE + @mkdir -p $(MODLIB)/vdso + +$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE + $(call cmd,vdso_install) + +PHONY += vdso_install $(vdso_img_insttargets) +vdso_install: $(vdso_img_insttargets) FORCE + +clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/checkundef.sh b/arch/x86/entry/vdso/checkundef.sh new file mode 100755 index 000000000000..7ee90a9b549d --- /dev/null +++ b/arch/x86/entry/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +$nm "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c new file mode 100644 index 000000000000..9793322751e0 --- /dev/null +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -0,0 +1,351 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of clock_gettime, gettimeofday, and time. + * + * 32 Bit compat layer by Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * The code should have no internal unresolved relocations. + * Check with readelf after changing. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define gtod (&VVAR(vsyscall_gtod_data)) + +extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); +extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); +extern time_t __vdso_time(time_t *t); + +#ifdef CONFIG_HPET_TIMER +extern u8 hpet_page + __attribute__((visibility("hidden"))); + +static notrace cycle_t vread_hpet(void) +{ + return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); +} +#endif + +#ifndef BUILD_VDSO32 + +#include +#include +#include +#include + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} + +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +{ + const struct pvclock_vsyscall_time_info *pvti_base; + int idx = cpu / (PAGE_SIZE/PVTI_SIZE); + int offset = cpu % (PAGE_SIZE/PVTI_SIZE); + + BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); + + pvti_base = (struct pvclock_vsyscall_time_info *) + __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); + + return &pvti_base[offset]; +} + +static notrace cycle_t vread_pvclock(int *mode) +{ + const struct pvclock_vsyscall_time_info *pvti; + cycle_t ret; + u64 last; + u32 version; + u8 flags; + unsigned cpu, cpu1; + + + /* + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * + */ + do { + cpu = __getcpu() & VGETCPU_CPU_MASK; + /* TODO: We can put vcpu id into higher bits of pvti.version. + * This will save a couple of cycles by getting rid of + * __getcpu() calls (Gleb). + */ + + pvti = get_pvti(cpu); + + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); + + /* + * Test we're still on the cpu as well as the version. + * We could have been migrated just after the first + * vgetcpu but before fetching the version, so we + * wouldn't notice a version change. + */ + cpu1 = __getcpu() & VGETCPU_CPU_MASK; + } while (unlikely(cpu != cpu1 || + (pvti->pvti.version & 1) || + pvti->pvti.version != version)); + + if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + *mode = VCLOCK_NONE; + + /* refer to tsc.c read_tsc() comment for rationale */ + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + return last; +} +#endif + +#else + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "memory", "edx"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "memory", "edx"); + return ret; +} + +#ifdef CONFIG_PARAVIRT_CLOCK + +static notrace cycle_t vread_pvclock(int *mode) +{ + *mode = VCLOCK_NONE; + return 0; +} +#endif + +#endif + +notrace static cycle_t vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)__native_read_tsc(); + + last = gtod->cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} + +notrace static inline u64 vgetsns(int *mode) +{ + u64 v; + cycles_t cycles; + + if (gtod->vclock_mode == VCLOCK_TSC) + cycles = vread_tsc(); +#ifdef CONFIG_HPET_TIMER + else if (gtod->vclock_mode == VCLOCK_HPET) + cycles = vread_hpet(); +#endif +#ifdef CONFIG_PARAVIRT_CLOCK + else if (gtod->vclock_mode == VCLOCK_PVCLOCK) + cycles = vread_pvclock(mode); +#endif + else + return 0; + v = (cycles - gtod->cycle_last) & gtod->mask; + return v * gtod->mult; +} + +/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ +notrace static int __always_inline do_realtime(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->wall_time_sec; + ns = gtod->wall_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static int __always_inline do_monotonic(struct timespec *ts) +{ + unsigned long seq; + u64 ns; + int mode; + + do { + seq = gtod_read_begin(gtod); + mode = gtod->vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ns = gtod->monotonic_time_snsec; + ns += vgetsns(&mode); + ns >>= gtod->shift; + } while (unlikely(gtod_read_retry(gtod, seq))); + + ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); + ts->tv_nsec = ns; + + return mode; +} + +notrace static void do_realtime_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->wall_time_coarse_sec; + ts->tv_nsec = gtod->wall_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace static void do_monotonic_coarse(struct timespec *ts) +{ + unsigned long seq; + do { + seq = gtod_read_begin(gtod); + ts->tv_sec = gtod->monotonic_time_coarse_sec; + ts->tv_nsec = gtod->monotonic_time_coarse_nsec; + } while (unlikely(gtod_read_retry(gtod, seq))); +} + +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + switch (clock) { + case CLOCK_REALTIME: + if (do_realtime(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_MONOTONIC: + if (do_monotonic(ts) == VCLOCK_NONE) + goto fallback; + break; + case CLOCK_REALTIME_COARSE: + do_realtime_coarse(ts); + break; + case CLOCK_MONOTONIC_COARSE: + do_monotonic_coarse(ts); + break; + default: + goto fallback; + } + + return 0; +fallback: + return vdso_fallback_gettime(clock, ts); +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + if (likely(tv != NULL)) { + if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) + return vdso_fallback_gtod(tv, tz); + tv->tv_usec /= 1000; + } + if (unlikely(tz != NULL)) { + tz->tz_minuteswest = gtod->tz_minuteswest; + tz->tz_dsttime = gtod->tz_dsttime; + } + + return 0; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); + +/* + * This will break when the xtime seconds get inaccurate, but that is + * unlikely + */ +notrace time_t __vdso_time(time_t *t) +{ + /* This is atomic on x86 so we don't need any locks. */ + time_t result = ACCESS_ONCE(gtod->wall_time_sec); + + if (t) + *t = result; + return result; +} +int time(time_t *t) + __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S new file mode 100644 index 000000000000..de2c921025f5 --- /dev/null +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -0,0 +1,118 @@ +#include + +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +#if defined(BUILD_VDSO64) +# define SHDR_SIZE 64 +#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) +# define SHDR_SIZE 40 +#else +# error unknown VDSO target +#endif + +#define NUM_FAKE_SHDRS 13 + +SECTIONS +{ + /* + * User/kernel shared data is before the vDSO. This may be a little + * uglier than putting it after the vDSO, but it avoids issues with + * non-allocatable things that dangle past the end of the PT_LOAD + * segment. + */ + + vvar_start = . - 2 * PAGE_SIZE; + vvar_page = vvar_start; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + hpet_page = vvar_start + PAGE_SIZE; + + . = SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { + *(.rodata*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + + /* + * Ideally this would live in a C file, but that won't + * work cleanly for x32 until we start building the x32 + * C code using an x32 toolchain. + */ + VDSO_FAKE_SECTION_TABLE_START = .; + . = . + NUM_FAKE_SHDRS * SHDR_SIZE; + VDSO_FAKE_SECTION_TABLE_END = .; + } :text + + .fake_shstrtab : { *(.fake_shstrtab) } :text + + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + + /* + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. + */ + + .text : { *(.text*) } :text =0x90909090, + + /* + * At the end so that eu-elflint stays happy when vdso2c strips + * these. A better implementation would avoid allocating space + * for these. + */ + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + /DISCARD/ : { + *(.discard) + *(.discard.*) + *(__bug_table) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/entry/vdso/vdso-note.S b/arch/x86/entry/vdso/vdso-note.S new file mode 100644 index 000000000000..79a071e4357e --- /dev/null +++ b/arch/x86/entry/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include +#include + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S new file mode 100644 index 000000000000..6807932643c2 --- /dev/null +++ b/arch/x86/entry/vdso/vdso.lds.S @@ -0,0 +1,29 @@ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSO64 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + time; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c new file mode 100644 index 000000000000..8627db24a7f6 --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.c @@ -0,0 +1,253 @@ +/* + * vdso2c - A vdso image preparation tool + * Copyright (c) 2014 Andy Lutomirski and others + * Licensed under the GPL v2 + * + * vdso2c requires stripped and unstripped input. It would be trivial + * to fully strip the input in here, but, for reasons described below, + * we need to write a section table. Doing this is more or less + * equivalent to dropping all non-allocatable sections, but it's + * easier to let objcopy handle that instead of doing it ourselves. + * If we ever need to do something fancier than what objcopy provides, + * it would be straightforward to add here. + * + * We're keep a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. + * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +const char *outfilename; + +/* Symbols that we need in vdso2c. */ +enum { + sym_vvar_start, + sym_vvar_page, + sym_hpet_page, + sym_VDSO_FAKE_SECTION_TABLE_START, + sym_VDSO_FAKE_SECTION_TABLE_END, +}; + +const int special_pages[] = { + sym_vvar_page, + sym_hpet_page, +}; + +struct vdso_sym { + const char *name; + bool export; +}; + +struct vdso_sym required_syms[] = { + [sym_vvar_start] = {"vvar_start", true}, + [sym_vvar_page] = {"vvar_page", true}, + [sym_hpet_page] = {"hpet_page", true}, + [sym_VDSO_FAKE_SECTION_TABLE_START] = { + "VDSO_FAKE_SECTION_TABLE_START", false + }, + [sym_VDSO_FAKE_SECTION_TABLE_END] = { + "VDSO_FAKE_SECTION_TABLE_END", false + }, + {"VDSO32_NOTE_MASK", true}, + {"VDSO32_SYSENTER_RETURN", true}, + {"__kernel_vsyscall", true}, + {"__kernel_sigreturn", true}, + {"__kernel_rt_sigreturn", true}, +}; + +__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) +static void fail(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + fprintf(stderr, "Error: "); + vfprintf(stderr, format, ap); + if (outfilename) + unlink(outfilename); + exit(1); + va_end(ap); +} + +/* + * Evil macros for little-endian reads and writes + */ +#define GLE(x, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot) + +extern void bad_get_le(void); +#define LAST_GLE(x) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le()) + +#define GET_LE(x) \ + GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x)))) + +#define PLE(x, val, bits, ifnot) \ + __builtin_choose_expr( \ + (sizeof(*(x)) == bits/8), \ + put_unaligned_le##bits((val), (x)), ifnot) + +extern void bad_put_le(void); +#define LAST_PLE(x, val) \ + __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le()) + +#define PUT_LE(x, val) \ + PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) + + +#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) + +#define BITSFUNC3(name, bits, suffix) name##bits##suffix +#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) + +#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) + +#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x +#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) +#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) + +#define ELF_BITS 64 +#include "vdso2c.h" +#undef ELF_BITS + +#define ELF_BITS 32 +#include "vdso2c.h" +#undef ELF_BITS + +static void go(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; + + if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { + go64(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { + go32(raw_addr, raw_len, stripped_addr, stripped_len, + outfile, name); + } else { + fail("unknown ELF class\n"); + } +} + +static void map_input(const char *name, void **addr, size_t *len, int prot) +{ + off_t tmp_len; + + int fd = open(name, O_RDONLY); + if (fd == -1) + err(1, "%s", name); + + tmp_len = lseek(fd, 0, SEEK_END); + if (tmp_len == (off_t)-1) + err(1, "lseek"); + *len = (size_t)tmp_len; + + *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); + if (*addr == MAP_FAILED) + err(1, "mmap"); + + close(fd); +} + +int main(int argc, char **argv) +{ + size_t raw_len, stripped_len; + void *raw_addr, *stripped_addr; + FILE *outfile; + char *name, *tmp; + int namelen; + + if (argc != 4) { + printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); + return 1; + } + + /* + * Figure out the struct name. If we're writing to a .so file, + * generate raw output insted. + */ + name = strdup(argv[3]); + namelen = strlen(name); + if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { + name = NULL; + } else { + tmp = strrchr(name, '/'); + if (tmp) + name = tmp + 1; + tmp = strchr(name, '.'); + if (tmp) + *tmp = '\0'; + for (tmp = name; *tmp; tmp++) + if (*tmp == '-') + *tmp = '_'; + } + + map_input(argv[1], &raw_addr, &raw_len, PROT_READ); + map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); + + outfilename = argv[3]; + outfile = fopen(outfilename, "w"); + if (!outfile) + err(1, "%s", argv[2]); + + go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); + + munmap(raw_addr, raw_len); + munmap(stripped_addr, stripped_len); + fclose(outfile); + + return 0; +} diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h new file mode 100644 index 000000000000..0224987556ce --- /dev/null +++ b/arch/x86/entry/vdso/vdso2c.h @@ -0,0 +1,175 @@ +/* + * This file is included twice from vdso2c.c. It generates code for 32-bit + * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs + * are built for 32-bit userspace. + */ + +static void BITSFUNC(go)(void *raw_addr, size_t raw_len, + void *stripped_addr, size_t stripped_len, + FILE *outfile, const char *name) +{ + int found_load = 0; + unsigned long load_size = -1; /* Work around bogus warning */ + unsigned long mapping_size; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; + int i; + unsigned long j; + ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + *alt_sec = NULL; + ELF(Dyn) *dyn = 0, *dyn_end = 0; + const char *secstrings; + INT_BITS syms[NSYMS] = {}; + + ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); + + /* Walk the segment table. */ + for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { + if (GET_LE(&pt[i].p_type) == PT_LOAD) { + if (found_load) + fail("multiple PT_LOAD segs\n"); + + if (GET_LE(&pt[i].p_offset) != 0 || + GET_LE(&pt[i].p_vaddr) != 0) + fail("PT_LOAD in wrong place\n"); + + if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz)) + fail("cannot handle memsz != filesz\n"); + + load_size = GET_LE(&pt[i].p_memsz); + found_load = 1; + } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { + dyn = raw_addr + GET_LE(&pt[i].p_offset); + dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + + GET_LE(&pt[i].p_memsz); + } + } + if (!found_load) + fail("no PT_LOAD seg\n"); + + if (stripped_len < load_size) + fail("stripped input is too short\n"); + + /* Walk the dynamic table */ + for (i = 0; dyn + i < dyn_end && + GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { + typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); + if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || + tag == DT_RELENT || tag == DT_TEXTREL) + fail("vdso image contains dynamic relocations\n"); + } + + /* Walk the section table */ + secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); + secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); + for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { + ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * i; + if (GET_LE(&sh->sh_type) == SHT_SYMTAB) + symtab_hdr = sh; + + if (!strcmp(secstrings + GET_LE(&sh->sh_name), + ".altinstructions")) + alt_sec = sh; + } + + if (!symtab_hdr) + fail("no symbol table\n"); + + strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); + + /* Walk the symbol table */ + for (i = 0; + i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); + i++) { + int k; + ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + + GET_LE(&symtab_hdr->sh_entsize) * i; + const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + + GET_LE(&sym->st_name); + + for (k = 0; k < NSYMS; k++) { + if (!strcmp(name, required_syms[k].name)) { + if (syms[k]) { + fail("duplicate symbol %s\n", + required_syms[k].name); + } + + /* + * Careful: we use negative addresses, but + * st_value is unsigned, so we rely + * on syms[k] being a signed type of the + * correct width. + */ + syms[k] = GET_LE(&sym->st_value); + } + } + } + + /* Validate mapping addresses. */ + for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { + INT_BITS symval = syms[special_pages[i]]; + + if (!symval) + continue; /* The mapping isn't used; ignore it. */ + + if (symval % 4096) + fail("%s must be a multiple of 4096\n", + required_syms[i].name); + if (symval + 4096 < syms[sym_vvar_start]) + fail("%s underruns vvar_start\n", + required_syms[i].name); + if (symval + 4096 > 0) + fail("%s is on the wrong side of the vdso text\n", + required_syms[i].name); + } + if (syms[sym_vvar_start] % 4096) + fail("vvar_begin must be a multiple of 4096\n"); + + if (!name) { + fwrite(stripped_addr, stripped_len, 1, outfile); + return; + } + + mapping_size = (stripped_len + 4095) / 4096 * 4096; + + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "#include \n"); + fprintf(outfile, "\n"); + fprintf(outfile, + "static unsigned char raw_data[%lu] __page_aligned_data = {", + mapping_size); + for (j = 0; j < stripped_len; j++) { + if (j % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", + (int)((unsigned char *)stripped_addr)[j]); + } + fprintf(outfile, "\n};\n\n"); + + fprintf(outfile, "static struct page *pages[%lu];\n\n", + mapping_size / 4096); + + fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "\t.data = raw_data,\n"); + fprintf(outfile, "\t.size = %lu,\n", mapping_size); + fprintf(outfile, "\t.text_mapping = {\n"); + fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); + fprintf(outfile, "\t\t.pages = pages,\n"); + fprintf(outfile, "\t},\n"); + if (alt_sec) { + fprintf(outfile, "\t.alt = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_offset)); + fprintf(outfile, "\t.alt_len = %lu,\n", + (unsigned long)GET_LE(&alt_sec->sh_size)); + } + for (i = 0; i < NSYMS; i++) { + if (required_syms[i].export && syms[i]) + fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", + required_syms[i].name, (int64_t)syms[i]); + } + fprintf(outfile, "};\n"); +} diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c new file mode 100644 index 000000000000..e904c270573b --- /dev/null +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -0,0 +1,120 @@ +/* + * (C) Copyright 2002 Linus Torvalds + * Portions based on the vdso-randomization code from exec-shield: + * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar + * + * This file contains the needed initializations to support sysenter. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#ifdef CONFIG_COMPAT_VDSO +#define VDSO_DEFAULT 0 +#else +#define VDSO_DEFAULT 1 +#endif + +/* + * Should the kernel map a VDSO page into processes and pass its + * address down to glibc upon exec()? + */ +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; + +static int __init vdso32_setup(char *s) +{ + vdso32_enabled = simple_strtoul(s, NULL, 0); + + if (vdso32_enabled > 1) + pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); + + return 1; +} + +/* + * For consistency, the argument vdso32=[012] affects the 32-bit vDSO + * behavior on both 64-bit and 32-bit kernels. + * On 32-bit kernels, vdso=[012] means the same thing. + */ +__setup("vdso32=", vdso32_setup); + +#ifdef CONFIG_X86_32 +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); +#endif + +#ifdef CONFIG_X86_64 + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) + +#else /* CONFIG_X86_32 */ + +#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) +#define vdso32_syscall() (0) + +#endif /* CONFIG_X86_64 */ + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +const struct vdso_image *selected_vdso32; +#endif + +int __init sysenter_setup(void) +{ +#ifdef CONFIG_COMPAT + if (vdso32_syscall()) + selected_vdso32 = &vdso_image_32_syscall; + else +#endif + if (vdso32_sysenter()) + selected_vdso32 = &vdso_image_32_sysenter; + else + selected_vdso32 = &vdso_image_32_int80; + + init_vdso_image(selected_vdso32); + + return 0; +} + +#ifdef CONFIG_X86_64 + +subsys_initcall(sysenter_setup); + +#ifdef CONFIG_SYSCTL +/* Register vsyscall32 into the ABI table */ +#include + +static struct ctl_table abi_table2[] = { + { + .procname = "vsyscall32", + .data = &vdso32_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + {} +}; + +static struct ctl_table abi_root_table2[] = { + { + .procname = "abi", + .mode = 0555, + .child = abi_table2 + }, + {} +}; + +static __init int ia32_binfmt_init(void) +{ + register_sysctl_table(abi_root_table2); + return 0; +} +__initcall(ia32_binfmt_init); +#endif /* CONFIG_SYSCTL */ + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/entry/vdso/vdso32/.gitignore b/arch/x86/entry/vdso/vdso32/.gitignore new file mode 100644 index 000000000000..e45fba9d0ced --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/.gitignore @@ -0,0 +1 @@ +vdso32.lds diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S new file mode 100644 index 000000000000..b15b7c01aedb --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/int80.S @@ -0,0 +1,56 @@ +/* + * Code for the vDSO. This version uses the old int $0x80 method. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + int $0x80 + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .previous diff --git a/arch/x86/entry/vdso/vdso32/note.S b/arch/x86/entry/vdso/vdso32/note.S new file mode 100644 index 000000000000..c83f25734696 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/note.S @@ -0,0 +1,44 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include +#include + +/* Ideally this would use UTS_NAME, but using a quoted string here + doesn't work. Remember to change this when changing the + kernel's name. */ +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END + +#ifdef CONFIG_XEN +/* + * Add a special note telling glibc's dynamic linker a fake hardware + * flavor that it will use to choose the search path for libraries in the + * same way it uses real hardware capabilities like "mmx". + * We supply "nosegneg" as the fake capability, to indicate that we + * do not like negative offsets in instructions using segment overrides, + * since we implement those inefficiently. This makes it possible to + * install libraries optimized to avoid those access patterns in someplace + * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file + * corresponding to the bits here is needed to make ldconfig work right. + * It should contain: + * hwcap 1 nosegneg + * to match the mapping of bit to name that we give here. + * + * At runtime, the fake hardware feature will be considered to be present + * if its bit is set in the mask word. So, we start with the mask 0, and + * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. + */ + +#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ + +ELFNOTE_START(GNU, 2, "a") + .long 1 /* ncaps */ +VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ + .long 0 /* mask */ + .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ +ELFNOTE_END +#endif diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S new file mode 100644 index 000000000000..d7ec4e251c0a --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -0,0 +1,145 @@ +/* + * Common code for the sigreturn entry points in vDSO images. + * So far this code is the same for both int80 and sysenter versions. + * This file is #include'd by int80.S et al to define them first thing. + * The kernel assumes that the addresses of these routines are constant + * for all vDSO implementations. + */ + +#include +#include +#include + +#ifndef SYSCALL_ENTER_KERNEL +#define SYSCALL_ENTER_KERNEL int $0x80 +#endif + + .text + .globl __kernel_sigreturn + .type __kernel_sigreturn,@function + nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ + ALIGN +__kernel_sigreturn: +.LSTART_sigreturn: + popl %eax /* XXX does this mean it needs unwind info? */ + movl $__NR_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_sigreturn: + nop + .size __kernel_sigreturn,.-.LSTART_sigreturn + + .globl __kernel_rt_sigreturn + .type __kernel_rt_sigreturn,@function + ALIGN +__kernel_rt_sigreturn: +.LSTART_rt_sigreturn: + movl $__NR_rt_sigreturn, %eax + SYSCALL_ENTER_KERNEL +.LEND_rt_sigreturn: + nop + .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI1: + .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 +.LSTARTCIEDLSI1: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zRS" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0 /* DW_CFA_nop */ + .align 4 +.LENDCIEDLSI1: + .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ +.LSTARTFDEDLSI1: + .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: The dwarf2 unwind routines will subtract 1 from the + return address to get an address in the middle of the + presumed call instruction. Since we didn't get here via + a call, we need to include the nop before the real start + to make up for it. */ + .long .LSTART_sigreturn-1-. /* PC-relative start address */ + .long .LEND_sigreturn-.LSTART_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + complicated by the fact that the "CFA" is always assumed to + be the value of the stack pointer in the caller. This means + that we must define the CFA of this body of code to be the + saved value of the stack pointer in the sigcontext. Which + also means that there is no fixed relation to the other + saved registers, which means that we must use DW_CFA_expression + to compute their addresses. It also means that when we + adjust the stack with the popl, we have to do it all over again. */ + +#define do_cfa_expr(offset) \ + .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ + .byte 0x06; /* DW_OP_deref */ \ +1: + +#define do_expr(regno, offset) \ + .byte 0x10; /* DW_CFA_expression */ \ + .uleb128 regno; /* regno */ \ + .uleb128 1f-0f; /* length */ \ +0: .byte 0x74; /* DW_OP_breg4 */ \ + .sleb128 offset; /* offset */ \ +1: + + do_cfa_expr(IA32_SIGCONTEXT_sp+4) + do_expr(0, IA32_SIGCONTEXT_ax+4) + do_expr(1, IA32_SIGCONTEXT_cx+4) + do_expr(2, IA32_SIGCONTEXT_dx+4) + do_expr(3, IA32_SIGCONTEXT_bx+4) + do_expr(5, IA32_SIGCONTEXT_bp+4) + do_expr(6, IA32_SIGCONTEXT_si+4) + do_expr(7, IA32_SIGCONTEXT_di+4) + do_expr(8, IA32_SIGCONTEXT_ip+4) + + .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ + + do_cfa_expr(IA32_SIGCONTEXT_sp) + do_expr(0, IA32_SIGCONTEXT_ax) + do_expr(1, IA32_SIGCONTEXT_cx) + do_expr(2, IA32_SIGCONTEXT_dx) + do_expr(3, IA32_SIGCONTEXT_bx) + do_expr(5, IA32_SIGCONTEXT_bp) + do_expr(6, IA32_SIGCONTEXT_si) + do_expr(7, IA32_SIGCONTEXT_di) + do_expr(8, IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI1: + + .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ +.LSTARTFDEDLSI2: + .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ + /* HACK: See above wrt unwind library assumptions. */ + .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ + .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 + .uleb128 0 /* Augmentation */ + /* What follows are the instructions for the table generation. + We record the locations of each register saved. This is + slightly less complicated than the above, since we don't + modify the stack pointer in the process. */ + + do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) + do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) + do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) + do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) + do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) + do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) + do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) + do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) + do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) + + .align 4 +.LENDFDEDLSI2: + .previous diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S new file mode 100644 index 000000000000..6b286bb5251c --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/syscall.S @@ -0,0 +1,75 @@ +/* + * Code for the vDSO. This version uses the syscall instruction. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#define SYSCALL_ENTER_KERNEL syscall +#include "sigreturn.S" + +#include + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + push %ebp +.Lpush_ebp: + movl %ecx, %ebp + syscall + movl %ebp, %ecx + popl %ebp +.Lpop_ebp: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + + .section .eh_frame,"a",@progbits +.LSTARTFRAME: + .long .LENDCIE-.LSTARTCIE +.LSTARTCIE: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIE: + + .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ +.LSTARTFDE1: + .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 /* Augmentation length */ + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 8 + .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ + .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .uleb128 4 + .align 4 +.LENDFDE1: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 + .previous diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S new file mode 100644 index 000000000000..e354bceee0e0 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/sysenter.S @@ -0,0 +1,116 @@ +/* + * Code for the vDSO. This version uses the sysenter instruction. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + +/* + * The caller puts arg2 in %ecx, which gets pushed. The kernel will use + * %ecx itself for arg2. The pushing is because the sysexit instruction + * (found in entry.S) requires that we clobber %ecx with the desired %esp. + * User code might expect that %ecx is unclobbered though, as it would be + * for returning via the iret instruction, so we must push and pop. + * + * The caller puts arg3 in %edx, which the sysexit instruction requires + * for %eip. Thus, exactly as for arg2, we must push and pop. + * + * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter + * instruction clobbers %esp, the user's %esp won't even survive entry + * into the kernel. We store %esp in %ebp. Code in entry.S must fetch + * arg6 from the stack. + * + * You can not use this vsyscall for the clone() syscall because the + * three words on the parent stack do not get copied to the child. + */ + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + push %ecx +.Lpush_ecx: + push %edx +.Lpush_edx: + push %ebp +.Lenter_kernel: + movl %esp,%ebp + sysenter + + /* 7: align return point with nop's to make disassembly easier */ + .space 7,0x90 + + /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ + int $0x80 + /* 16: System call normal return point is here! */ +VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ + pop %ebp +.Lpop_ebp: + pop %edx +.Lpop_edx: + pop %ecx +.Lpop_ecx: + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + /* What follows are the instructions for the table generation. + We have to record all changes of the stack pointer. */ + .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x10 /* RA at offset 16 now */ + .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ + /* Finally the epilogue. */ + .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x0c /* RA at offset 12 now */ + .byte 0xc5 /* DW_CFA_restore %ebp */ + .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x08 /* RA at offset 8 now */ + .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ + .byte 0x0e /* DW_CFA_def_cfa_offset */ + .byte 0x04 /* RA at offset 4 now */ + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Emit a symbol with the size of this .eh_frame data, + * to verify it matches the other versions. + */ +VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/entry/vdso/vdso32/vclock_gettime.c b/arch/x86/entry/vdso/vdso32/vclock_gettime.c new file mode 100644 index 000000000000..175cc72c0f68 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vclock_gettime.c @@ -0,0 +1,30 @@ +#define BUILD_VDSO32 + +#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE +#undef CONFIG_OPTIMIZE_INLINING +#endif + +#undef CONFIG_X86_PPRO_FENCE + +#ifdef CONFIG_X86_64 + +/* + * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel + * configuration + */ +#undef CONFIG_64BIT +#undef CONFIG_X86_64 +#undef CONFIG_ILLEGAL_POINTER_VALUE +#undef CONFIG_SPARSEMEM_VMEMMAP +#undef CONFIG_NR_CPUS + +#define CONFIG_X86_32 1 +#define CONFIG_PAGE_OFFSET 0 +#define CONFIG_ILLEGAL_POINTER_VALUE 0 +#define CONFIG_NR_CPUS 1 + +#define BUILD_VDSO32_64 + +#endif + +#include "../vclock_gettime.c" diff --git a/arch/x86/entry/vdso/vdso32/vdso-fakesections.c b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c new file mode 100644 index 000000000000..541468e25265 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vdso-fakesections.c @@ -0,0 +1 @@ +#include "../vdso-fakesections.c" diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S new file mode 100644 index 000000000000..31056cf294bf --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S @@ -0,0 +1,37 @@ +/* + * Linker script for 32-bit vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#include + +#define BUILD_VDSO32 + +#include "../vdso-layout.lds.S" + +/* The ELF entry point can be used to set the AT_SYSINFO value. */ +ENTRY(__kernel_vsyscall); + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION +{ + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_time; + }; + + LINUX_2.5 { + global: + __kernel_vsyscall; + __kernel_sigreturn; + __kernel_rt_sigreturn; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S new file mode 100644 index 000000000000..697c11ece90c --- /dev/null +++ b/arch/x86/entry/vdso/vdsox32.lds.S @@ -0,0 +1,25 @@ +/* + * Linker script for x32 vDSO. + * We #include the file to define the layout details. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. + */ + +#define BUILD_VDSOX32 + +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + __vdso_clock_gettime; + __vdso_gettimeofday; + __vdso_getcpu; + __vdso_time; + local: *; + }; +} diff --git a/arch/x86/entry/vdso/vgetcpu.c b/arch/x86/entry/vdso/vgetcpu.c new file mode 100644 index 000000000000..8ec3d1f4ce9a --- /dev/null +++ b/arch/x86/entry/vdso/vgetcpu.c @@ -0,0 +1,28 @@ +/* + * Copyright 2006 Andi Kleen, SUSE Labs. + * Subject to the GNU Public License, v.2 + * + * Fast user context implementation of getcpu() + */ + +#include +#include +#include +#include + +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + unsigned int p; + + p = __getcpu(); + + if (cpu) + *cpu = p & VGETCPU_CPU_MASK; + if (node) + *node = p >> 12; + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c new file mode 100644 index 000000000000..1c9f750c3859 --- /dev/null +++ b/arch/x86/entry/vdso/vma.c @@ -0,0 +1,300 @@ +/* + * Copyright 2007 Andi Kleen, SUSE Labs. + * Subject to the GPL, v.2 + * + * This contains most of the x86 vDSO kernel-side code. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_X86_64) +unsigned int __read_mostly vdso64_enabled = 1; +#endif + +void __init init_vdso_image(const struct vdso_image *image) +{ + int i; + int npages = (image->size) / PAGE_SIZE; + + BUG_ON(image->size % PAGE_SIZE != 0); + for (i = 0; i < npages; i++) + image->text_mapping.pages[i] = + virt_to_page(image->data + i*PAGE_SIZE); + + apply_alternatives((struct alt_instr *)(image->data + image->alt), + (struct alt_instr *)(image->data + image->alt + + image->alt_len)); +} + +struct linux_binprm; + +/* + * Put the vdso above the (randomized) stack with another randomized + * offset. This way there is no hole in the middle of address space. + * To save memory make sure it is still in the same PTE as the stack + * top. This doesn't give that many random bits. + * + * Note that this algorithm is imperfect: the distribution of the vdso + * start address within a PMD is biased toward the end. + * + * Only used for the 64-bit and x32 vdsos. + */ +static unsigned long vdso_addr(unsigned long start, unsigned len) +{ +#ifdef CONFIG_X86_32 + return 0; +#else + unsigned long addr, end; + unsigned offset; + + /* + * Round up the start address. It can start out unaligned as a result + * of stack start randomization. + */ + start = PAGE_ALIGN(start); + + /* Round the lowest possible end address up to a PMD boundary. */ + end = (start + len + PMD_SIZE - 1) & PMD_MASK; + if (end >= TASK_SIZE_MAX) + end = TASK_SIZE_MAX; + end -= len; + + if (end > start) { + offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); + addr = start + (offset << PAGE_SHIFT); + } else { + addr = start; + } + + /* + * Forcibly align the final address in case we have a hardware + * issue that requires alignment for performance reasons. + */ + addr = align_vdso_addr(addr); + + return addr; +#endif +} + +static int map_vdso(const struct vdso_image *image, bool calculate_addr) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long addr, text_start; + int ret = 0; + static struct page *no_pages[] = {NULL}; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .pages = no_pages, + }; + + if (calculate_addr) { + addr = vdso_addr(current->mm->start_stack, + image->size - image->sym_vvar_start); + } else { + addr = 0; + } + + down_write(&mm->mmap_sem); + + addr = get_unmapped_area(NULL, addr, + image->size - image->sym_vvar_start, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = addr; + goto up_fail; + } + + text_start = addr - image->sym_vvar_start; + current->mm->context.vdso = (void __user *)text_start; + + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + vma = _install_special_mapping(mm, + text_start, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &image->text_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + vma = _install_special_mapping(mm, + addr, + -image->sym_vvar_start, + VM_READ|VM_MAYREAD, + &vvar_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + if (image->sym_vvar_page) + ret = remap_pfn_range(vma, + text_start + image->sym_vvar_page, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + +#ifdef CONFIG_HPET_TIMER + if (hpet_address && image->sym_hpet_page) { + ret = io_remap_pfn_range(vma, + text_start + image->sym_hpet_page, + hpet_address >> PAGE_SHIFT, + PAGE_SIZE, + pgprot_noncached(PAGE_READONLY)); + + if (ret) + goto up_fail; + } +#endif + +up_fail: + if (ret) + current->mm->context.vdso = NULL; + + up_write(&mm->mmap_sem); + return ret; +} + +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +static int load_vdso32(void) +{ + int ret; + + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ + return 0; + + ret = map_vdso(selected_vdso32, false); + if (ret) + return ret; + + if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) + current_thread_info()->sysenter_return = + current->mm->context.vdso + + selected_vdso32->sym_VDSO32_SYSENTER_RETURN; + + return 0; +} +#endif + +#ifdef CONFIG_X86_64 +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_64, true); +} + +#ifdef CONFIG_COMPAT +int compat_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) +{ +#ifdef CONFIG_X86_X32_ABI + if (test_thread_flag(TIF_X32)) { + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_x32, true); + } +#endif + + return load_vdso32(); +} +#endif +#else +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + return load_vdso32(); +} +#endif + +#ifdef CONFIG_X86_64 +static __init int vdso_setup(char *s) +{ + vdso64_enabled = simple_strtoul(s, NULL, 0); + return 0; +} +__setup("vdso=", vdso_setup); +#endif + +#ifdef CONFIG_X86_64 +static void vgetcpu_cpu_init(void *arg) +{ + int cpu = smp_processor_id(); + struct desc_struct d = { }; + unsigned long node = 0; +#ifdef CONFIG_NUMA + node = cpu_to_node(cpu); +#endif + if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); + + /* + * Store cpu number in limit so that it can be loaded + * quickly in user space in vgetcpu. (12 bits for the CPU + * and 8 bits for the node) + */ + d.limit0 = cpu | ((node & 0xf) << 12); + d.limit = node >> 4; + d.type = 5; /* RO data, expand down, accessed */ + d.dpl = 3; /* Visible to user code */ + d.s = 1; /* Not a system segment */ + d.p = 1; /* Present */ + d.d = 1; /* 32-bit */ + + write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); +} + +static int +vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + + if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); + + return NOTIFY_DONE; +} + +static int __init init_vdso(void) +{ + init_vdso_image(&vdso_image_64); + +#ifdef CONFIG_X86_X32_ABI + init_vdso_image(&vdso_image_x32); +#endif + + cpu_notifier_register_begin(); + + on_each_cpu(vgetcpu_cpu_init, NULL, 1); + /* notifier priority > KVM */ + __hotcpu_notifier(vgetcpu_cpu_notifier, 30); + + cpu_notifier_register_done(); + + return 0; +} +subsys_initcall(init_vdso); +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore deleted file mode 100644 index aae8ffdd5880..000000000000 --- a/arch/x86/vdso/.gitignore +++ /dev/null @@ -1,7 +0,0 @@ -vdso.lds -vdsox32.lds -vdso32-syscall-syms.lds -vdso32-sysenter-syms.lds -vdso32-int80-syms.lds -vdso-image-*.c -vdso2c diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile deleted file mode 100644 index e97032069f88..000000000000 --- a/arch/x86/vdso/Makefile +++ /dev/null @@ -1,209 +0,0 @@ -# -# Building vDSO images for x86. -# - -KBUILD_CFLAGS += $(DISABLE_LTO) -KASAN_SANITIZE := n - -VDSO64-$(CONFIG_X86_64) := y -VDSOX32-$(CONFIG_X86_X32_ABI) := y -VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_COMPAT) := y - -# files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o - -# files to link into kernel -obj-y += vma.o - -# vDSO images to build -vdso_img-$(VDSO64-y) += 64 -vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_COMPAT) += 32-syscall -vdso_img-$(VDSO32-y) += 32-sysenter - -obj-$(VDSO32-y) += vdso32-setup.o - -vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) - -$(obj)/vdso.o: $(obj)/vdso.so - -targets += vdso.lds $(vobjs-y) - -# Build the vDSO image C files and link them in. -vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) -vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) -vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) -obj-y += $(vdso_img_objs) -targets += $(vdso_img_cfiles) -targets += $(vdso_img_sodbg) -.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) \ - $(vdso_img-y:%=$(obj)/vdso%.so) - -export CPPFLAGS_vdso.lds += -P -C - -VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ - -Wl,--no-undefined \ - -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ - $(DISABLE_LTO) - -$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE - $(call if_changed,vdso) - -HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi -I$(srctree)/arch/x86/include/uapi -hostprogs-y += vdso2c - -quiet_cmd_vdso2c = VDSO2C $@ -define cmd_vdso2c - $(obj)/vdso2c $< $(<:%.dbg=%) $@ -endef - -$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE - $(call if_changed,vdso2c) - -# -# Don't omit frame pointers for ease of userspace debugging, but do -# optimize sibling calls. -# -CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ - $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ - -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING - -$(vobjs): KBUILD_CFLAGS += $(CFL) - -# -# vDSO code runs in userspace and -pg doesn't help with profiling anyway. -# -CFLAGS_REMOVE_vdso-note.o = -pg -CFLAGS_REMOVE_vclock_gettime.o = -pg -CFLAGS_REMOVE_vgetcpu.o = -pg -CFLAGS_REMOVE_vvar.o = -pg - -# -# X32 processes use x32 vDSO to access 64bit kernel data. -# -# Build x32 vDSO image: -# 1. Compile x32 vDSO as 64bit. -# 2. Convert object files to x32. -# 3. Build x32 VDSO image with x32 objects, which contains 64bit codes -# so that it can reach 64bit address space with 64bit pointers. -# - -CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ - -Wl,-soname=linux-vdso.so.1 \ - -Wl,-z,max-page-size=4096 \ - -Wl,-z,common-page-size=4096 - -# 64-bit objects to re-brand as x32 -vobjs64-for-x32 := $(filter-out $(vobjs-nox32),$(vobjs-y)) - -# x32-rebranded versions -vobjx32s-y := $(vobjs64-for-x32:.o=-x32.o) - -# same thing, but in the output directory -vobjx32s := $(foreach F,$(vobjx32s-y),$(obj)/$F) - -# Convert 64bit object file to x32 for x32 vDSO. -quiet_cmd_x32 = X32 $@ - cmd_x32 = $(OBJCOPY) -O elf32-x86-64 $< $@ - -$(obj)/%-x32.o: $(obj)/%.o FORCE - $(call if_changed,x32) - -targets += vdsox32.lds $(vobjx32s-y) - -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg - $(call if_changed,objcopy) - -$(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE - $(call if_changed,vdso) - -# -# Build multiple 32-bit vDSO images to choose from at boot time. -# -vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_COMPAT) += syscall -vdso32.so-$(VDSO32-y) += sysenter - -vdso32-images = $(vdso32.so-y:%=vdso32-%.so) - -CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) -VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 - -# This makes sure the $(obj) subdirectory exists even though vdso32/ -# is not a kbuild sub-make subdirectory. -override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ - -targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) -targets += vdso32/vclock_gettime.o - -$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) - -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 - -KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) -KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32)) -KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic -KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) -KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) -KBUILD_CFLAGS_32 += -fno-omit-frame-pointer -KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) - -$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/%.o - $(call if_changed,vdso) - -# -# The DSO images are built using a special linker script. -# -quiet_cmd_vdso = VDSO $@ - cmd_vdso = $(CC) -nostdlib -o $@ \ - $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ - -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ - sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' - -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ - $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) -GCOV_PROFILE := n - -# -# Install the unstripped copies of vdso*.so. If our toolchain supports -# build-id, install .build-id links as well. -# -quiet_cmd_vdso_install = INSTALL $(@:install_%=%) -define cmd_vdso_install - cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ - if readelf -n $< |grep -q 'Build ID'; then \ - buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ - first=`echo $$buildid | cut -b-2`; \ - last=`echo $$buildid | cut -b3-`; \ - mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ - ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ - fi -endef - -vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) - -$(MODLIB)/vdso: FORCE - @mkdir -p $(MODLIB)/vdso - -$(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE - $(call cmd,vdso_install) - -PHONY += vdso_install $(vdso_img_insttargets) -vdso_install: $(vdso_img_insttargets) FORCE - -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/vdso/checkundef.sh b/arch/x86/vdso/checkundef.sh deleted file mode 100755 index 7ee90a9b549d..000000000000 --- a/arch/x86/vdso/checkundef.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -nm="$1" -file="$2" -$nm "$file" | grep '^ *U' > /dev/null 2>&1 -if [ $? -eq 1 ]; then - exit 0 -else - echo "$file: undefined symbols found" >&2 - exit 1 -fi diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c deleted file mode 100644 index 9793322751e0..000000000000 --- a/arch/x86/vdso/vclock_gettime.c +++ /dev/null @@ -1,351 +0,0 @@ -/* - * Copyright 2006 Andi Kleen, SUSE Labs. - * Subject to the GNU Public License, v.2 - * - * Fast user context implementation of clock_gettime, gettimeofday, and time. - * - * 32 Bit compat layer by Stefani Seibold - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * The code should have no internal unresolved relocations. - * Check with readelf after changing. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#define gtod (&VVAR(vsyscall_gtod_data)) - -extern int __vdso_clock_gettime(clockid_t clock, struct timespec *ts); -extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); -extern time_t __vdso_time(time_t *t); - -#ifdef CONFIG_HPET_TIMER -extern u8 hpet_page - __attribute__((visibility("hidden"))); - -static notrace cycle_t vread_hpet(void) -{ - return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); -} -#endif - -#ifndef BUILD_VDSO32 - -#include -#include -#include -#include - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) -{ - long ret; - asm("syscall" : "=a" (ret) : - "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; -} - -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) -{ - const struct pvclock_vsyscall_time_info *pvti_base; - int idx = cpu / (PAGE_SIZE/PVTI_SIZE); - int offset = cpu % (PAGE_SIZE/PVTI_SIZE); - - BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); - - pvti_base = (struct pvclock_vsyscall_time_info *) - __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); - - return &pvti_base[offset]; -} - -static notrace cycle_t vread_pvclock(int *mode) -{ - const struct pvclock_vsyscall_time_info *pvti; - cycle_t ret; - u64 last; - u32 version; - u8 flags; - unsigned cpu, cpu1; - - - /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. - * - */ - do { - cpu = __getcpu() & VGETCPU_CPU_MASK; - /* TODO: We can put vcpu id into higher bits of pvti.version. - * This will save a couple of cycles by getting rid of - * __getcpu() calls (Gleb). - */ - - pvti = get_pvti(cpu); - - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); - - /* - * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. - */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); - - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) - *mode = VCLOCK_NONE; - - /* refer to tsc.c read_tsc() comment for rationale */ - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - return last; -} -#endif - -#else - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) - : "memory", "edx"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) - : "memory", "edx"); - return ret; -} - -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace cycle_t vread_pvclock(int *mode) -{ - *mode = VCLOCK_NONE; - return 0; -} -#endif - -#endif - -notrace static cycle_t vread_tsc(void) -{ - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)__native_read_tsc(); - - last = gtod->cycle_last; - - if (likely(ret >= last)) - return ret; - - /* - * GCC likes to generate cmov here, but this branch is extremely - * predictable (it's just a funciton of time and the likely is - * very likely) and there's a data dependence, so force GCC - * to generate a branch instead. I don't barrier() because - * we don't actually need a barrier, and if this function - * ever gets inlined it will generate worse code. - */ - asm volatile (""); - return last; -} - -notrace static inline u64 vgetsns(int *mode) -{ - u64 v; - cycles_t cycles; - - if (gtod->vclock_mode == VCLOCK_TSC) - cycles = vread_tsc(); -#ifdef CONFIG_HPET_TIMER - else if (gtod->vclock_mode == VCLOCK_HPET) - cycles = vread_hpet(); -#endif -#ifdef CONFIG_PARAVIRT_CLOCK - else if (gtod->vclock_mode == VCLOCK_PVCLOCK) - cycles = vread_pvclock(mode); -#endif - else - return 0; - v = (cycles - gtod->cycle_last) & gtod->mask; - return v * gtod->mult; -} - -/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ -notrace static int __always_inline do_realtime(struct timespec *ts) -{ - unsigned long seq; - u64 ns; - int mode; - - do { - seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->wall_time_sec; - ns = gtod->wall_time_snsec; - ns += vgetsns(&mode); - ns >>= gtod->shift; - } while (unlikely(gtod_read_retry(gtod, seq))); - - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return mode; -} - -notrace static int __always_inline do_monotonic(struct timespec *ts) -{ - unsigned long seq; - u64 ns; - int mode; - - do { - seq = gtod_read_begin(gtod); - mode = gtod->vclock_mode; - ts->tv_sec = gtod->monotonic_time_sec; - ns = gtod->monotonic_time_snsec; - ns += vgetsns(&mode); - ns >>= gtod->shift; - } while (unlikely(gtod_read_retry(gtod, seq))); - - ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns); - ts->tv_nsec = ns; - - return mode; -} - -notrace static void do_realtime_coarse(struct timespec *ts) -{ - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->wall_time_coarse_sec; - ts->tv_nsec = gtod->wall_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} - -notrace static void do_monotonic_coarse(struct timespec *ts) -{ - unsigned long seq; - do { - seq = gtod_read_begin(gtod); - ts->tv_sec = gtod->monotonic_time_coarse_sec; - ts->tv_nsec = gtod->monotonic_time_coarse_nsec; - } while (unlikely(gtod_read_retry(gtod, seq))); -} - -notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) -{ - switch (clock) { - case CLOCK_REALTIME: - if (do_realtime(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_MONOTONIC: - if (do_monotonic(ts) == VCLOCK_NONE) - goto fallback; - break; - case CLOCK_REALTIME_COARSE: - do_realtime_coarse(ts); - break; - case CLOCK_MONOTONIC_COARSE: - do_monotonic_coarse(ts); - break; - default: - goto fallback; - } - - return 0; -fallback: - return vdso_fallback_gettime(clock, ts); -} -int clock_gettime(clockid_t, struct timespec *) - __attribute__((weak, alias("__vdso_clock_gettime"))); - -notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) -{ - if (likely(tv != NULL)) { - if (unlikely(do_realtime((struct timespec *)tv) == VCLOCK_NONE)) - return vdso_fallback_gtod(tv, tz); - tv->tv_usec /= 1000; - } - if (unlikely(tz != NULL)) { - tz->tz_minuteswest = gtod->tz_minuteswest; - tz->tz_dsttime = gtod->tz_dsttime; - } - - return 0; -} -int gettimeofday(struct timeval *, struct timezone *) - __attribute__((weak, alias("__vdso_gettimeofday"))); - -/* - * This will break when the xtime seconds get inaccurate, but that is - * unlikely - */ -notrace time_t __vdso_time(time_t *t) -{ - /* This is atomic on x86 so we don't need any locks. */ - time_t result = ACCESS_ONCE(gtod->wall_time_sec); - - if (t) - *t = result; - return result; -} -int time(time_t *t) - __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S deleted file mode 100644 index de2c921025f5..000000000000 --- a/arch/x86/vdso/vdso-layout.lds.S +++ /dev/null @@ -1,118 +0,0 @@ -#include - -/* - * Linker script for vDSO. This is an ELF shared object prelinked to - * its virtual address, and with only one read-only segment. - * This script controls its layout. - */ - -#if defined(BUILD_VDSO64) -# define SHDR_SIZE 64 -#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) -# define SHDR_SIZE 40 -#else -# error unknown VDSO target -#endif - -#define NUM_FAKE_SHDRS 13 - -SECTIONS -{ - /* - * User/kernel shared data is before the vDSO. This may be a little - * uglier than putting it after the vDSO, but it avoids issues with - * non-allocatable things that dangle past the end of the PT_LOAD - * segment. - */ - - vvar_start = . - 2 * PAGE_SIZE; - vvar_page = vvar_start; - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; -#define __VVAR_KERNEL_LDS -#include -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR - - hpet_page = vvar_start + PAGE_SIZE; - - . = SIZEOF_HEADERS; - - .hash : { *(.hash) } :text - .gnu.hash : { *(.gnu.hash) } - .dynsym : { *(.dynsym) } - .dynstr : { *(.dynstr) } - .gnu.version : { *(.gnu.version) } - .gnu.version_d : { *(.gnu.version_d) } - .gnu.version_r : { *(.gnu.version_r) } - - .dynamic : { *(.dynamic) } :text :dynamic - - .rodata : { - *(.rodata*) - *(.data*) - *(.sdata*) - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.bss*) - *(.dynbss*) - *(.gnu.linkonce.b.*) - - /* - * Ideally this would live in a C file, but that won't - * work cleanly for x32 until we start building the x32 - * C code using an x32 toolchain. - */ - VDSO_FAKE_SECTION_TABLE_START = .; - . = . + NUM_FAKE_SHDRS * SHDR_SIZE; - VDSO_FAKE_SECTION_TABLE_END = .; - } :text - - .fake_shstrtab : { *(.fake_shstrtab) } :text - - - .note : { *(.note.*) } :text :note - - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - - - /* - * Text is well-separated from actual data: there's plenty of - * stuff that isn't used at runtime in between. - */ - - .text : { *(.text*) } :text =0x90909090, - - /* - * At the end so that eu-elflint stays happy when vdso2c strips - * these. A better implementation would avoid allocating space - * for these. - */ - .altinstructions : { *(.altinstructions) } :text - .altinstr_replacement : { *(.altinstr_replacement) } :text - - /DISCARD/ : { - *(.discard) - *(.discard.*) - *(__bug_table) - } -} - -/* - * Very old versions of ld do not recognize this name token; use the constant. - */ -#define PT_GNU_EH_FRAME 0x6474e550 - -/* - * We must supply the ELF program headers explicitly to get just one - * PT_LOAD segment, and set the flags explicitly to make segments read-only. - */ -PHDRS -{ - text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ - dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ - note PT_NOTE FLAGS(4); /* PF_R */ - eh_frame_hdr PT_GNU_EH_FRAME; -} diff --git a/arch/x86/vdso/vdso-note.S b/arch/x86/vdso/vdso-note.S deleted file mode 100644 index 79a071e4357e..000000000000 --- a/arch/x86/vdso/vdso-note.S +++ /dev/null @@ -1,12 +0,0 @@ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include -#include -#include - -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S deleted file mode 100644 index 6807932643c2..000000000000 --- a/arch/x86/vdso/vdso.lds.S +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Linker script for 64-bit vDSO. - * We #include the file to define the layout details. - * - * This file defines the version script giving the user-exported symbols in - * the DSO. - */ - -#define BUILD_VDSO64 - -#include "vdso-layout.lds.S" - -/* - * This controls what userland symbols we export from the vDSO. - */ -VERSION { - LINUX_2.6 { - global: - clock_gettime; - __vdso_clock_gettime; - gettimeofday; - __vdso_gettimeofday; - getcpu; - __vdso_getcpu; - time; - __vdso_time; - local: *; - }; -} diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c deleted file mode 100644 index 8627db24a7f6..000000000000 --- a/arch/x86/vdso/vdso2c.c +++ /dev/null @@ -1,253 +0,0 @@ -/* - * vdso2c - A vdso image preparation tool - * Copyright (c) 2014 Andy Lutomirski and others - * Licensed under the GPL v2 - * - * vdso2c requires stripped and unstripped input. It would be trivial - * to fully strip the input in here, but, for reasons described below, - * we need to write a section table. Doing this is more or less - * equivalent to dropping all non-allocatable sections, but it's - * easier to let objcopy handle that instead of doing it ourselves. - * If we ever need to do something fancier than what objcopy provides, - * it would be straightforward to add here. - * - * We're keep a section table for a few reasons: - * - * The Go runtime had a couple of bugs: it would read the section - * table to try to figure out how many dynamic symbols there were (it - * shouldn't have looked at the section table at all) and, if there - * were no SHT_SYNDYM section table entry, it would use an - * uninitialized value for the number of symbols. An empty DYNSYM - * table would work, but I see no reason not to write a valid one (and - * keep full performance for old Go programs). This hack is only - * needed on x86_64. - * - * The bug was introduced on 2012-08-31 by: - * https://code.google.com/p/go/source/detail?r=56ea40aac72b - * and was fixed on 2014-06-13 by: - * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 - * - * Binutils has issues debugging the vDSO: it reads the section table to - * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which - * would break build-id if we removed the section table. Binutils - * also requires that shstrndx != 0. See: - * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 - * - * elfutils might not look for PT_NOTE if there is a section table at - * all. I don't know whether this matters for any practical purpose. - * - * For simplicity, rather than hacking up a partial section table, we - * just write a mostly complete one. We omit non-dynamic symbols, - * though, since they're rather large. - * - * Once binutils gets fixed, we might be able to drop this for all but - * the 64-bit vdso, since build-id only works in kernel RPMs, and - * systems that update to new enough kernel RPMs will likely update - * binutils in sync. build-id has never worked for home-built kernel - * RPMs without manual symlinking, and I suspect that no one ever does - * that. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include -#include - -const char *outfilename; - -/* Symbols that we need in vdso2c. */ -enum { - sym_vvar_start, - sym_vvar_page, - sym_hpet_page, - sym_VDSO_FAKE_SECTION_TABLE_START, - sym_VDSO_FAKE_SECTION_TABLE_END, -}; - -const int special_pages[] = { - sym_vvar_page, - sym_hpet_page, -}; - -struct vdso_sym { - const char *name; - bool export; -}; - -struct vdso_sym required_syms[] = { - [sym_vvar_start] = {"vvar_start", true}, - [sym_vvar_page] = {"vvar_page", true}, - [sym_hpet_page] = {"hpet_page", true}, - [sym_VDSO_FAKE_SECTION_TABLE_START] = { - "VDSO_FAKE_SECTION_TABLE_START", false - }, - [sym_VDSO_FAKE_SECTION_TABLE_END] = { - "VDSO_FAKE_SECTION_TABLE_END", false - }, - {"VDSO32_NOTE_MASK", true}, - {"VDSO32_SYSENTER_RETURN", true}, - {"__kernel_vsyscall", true}, - {"__kernel_sigreturn", true}, - {"__kernel_rt_sigreturn", true}, -}; - -__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) -static void fail(const char *format, ...) -{ - va_list ap; - va_start(ap, format); - fprintf(stderr, "Error: "); - vfprintf(stderr, format, ap); - if (outfilename) - unlink(outfilename); - exit(1); - va_end(ap); -} - -/* - * Evil macros for little-endian reads and writes - */ -#define GLE(x, bits, ifnot) \ - __builtin_choose_expr( \ - (sizeof(*(x)) == bits/8), \ - (__typeof__(*(x)))get_unaligned_le##bits(x), ifnot) - -extern void bad_get_le(void); -#define LAST_GLE(x) \ - __builtin_choose_expr(sizeof(*(x)) == 1, *(x), bad_get_le()) - -#define GET_LE(x) \ - GLE(x, 64, GLE(x, 32, GLE(x, 16, LAST_GLE(x)))) - -#define PLE(x, val, bits, ifnot) \ - __builtin_choose_expr( \ - (sizeof(*(x)) == bits/8), \ - put_unaligned_le##bits((val), (x)), ifnot) - -extern void bad_put_le(void); -#define LAST_PLE(x, val) \ - __builtin_choose_expr(sizeof(*(x)) == 1, *(x) = (val), bad_put_le()) - -#define PUT_LE(x, val) \ - PLE(x, val, 64, PLE(x, val, 32, PLE(x, val, 16, LAST_PLE(x, val)))) - - -#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) - -#define BITSFUNC3(name, bits, suffix) name##bits##suffix -#define BITSFUNC2(name, bits, suffix) BITSFUNC3(name, bits, suffix) -#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS, ) - -#define INT_BITS BITSFUNC2(int, ELF_BITS, _t) - -#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x -#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) -#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) - -#define ELF_BITS 64 -#include "vdso2c.h" -#undef ELF_BITS - -#define ELF_BITS 32 -#include "vdso2c.h" -#undef ELF_BITS - -static void go(void *raw_addr, size_t raw_len, - void *stripped_addr, size_t stripped_len, - FILE *outfile, const char *name) -{ - Elf64_Ehdr *hdr = (Elf64_Ehdr *)raw_addr; - - if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { - go64(raw_addr, raw_len, stripped_addr, stripped_len, - outfile, name); - } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { - go32(raw_addr, raw_len, stripped_addr, stripped_len, - outfile, name); - } else { - fail("unknown ELF class\n"); - } -} - -static void map_input(const char *name, void **addr, size_t *len, int prot) -{ - off_t tmp_len; - - int fd = open(name, O_RDONLY); - if (fd == -1) - err(1, "%s", name); - - tmp_len = lseek(fd, 0, SEEK_END); - if (tmp_len == (off_t)-1) - err(1, "lseek"); - *len = (size_t)tmp_len; - - *addr = mmap(NULL, tmp_len, prot, MAP_PRIVATE, fd, 0); - if (*addr == MAP_FAILED) - err(1, "mmap"); - - close(fd); -} - -int main(int argc, char **argv) -{ - size_t raw_len, stripped_len; - void *raw_addr, *stripped_addr; - FILE *outfile; - char *name, *tmp; - int namelen; - - if (argc != 4) { - printf("Usage: vdso2c RAW_INPUT STRIPPED_INPUT OUTPUT\n"); - return 1; - } - - /* - * Figure out the struct name. If we're writing to a .so file, - * generate raw output insted. - */ - name = strdup(argv[3]); - namelen = strlen(name); - if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { - name = NULL; - } else { - tmp = strrchr(name, '/'); - if (tmp) - name = tmp + 1; - tmp = strchr(name, '.'); - if (tmp) - *tmp = '\0'; - for (tmp = name; *tmp; tmp++) - if (*tmp == '-') - *tmp = '_'; - } - - map_input(argv[1], &raw_addr, &raw_len, PROT_READ); - map_input(argv[2], &stripped_addr, &stripped_len, PROT_READ); - - outfilename = argv[3]; - outfile = fopen(outfilename, "w"); - if (!outfile) - err(1, "%s", argv[2]); - - go(raw_addr, raw_len, stripped_addr, stripped_len, outfile, name); - - munmap(raw_addr, raw_len); - munmap(stripped_addr, stripped_len); - fclose(outfile); - - return 0; -} diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h deleted file mode 100644 index 0224987556ce..000000000000 --- a/arch/x86/vdso/vdso2c.h +++ /dev/null @@ -1,175 +0,0 @@ -/* - * This file is included twice from vdso2c.c. It generates code for 32-bit - * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs - * are built for 32-bit userspace. - */ - -static void BITSFUNC(go)(void *raw_addr, size_t raw_len, - void *stripped_addr, size_t stripped_len, - FILE *outfile, const char *name) -{ - int found_load = 0; - unsigned long load_size = -1; /* Work around bogus warning */ - unsigned long mapping_size; - ELF(Ehdr) *hdr = (ELF(Ehdr) *)raw_addr; - int i; - unsigned long j; - ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, - *alt_sec = NULL; - ELF(Dyn) *dyn = 0, *dyn_end = 0; - const char *secstrings; - INT_BITS syms[NSYMS] = {}; - - ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); - - /* Walk the segment table. */ - for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { - if (GET_LE(&pt[i].p_type) == PT_LOAD) { - if (found_load) - fail("multiple PT_LOAD segs\n"); - - if (GET_LE(&pt[i].p_offset) != 0 || - GET_LE(&pt[i].p_vaddr) != 0) - fail("PT_LOAD in wrong place\n"); - - if (GET_LE(&pt[i].p_memsz) != GET_LE(&pt[i].p_filesz)) - fail("cannot handle memsz != filesz\n"); - - load_size = GET_LE(&pt[i].p_memsz); - found_load = 1; - } else if (GET_LE(&pt[i].p_type) == PT_DYNAMIC) { - dyn = raw_addr + GET_LE(&pt[i].p_offset); - dyn_end = raw_addr + GET_LE(&pt[i].p_offset) + - GET_LE(&pt[i].p_memsz); - } - } - if (!found_load) - fail("no PT_LOAD seg\n"); - - if (stripped_len < load_size) - fail("stripped input is too short\n"); - - /* Walk the dynamic table */ - for (i = 0; dyn + i < dyn_end && - GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { - typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); - if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || - tag == DT_RELENT || tag == DT_TEXTREL) - fail("vdso image contains dynamic relocations\n"); - } - - /* Walk the section table */ - secstrings_hdr = raw_addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); - secstrings = raw_addr + GET_LE(&secstrings_hdr->sh_offset); - for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - ELF(Shdr) *sh = raw_addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * i; - if (GET_LE(&sh->sh_type) == SHT_SYMTAB) - symtab_hdr = sh; - - if (!strcmp(secstrings + GET_LE(&sh->sh_name), - ".altinstructions")) - alt_sec = sh; - } - - if (!symtab_hdr) - fail("no symbol table\n"); - - strtab_hdr = raw_addr + GET_LE(&hdr->e_shoff) + - GET_LE(&hdr->e_shentsize) * GET_LE(&symtab_hdr->sh_link); - - /* Walk the symbol table */ - for (i = 0; - i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); - i++) { - int k; - ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + - GET_LE(&symtab_hdr->sh_entsize) * i; - const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + - GET_LE(&sym->st_name); - - for (k = 0; k < NSYMS; k++) { - if (!strcmp(name, required_syms[k].name)) { - if (syms[k]) { - fail("duplicate symbol %s\n", - required_syms[k].name); - } - - /* - * Careful: we use negative addresses, but - * st_value is unsigned, so we rely - * on syms[k] being a signed type of the - * correct width. - */ - syms[k] = GET_LE(&sym->st_value); - } - } - } - - /* Validate mapping addresses. */ - for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { - INT_BITS symval = syms[special_pages[i]]; - - if (!symval) - continue; /* The mapping isn't used; ignore it. */ - - if (symval % 4096) - fail("%s must be a multiple of 4096\n", - required_syms[i].name); - if (symval + 4096 < syms[sym_vvar_start]) - fail("%s underruns vvar_start\n", - required_syms[i].name); - if (symval + 4096 > 0) - fail("%s is on the wrong side of the vdso text\n", - required_syms[i].name); - } - if (syms[sym_vvar_start] % 4096) - fail("vvar_begin must be a multiple of 4096\n"); - - if (!name) { - fwrite(stripped_addr, stripped_len, 1, outfile); - return; - } - - mapping_size = (stripped_len + 4095) / 4096 * 4096; - - fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); - fprintf(outfile, "#include \n"); - fprintf(outfile, "#include \n"); - fprintf(outfile, "#include \n"); - fprintf(outfile, "\n"); - fprintf(outfile, - "static unsigned char raw_data[%lu] __page_aligned_data = {", - mapping_size); - for (j = 0; j < stripped_len; j++) { - if (j % 10 == 0) - fprintf(outfile, "\n\t"); - fprintf(outfile, "0x%02X, ", - (int)((unsigned char *)stripped_addr)[j]); - } - fprintf(outfile, "\n};\n\n"); - - fprintf(outfile, "static struct page *pages[%lu];\n\n", - mapping_size / 4096); - - fprintf(outfile, "const struct vdso_image %s = {\n", name); - fprintf(outfile, "\t.data = raw_data,\n"); - fprintf(outfile, "\t.size = %lu,\n", mapping_size); - fprintf(outfile, "\t.text_mapping = {\n"); - fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); - fprintf(outfile, "\t\t.pages = pages,\n"); - fprintf(outfile, "\t},\n"); - if (alt_sec) { - fprintf(outfile, "\t.alt = %lu,\n", - (unsigned long)GET_LE(&alt_sec->sh_offset)); - fprintf(outfile, "\t.alt_len = %lu,\n", - (unsigned long)GET_LE(&alt_sec->sh_size)); - } - for (i = 0; i < NSYMS; i++) { - if (required_syms[i].export && syms[i]) - fprintf(outfile, "\t.sym_%s = %" PRIi64 ",\n", - required_syms[i].name, (int64_t)syms[i]); - } - fprintf(outfile, "};\n"); -} diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c deleted file mode 100644 index e904c270573b..000000000000 --- a/arch/x86/vdso/vdso32-setup.c +++ /dev/null @@ -1,120 +0,0 @@ -/* - * (C) Copyright 2002 Linus Torvalds - * Portions based on the vdso-randomization code from exec-shield: - * Copyright(C) 2005-2006, Red Hat, Inc., Ingo Molnar - * - * This file contains the needed initializations to support sysenter. - */ - -#include -#include -#include -#include - -#include -#include -#include - -#ifdef CONFIG_COMPAT_VDSO -#define VDSO_DEFAULT 0 -#else -#define VDSO_DEFAULT 1 -#endif - -/* - * Should the kernel map a VDSO page into processes and pass its - * address down to glibc upon exec()? - */ -unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; - -static int __init vdso32_setup(char *s) -{ - vdso32_enabled = simple_strtoul(s, NULL, 0); - - if (vdso32_enabled > 1) - pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); - - return 1; -} - -/* - * For consistency, the argument vdso32=[012] affects the 32-bit vDSO - * behavior on both 64-bit and 32-bit kernels. - * On 32-bit kernels, vdso=[012] means the same thing. - */ -__setup("vdso32=", vdso32_setup); - -#ifdef CONFIG_X86_32 -__setup_param("vdso=", vdso_setup, vdso32_setup, 0); -#endif - -#ifdef CONFIG_X86_64 - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) -#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) - -#else /* CONFIG_X86_32 */ - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) -#define vdso32_syscall() (0) - -#endif /* CONFIG_X86_64 */ - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -const struct vdso_image *selected_vdso32; -#endif - -int __init sysenter_setup(void) -{ -#ifdef CONFIG_COMPAT - if (vdso32_syscall()) - selected_vdso32 = &vdso_image_32_syscall; - else -#endif - if (vdso32_sysenter()) - selected_vdso32 = &vdso_image_32_sysenter; - else - selected_vdso32 = &vdso_image_32_int80; - - init_vdso_image(selected_vdso32); - - return 0; -} - -#ifdef CONFIG_X86_64 - -subsys_initcall(sysenter_setup); - -#ifdef CONFIG_SYSCTL -/* Register vsyscall32 into the ABI table */ -#include - -static struct ctl_table abi_table2[] = { - { - .procname = "vsyscall32", - .data = &vdso32_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - {} -}; - -static struct ctl_table abi_root_table2[] = { - { - .procname = "abi", - .mode = 0555, - .child = abi_table2 - }, - {} -}; - -static __init int ia32_binfmt_init(void) -{ - register_sysctl_table(abi_root_table2); - return 0; -} -__initcall(ia32_binfmt_init); -#endif /* CONFIG_SYSCTL */ - -#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/vdso/vdso32/.gitignore b/arch/x86/vdso/vdso32/.gitignore deleted file mode 100644 index e45fba9d0ced..000000000000 --- a/arch/x86/vdso/vdso32/.gitignore +++ /dev/null @@ -1 +0,0 @@ -vdso32.lds diff --git a/arch/x86/vdso/vdso32/int80.S b/arch/x86/vdso/vdso32/int80.S deleted file mode 100644 index b15b7c01aedb..000000000000 --- a/arch/x86/vdso/vdso32/int80.S +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Code for the vDSO. This version uses the old int $0x80 method. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 - .previous diff --git a/arch/x86/vdso/vdso32/note.S b/arch/x86/vdso/vdso32/note.S deleted file mode 100644 index c83f25734696..000000000000 --- a/arch/x86/vdso/vdso32/note.S +++ /dev/null @@ -1,44 +0,0 @@ -/* - * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. - * Here we can supply some information useful to userland. - */ - -#include -#include - -/* Ideally this would use UTS_NAME, but using a quoted string here - doesn't work. Remember to change this when changing the - kernel's name. */ -ELFNOTE_START(Linux, 0, "a") - .long LINUX_VERSION_CODE -ELFNOTE_END - -#ifdef CONFIG_XEN -/* - * Add a special note telling glibc's dynamic linker a fake hardware - * flavor that it will use to choose the search path for libraries in the - * same way it uses real hardware capabilities like "mmx". - * We supply "nosegneg" as the fake capability, to indicate that we - * do not like negative offsets in instructions using segment overrides, - * since we implement those inefficiently. This makes it possible to - * install libraries optimized to avoid those access patterns in someplace - * like /lib/i686/tls/nosegneg. Note that an /etc/ld.so.conf.d/file - * corresponding to the bits here is needed to make ldconfig work right. - * It should contain: - * hwcap 1 nosegneg - * to match the mapping of bit to name that we give here. - * - * At runtime, the fake hardware feature will be considered to be present - * if its bit is set in the mask word. So, we start with the mask 0, and - * at boot time we set VDSO_NOTE_NONEGSEG_BIT if running under Xen. - */ - -#include "../../xen/vdso.h" /* Defines VDSO_NOTE_NONEGSEG_BIT. */ - -ELFNOTE_START(GNU, 2, "a") - .long 1 /* ncaps */ -VDSO32_NOTE_MASK: /* Symbol used by arch/x86/xen/setup.c */ - .long 0 /* mask */ - .byte VDSO_NOTE_NONEGSEG_BIT; .asciz "nosegneg" /* bit, name */ -ELFNOTE_END -#endif diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/vdso/vdso32/sigreturn.S deleted file mode 100644 index d7ec4e251c0a..000000000000 --- a/arch/x86/vdso/vdso32/sigreturn.S +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Common code for the sigreturn entry points in vDSO images. - * So far this code is the same for both int80 and sysenter versions. - * This file is #include'd by int80.S et al to define them first thing. - * The kernel assumes that the addresses of these routines are constant - * for all vDSO implementations. - */ - -#include -#include -#include - -#ifndef SYSCALL_ENTER_KERNEL -#define SYSCALL_ENTER_KERNEL int $0x80 -#endif - - .text - .globl __kernel_sigreturn - .type __kernel_sigreturn,@function - nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */ - ALIGN -__kernel_sigreturn: -.LSTART_sigreturn: - popl %eax /* XXX does this mean it needs unwind info? */ - movl $__NR_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_sigreturn: - nop - .size __kernel_sigreturn,.-.LSTART_sigreturn - - .globl __kernel_rt_sigreturn - .type __kernel_rt_sigreturn,@function - ALIGN -__kernel_rt_sigreturn: -.LSTART_rt_sigreturn: - movl $__NR_rt_sigreturn, %eax - SYSCALL_ENTER_KERNEL -.LEND_rt_sigreturn: - nop - .size __kernel_rt_sigreturn,.-.LSTART_rt_sigreturn - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI1: - .long .LENDCIEDLSI1-.LSTARTCIEDLSI1 -.LSTARTCIEDLSI1: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zRS" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0 /* DW_CFA_nop */ - .align 4 -.LENDCIEDLSI1: - .long .LENDFDEDLSI1-.LSTARTFDEDLSI1 /* Length FDE */ -.LSTARTFDEDLSI1: - .long .LSTARTFDEDLSI1-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: The dwarf2 unwind routines will subtract 1 from the - return address to get an address in the middle of the - presumed call instruction. Since we didn't get here via - a call, we need to include the nop before the real start - to make up for it. */ - .long .LSTART_sigreturn-1-. /* PC-relative start address */ - .long .LEND_sigreturn-.LSTART_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - complicated by the fact that the "CFA" is always assumed to - be the value of the stack pointer in the caller. This means - that we must define the CFA of this body of code to be the - saved value of the stack pointer in the sigcontext. Which - also means that there is no fixed relation to the other - saved registers, which means that we must use DW_CFA_expression - to compute their addresses. It also means that when we - adjust the stack with the popl, we have to do it all over again. */ - -#define do_cfa_expr(offset) \ - .byte 0x0f; /* DW_CFA_def_cfa_expression */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ - .byte 0x06; /* DW_OP_deref */ \ -1: - -#define do_expr(regno, offset) \ - .byte 0x10; /* DW_CFA_expression */ \ - .uleb128 regno; /* regno */ \ - .uleb128 1f-0f; /* length */ \ -0: .byte 0x74; /* DW_OP_breg4 */ \ - .sleb128 offset; /* offset */ \ -1: - - do_cfa_expr(IA32_SIGCONTEXT_sp+4) - do_expr(0, IA32_SIGCONTEXT_ax+4) - do_expr(1, IA32_SIGCONTEXT_cx+4) - do_expr(2, IA32_SIGCONTEXT_dx+4) - do_expr(3, IA32_SIGCONTEXT_bx+4) - do_expr(5, IA32_SIGCONTEXT_bp+4) - do_expr(6, IA32_SIGCONTEXT_si+4) - do_expr(7, IA32_SIGCONTEXT_di+4) - do_expr(8, IA32_SIGCONTEXT_ip+4) - - .byte 0x42 /* DW_CFA_advance_loc 2 -- nop; popl eax. */ - - do_cfa_expr(IA32_SIGCONTEXT_sp) - do_expr(0, IA32_SIGCONTEXT_ax) - do_expr(1, IA32_SIGCONTEXT_cx) - do_expr(2, IA32_SIGCONTEXT_dx) - do_expr(3, IA32_SIGCONTEXT_bx) - do_expr(5, IA32_SIGCONTEXT_bp) - do_expr(6, IA32_SIGCONTEXT_si) - do_expr(7, IA32_SIGCONTEXT_di) - do_expr(8, IA32_SIGCONTEXT_ip) - - .align 4 -.LENDFDEDLSI1: - - .long .LENDFDEDLSI2-.LSTARTFDEDLSI2 /* Length FDE */ -.LSTARTFDEDLSI2: - .long .LSTARTFDEDLSI2-.LSTARTFRAMEDLSI1 /* CIE pointer */ - /* HACK: See above wrt unwind library assumptions. */ - .long .LSTART_rt_sigreturn-1-. /* PC-relative start address */ - .long .LEND_rt_sigreturn-.LSTART_rt_sigreturn+1 - .uleb128 0 /* Augmentation */ - /* What follows are the instructions for the table generation. - We record the locations of each register saved. This is - slightly less complicated than the above, since we don't - modify the stack pointer in the process. */ - - do_cfa_expr(IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_sp) - do_expr(0, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ax) - do_expr(1, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_cx) - do_expr(2, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_dx) - do_expr(3, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bx) - do_expr(5, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_bp) - do_expr(6, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_si) - do_expr(7, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_di) - do_expr(8, IA32_RT_SIGFRAME_sigcontext-4 + IA32_SIGCONTEXT_ip) - - .align 4 -.LENDFDEDLSI2: - .previous diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S deleted file mode 100644 index 6b286bb5251c..000000000000 --- a/arch/x86/vdso/vdso32/syscall.S +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Code for the vDSO. This version uses the syscall instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#define SYSCALL_ENTER_KERNEL syscall -#include "sigreturn.S" - -#include - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ebp -.Lpush_ebp: - movl %ecx, %ebp - syscall - movl %ebp, %ecx - popl %ebp -.Lpop_ebp: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 8 - .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ - .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 4 - .align 4 -.LENDFDE1: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 - .previous diff --git a/arch/x86/vdso/vdso32/sysenter.S b/arch/x86/vdso/vdso32/sysenter.S deleted file mode 100644 index e354bceee0e0..000000000000 --- a/arch/x86/vdso/vdso32/sysenter.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Code for the vDSO. This version uses the sysenter instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - -/* - * The caller puts arg2 in %ecx, which gets pushed. The kernel will use - * %ecx itself for arg2. The pushing is because the sysexit instruction - * (found in entry.S) requires that we clobber %ecx with the desired %esp. - * User code might expect that %ecx is unclobbered though, as it would be - * for returning via the iret instruction, so we must push and pop. - * - * The caller puts arg3 in %edx, which the sysexit instruction requires - * for %eip. Thus, exactly as for arg2, we must push and pop. - * - * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter - * instruction clobbers %esp, the user's %esp won't even survive entry - * into the kernel. We store %esp in %ebp. Code in entry.S must fetch - * arg6 from the stack. - * - * You can not use this vsyscall for the clone() syscall because the - * three words on the parent stack do not get copied to the child. - */ - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 - - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - int $0x80 - /* 16: System call normal return point is here! */ -VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Emit a symbol with the size of this .eh_frame data, - * to verify it matches the other versions. - */ -VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/vdso/vdso32/vclock_gettime.c b/arch/x86/vdso/vdso32/vclock_gettime.c deleted file mode 100644 index 175cc72c0f68..000000000000 --- a/arch/x86/vdso/vdso32/vclock_gettime.c +++ /dev/null @@ -1,30 +0,0 @@ -#define BUILD_VDSO32 - -#ifndef CONFIG_CC_OPTIMIZE_FOR_SIZE -#undef CONFIG_OPTIMIZE_INLINING -#endif - -#undef CONFIG_X86_PPRO_FENCE - -#ifdef CONFIG_X86_64 - -/* - * in case of a 32 bit VDSO for a 64 bit kernel fake a 32 bit kernel - * configuration - */ -#undef CONFIG_64BIT -#undef CONFIG_X86_64 -#undef CONFIG_ILLEGAL_POINTER_VALUE -#undef CONFIG_SPARSEMEM_VMEMMAP -#undef CONFIG_NR_CPUS - -#define CONFIG_X86_32 1 -#define CONFIG_PAGE_OFFSET 0 -#define CONFIG_ILLEGAL_POINTER_VALUE 0 -#define CONFIG_NR_CPUS 1 - -#define BUILD_VDSO32_64 - -#endif - -#include "../vclock_gettime.c" diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c deleted file mode 100644 index 541468e25265..000000000000 --- a/arch/x86/vdso/vdso32/vdso-fakesections.c +++ /dev/null @@ -1 +0,0 @@ -#include "../vdso-fakesections.c" diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S deleted file mode 100644 index 31056cf294bf..000000000000 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Linker script for 32-bit vDSO. - * We #include the file to define the layout details. - * - * This file defines the version script giving the user-exported symbols in - * the DSO. - */ - -#include - -#define BUILD_VDSO32 - -#include "../vdso-layout.lds.S" - -/* The ELF entry point can be used to set the AT_SYSINFO value. */ -ENTRY(__kernel_vsyscall); - -/* - * This controls what userland symbols we export from the vDSO. - */ -VERSION -{ - LINUX_2.6 { - global: - __vdso_clock_gettime; - __vdso_gettimeofday; - __vdso_time; - }; - - LINUX_2.5 { - global: - __kernel_vsyscall; - __kernel_sigreturn; - __kernel_rt_sigreturn; - local: *; - }; -} diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S deleted file mode 100644 index 697c11ece90c..000000000000 --- a/arch/x86/vdso/vdsox32.lds.S +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Linker script for x32 vDSO. - * We #include the file to define the layout details. - * - * This file defines the version script giving the user-exported symbols in - * the DSO. - */ - -#define BUILD_VDSOX32 - -#include "vdso-layout.lds.S" - -/* - * This controls what userland symbols we export from the vDSO. - */ -VERSION { - LINUX_2.6 { - global: - __vdso_clock_gettime; - __vdso_gettimeofday; - __vdso_getcpu; - __vdso_time; - local: *; - }; -} diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c deleted file mode 100644 index 8ec3d1f4ce9a..000000000000 --- a/arch/x86/vdso/vgetcpu.c +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright 2006 Andi Kleen, SUSE Labs. - * Subject to the GNU Public License, v.2 - * - * Fast user context implementation of getcpu() - */ - -#include -#include -#include -#include - -notrace long -__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) -{ - unsigned int p; - - p = __getcpu(); - - if (cpu) - *cpu = p & VGETCPU_CPU_MASK; - if (node) - *node = p >> 12; - return 0; -} - -long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) - __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c deleted file mode 100644 index 1c9f750c3859..000000000000 --- a/arch/x86/vdso/vma.c +++ /dev/null @@ -1,300 +0,0 @@ -/* - * Copyright 2007 Andi Kleen, SUSE Labs. - * Subject to the GPL, v.2 - * - * This contains most of the x86 vDSO kernel-side code. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_X86_64) -unsigned int __read_mostly vdso64_enabled = 1; -#endif - -void __init init_vdso_image(const struct vdso_image *image) -{ - int i; - int npages = (image->size) / PAGE_SIZE; - - BUG_ON(image->size % PAGE_SIZE != 0); - for (i = 0; i < npages; i++) - image->text_mapping.pages[i] = - virt_to_page(image->data + i*PAGE_SIZE); - - apply_alternatives((struct alt_instr *)(image->data + image->alt), - (struct alt_instr *)(image->data + image->alt + - image->alt_len)); -} - -struct linux_binprm; - -/* - * Put the vdso above the (randomized) stack with another randomized - * offset. This way there is no hole in the middle of address space. - * To save memory make sure it is still in the same PTE as the stack - * top. This doesn't give that many random bits. - * - * Note that this algorithm is imperfect: the distribution of the vdso - * start address within a PMD is biased toward the end. - * - * Only used for the 64-bit and x32 vdsos. - */ -static unsigned long vdso_addr(unsigned long start, unsigned len) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - unsigned long addr, end; - unsigned offset; - - /* - * Round up the start address. It can start out unaligned as a result - * of stack start randomization. - */ - start = PAGE_ALIGN(start); - - /* Round the lowest possible end address up to a PMD boundary. */ - end = (start + len + PMD_SIZE - 1) & PMD_MASK; - if (end >= TASK_SIZE_MAX) - end = TASK_SIZE_MAX; - end -= len; - - if (end > start) { - offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); - addr = start + (offset << PAGE_SHIFT); - } else { - addr = start; - } - - /* - * Forcibly align the final address in case we have a hardware - * issue that requires alignment for performance reasons. - */ - addr = align_vdso_addr(addr); - - return addr; -#endif -} - -static int map_vdso(const struct vdso_image *image, bool calculate_addr) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long addr, text_start; - int ret = 0; - static struct page *no_pages[] = {NULL}; - static struct vm_special_mapping vvar_mapping = { - .name = "[vvar]", - .pages = no_pages, - }; - - if (calculate_addr) { - addr = vdso_addr(current->mm->start_stack, - image->size - image->sym_vvar_start); - } else { - addr = 0; - } - - down_write(&mm->mmap_sem); - - addr = get_unmapped_area(NULL, addr, - image->size - image->sym_vvar_start, 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - - text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; - - /* - * MAYWRITE to allow gdb to COW and set breakpoints - */ - vma = _install_special_mapping(mm, - text_start, - image->size, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &image->text_mapping); - - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto up_fail; - } - - vma = _install_special_mapping(mm, - addr, - -image->sym_vvar_start, - VM_READ|VM_MAYREAD, - &vvar_mapping); - - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto up_fail; - } - - if (image->sym_vvar_page) - ret = remap_pfn_range(vma, - text_start + image->sym_vvar_page, - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if (hpet_address && image->sym_hpet_page) { - ret = io_remap_pfn_range(vma, - text_start + image->sym_hpet_page, - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - -up_fail: - if (ret) - current->mm->context.vdso = NULL; - - up_write(&mm->mmap_sem); - return ret; -} - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -static int load_vdso32(void) -{ - int ret; - - if (vdso32_enabled != 1) /* Other values all mean "disabled" */ - return 0; - - ret = map_vdso(selected_vdso32, false); - if (ret) - return ret; - - if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) - current_thread_info()->sysenter_return = - current->mm->context.vdso + - selected_vdso32->sym_VDSO32_SYSENTER_RETURN; - - return 0; -} -#endif - -#ifdef CONFIG_X86_64 -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - if (!vdso64_enabled) - return 0; - - return map_vdso(&vdso_image_64, true); -} - -#ifdef CONFIG_COMPAT -int compat_arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) -{ -#ifdef CONFIG_X86_X32_ABI - if (test_thread_flag(TIF_X32)) { - if (!vdso64_enabled) - return 0; - - return map_vdso(&vdso_image_x32, true); - } -#endif - - return load_vdso32(); -} -#endif -#else -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - return load_vdso32(); -} -#endif - -#ifdef CONFIG_X86_64 -static __init int vdso_setup(char *s) -{ - vdso64_enabled = simple_strtoul(s, NULL, 0); - return 0; -} -__setup("vdso=", vdso_setup); -#endif - -#ifdef CONFIG_X86_64 -static void vgetcpu_cpu_init(void *arg) -{ - int cpu = smp_processor_id(); - struct desc_struct d = { }; - unsigned long node = 0; -#ifdef CONFIG_NUMA - node = cpu_to_node(cpu); -#endif - if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP)) - write_rdtscp_aux((node << 12) | cpu); - - /* - * Store cpu number in limit so that it can be loaded - * quickly in user space in vgetcpu. (12 bits for the CPU - * and 8 bits for the node) - */ - d.limit0 = cpu | ((node & 0xf) << 12); - d.limit = node >> 4; - d.type = 5; /* RO data, expand down, accessed */ - d.dpl = 3; /* Visible to user code */ - d.s = 1; /* Not a system segment */ - d.p = 1; /* Present */ - d.d = 1; /* 32-bit */ - - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); -} - -static int -vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg) -{ - long cpu = (long)arg; - - if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1); - - return NOTIFY_DONE; -} - -static int __init init_vdso(void) -{ - init_vdso_image(&vdso_image_64); - -#ifdef CONFIG_X86_X32_ABI - init_vdso_image(&vdso_image_x32); -#endif - - cpu_notifier_register_begin(); - - on_each_cpu(vgetcpu_cpu_init, NULL, 1); - /* notifier priority > KVM */ - __hotcpu_notifier(vgetcpu_cpu_notifier, 30); - - cpu_notifier_register_done(); - - return 0; -} -subsys_initcall(init_vdso); -#endif /* CONFIG_X86_64 */ -- cgit v1.2.3 From e6b93f4e48af8fe8373ec1132b8c7787890e7578 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:10:43 +0200 Subject: x86/asm/entry: Move the 'thunk' functions to arch/x86/entry/ These are all calling x86 entry code functions, so move them close to other entry code. Change lib-y to obj-y: there's no real difference between the two as we don't really drop any of them during the linking stage, and obj-y is the more common approach for core kernel object code. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 3 +-- arch/x86/entry/thunk_32.S | 42 +++++++++++++++++++++++++++++ arch/x86/entry/thunk_64.S | 69 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/lib/Makefile | 1 - arch/x86/lib/thunk_32.S | 42 ----------------------------- arch/x86/lib/thunk_64.S | 69 ----------------------------------------------- arch/x86/um/Makefile | 2 +- 7 files changed, 113 insertions(+), 115 deletions(-) create mode 100644 arch/x86/entry/thunk_32.S create mode 100644 arch/x86/entry/thunk_64.S delete mode 100644 arch/x86/lib/thunk_32.S delete mode 100644 arch/x86/lib/thunk_64.S (limited to 'arch/x86/entry/Makefile') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index c97532492ef5..9df72c8a0ac2 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,7 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o - +obj-y := entry_$(BITS).o thunk_$(BITS).o obj-y += vdso/ obj-$(CONFIG_IA32_EMULATION) += ia32entry.o diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S new file mode 100644 index 000000000000..e9acf5f4fc92 --- /dev/null +++ b/arch/x86/entry/thunk_32.S @@ -0,0 +1,42 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + #include + #include + + /* put return address in eax (arg1) */ + .macro THUNK name, func, put_ret_addr_in_eax=0 + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + + .if \put_ret_addr_in_eax + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + .endif + + call \func + popl %edx + popl %ecx + popl %eax + ret + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context +#endif +#endif + diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S new file mode 100644 index 000000000000..10f555e435e1 --- /dev/null +++ b/arch/x86/entry/thunk_64.S @@ -0,0 +1,69 @@ +/* + * Save registers before calling assembly functions. This avoids + * disturbance of register allocation in some inline assembly constructs. + * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. + * Subject to the GNU public license, v.2. No warranty of any kind. + */ +#include +#include +#include + + /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ + .macro THUNK name, func, put_ret_addr_in_rdi=0 + .globl \name +\name: + + /* this one pushes 9 elems, the next one would be %rIP */ + pushq %rdi + pushq %rsi + pushq %rdx + pushq %rcx + pushq %rax + pushq %r8 + pushq %r9 + pushq %r10 + pushq %r11 + + .if \put_ret_addr_in_rdi + /* 9*8(%rsp) is return addr on stack */ + movq 9*8(%rsp), %rdi + .endif + + call \func + jmp restore + _ASM_NOKPROBE(\name) + .endm + +#ifdef CONFIG_TRACE_IRQFLAGS + THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 + THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + THUNK lockdep_sys_exit_thunk,lockdep_sys_exit +#endif + +#ifdef CONFIG_PREEMPT + THUNK ___preempt_schedule, preempt_schedule +#ifdef CONFIG_CONTEXT_TRACKING + THUNK ___preempt_schedule_context, preempt_schedule_context +#endif +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) \ + || defined(CONFIG_DEBUG_LOCK_ALLOC) \ + || defined(CONFIG_PREEMPT) +restore: + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi + ret + _ASM_NOKPROBE(restore) +#endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 982989d282ff..f2587888d987 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -17,7 +17,6 @@ clean-files := inat-tables.c obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o -lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S deleted file mode 100644 index e9acf5f4fc92..000000000000 --- a/arch/x86/lib/thunk_32.S +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) - * Copyright 2008 by Steven Rostedt, Red Hat, Inc - * (inspired by Andi Kleen's thunk_64.S) - * Subject to the GNU public license, v.2. No warranty of any kind. - */ - #include - #include - - /* put return address in eax (arg1) */ - .macro THUNK name, func, put_ret_addr_in_eax=0 - .globl \name -\name: - pushl %eax - pushl %ecx - pushl %edx - - .if \put_ret_addr_in_eax - /* Place EIP in the arg1 */ - movl 3*4(%esp), %eax - .endif - - call \func - popl %edx - popl %ecx - popl %eax - ret - _ASM_NOKPROBE(\name) - .endm - -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - -#ifdef CONFIG_PREEMPT - THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif -#endif - diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S deleted file mode 100644 index 10f555e435e1..000000000000 --- a/arch/x86/lib/thunk_64.S +++ /dev/null @@ -1,69 +0,0 @@ -/* - * Save registers before calling assembly functions. This avoids - * disturbance of register allocation in some inline assembly constructs. - * Copyright 2001,2002 by Andi Kleen, SuSE Labs. - * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. - * Subject to the GNU public license, v.2. No warranty of any kind. - */ -#include -#include -#include - - /* rdi: arg1 ... normal C conventions. rax is saved/restored. */ - .macro THUNK name, func, put_ret_addr_in_rdi=0 - .globl \name -\name: - - /* this one pushes 9 elems, the next one would be %rIP */ - pushq %rdi - pushq %rsi - pushq %rdx - pushq %rcx - pushq %rax - pushq %r8 - pushq %r9 - pushq %r10 - pushq %r11 - - .if \put_ret_addr_in_rdi - /* 9*8(%rsp) is return addr on stack */ - movq 9*8(%rsp), %rdi - .endif - - call \func - jmp restore - _ASM_NOKPROBE(\name) - .endm - -#ifdef CONFIG_TRACE_IRQFLAGS - THUNK trace_hardirqs_on_thunk,trace_hardirqs_on_caller,1 - THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 -#endif - -#ifdef CONFIG_DEBUG_LOCK_ALLOC - THUNK lockdep_sys_exit_thunk,lockdep_sys_exit -#endif - -#ifdef CONFIG_PREEMPT - THUNK ___preempt_schedule, preempt_schedule -#ifdef CONFIG_CONTEXT_TRACKING - THUNK ___preempt_schedule_context, preempt_schedule_context -#endif -#endif - -#if defined(CONFIG_TRACE_IRQFLAGS) \ - || defined(CONFIG_DEBUG_LOCK_ALLOC) \ - || defined(CONFIG_PREEMPT) -restore: - popq %r11 - popq %r10 - popq %r9 - popq %r8 - popq %rax - popq %rcx - popq %rdx - popq %rsi - popq %rdi - ret - _ASM_NOKPROBE(restore) -#endif diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index acb384d24669..a8fecc226946 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -26,7 +26,7 @@ else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ ../lib/rwsem.o endif -- cgit v1.2.3 From 00398a0018d1334fedabfeaabd0fa563121de612 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Jun 2015 18:41:06 +0200 Subject: x86/asm/entry: Move the vsyscall code to arch/x86/entry/vsyscall/ The vsyscall code is entry code too, so move it to arch/x86/entry/vsyscall/. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Brian Gerst Cc: Peter Zijlstra Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 6 +- arch/x86/entry/syscall_32.c | 33 +++ arch/x86/entry/syscall_64.c | 32 +++ arch/x86/entry/vsyscall/Makefile | 7 + arch/x86/entry/vsyscall/vsyscall_64.c | 335 ++++++++++++++++++++++++++++++ arch/x86/entry/vsyscall/vsyscall_emu_64.S | 37 ++++ arch/x86/entry/vsyscall/vsyscall_gtod.c | 70 +++++++ arch/x86/entry/vsyscall/vsyscall_trace.h | 29 +++ arch/x86/kernel/Makefile | 3 - arch/x86/kernel/syscall_32.c | 33 --- arch/x86/kernel/syscall_64.c | 32 --- arch/x86/kernel/vsyscall_64.c | 335 ------------------------------ arch/x86/kernel/vsyscall_emu_64.S | 37 ---- arch/x86/kernel/vsyscall_gtod.c | 70 ------- arch/x86/kernel/vsyscall_trace.h | 29 --- 15 files changed, 547 insertions(+), 541 deletions(-) create mode 100644 arch/x86/entry/syscall_32.c create mode 100644 arch/x86/entry/syscall_64.c create mode 100644 arch/x86/entry/vsyscall/Makefile create mode 100644 arch/x86/entry/vsyscall/vsyscall_64.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_emu_64.S create mode 100644 arch/x86/entry/vsyscall/vsyscall_gtod.c create mode 100644 arch/x86/entry/vsyscall/vsyscall_trace.h delete mode 100644 arch/x86/kernel/syscall_32.c delete mode 100644 arch/x86/kernel/syscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_64.c delete mode 100644 arch/x86/kernel/vsyscall_emu_64.S delete mode 100644 arch/x86/kernel/vsyscall_gtod.c delete mode 100644 arch/x86/kernel/vsyscall_trace.h (limited to 'arch/x86/entry/Makefile') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 9df72c8a0ac2..b93cce1c6bb2 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -1,8 +1,10 @@ # # Makefile for the x86 low level entry code # -obj-y := entry_$(BITS).o thunk_$(BITS).o +obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o + obj-y += vdso/ +obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o +obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c new file mode 100644 index 000000000000..3777189c4a19 --- /dev/null +++ b/arch/x86/entry/syscall_32.c @@ -0,0 +1,33 @@ +/* System call table for i386. */ + +#include +#include +#include +#include + +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#include +#undef __SYSCALL_I386 + +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), + +typedef asmlinkage void (*sys_call_ptr_t)(void); + +extern asmlinkage void sys_ni_syscall(void); + +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c new file mode 100644 index 000000000000..4ac730b37f0b --- /dev/null +++ b/arch/x86/entry/syscall_64.c @@ -0,0 +1,32 @@ +/* System call table for x86-64. */ + +#include +#include +#include +#include +#include + +#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) + +#ifdef CONFIG_X86_X32_ABI +# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) +#else +# define __SYSCALL_X32(nr, sym, compat) /* nothing */ +#endif + +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#include +#undef __SYSCALL_64 + +#define __SYSCALL_64(nr, sym, compat) [nr] = sym, + +extern void sys_ni_syscall(void); + +asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { + /* + * Smells like a compiler bug -- it doesn't work + * when the & below is removed. + */ + [0 ... __NR_syscall_max] = &sys_ni_syscall, +#include +}; diff --git a/arch/x86/entry/vsyscall/Makefile b/arch/x86/entry/vsyscall/Makefile new file mode 100644 index 000000000000..a9f4856f622a --- /dev/null +++ b/arch/x86/entry/vsyscall/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the x86 low level vsyscall code +# +obj-y := vsyscall_gtod.o + +obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o + diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c new file mode 100644 index 000000000000..2dcc6ff6fdcc --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2012-2014 Andy Lutomirski + * + * Based on the original implementation which is: + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Parts of the original code have been moved to arch/x86/vdso/vma.c + * + * This file implements vsyscall emulation. vsyscalls are a legacy ABI: + * Userspace can request certain kernel services by calling fixed + * addresses. This concept is problematic: + * + * - It interferes with ASLR. + * - It's awkward to write code that lives in kernel addresses but is + * callable by userspace at fixed addresses. + * - The whole concept is impossible for 32-bit compat userspace. + * - UML cannot easily virtualize a vsyscall. + * + * As of mid-2014, I believe that there is no new userspace code that + * will use a vsyscall if the vDSO is present. I hope that there will + * soon be no new userspace code that will ever use a vsyscall. + * + * The code in this file emulates vsyscalls when notified of a page + * fault to a vsyscall address. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include "vsyscall_trace.h" + +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; + +static int __init vsyscall_setup(char *str) +{ + if (str) { + if (!strcmp("emulate", str)) + vsyscall_mode = EMULATE; + else if (!strcmp("native", str)) + vsyscall_mode = NATIVE; + else if (!strcmp("none", str)) + vsyscall_mode = NONE; + else + return -EINVAL; + + return 0; + } + + return -EINVAL; +} +early_param("vsyscall", vsyscall_setup); + +static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, + const char *message) +{ + if (!show_unhandled_signals) + return; + + printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", + level, current->comm, task_pid_nr(current), + message, regs->ip, regs->cs, + regs->sp, regs->ax, regs->si, regs->di); +} + +static int addr_to_vsyscall_nr(unsigned long addr) +{ + int nr; + + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) + return -EINVAL; + + nr = (addr & 0xC00UL) >> 10; + if (nr >= 3) + return -EINVAL; + + return nr; +} + +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_nr = X86_TRAP_PF; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + +bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) +{ + struct task_struct *tsk; + unsigned long caller; + int vsyscall_nr, syscall_nr, tmp; + int prev_sig_on_uaccess_error; + long ret; + + /* + * No point in checking CS -- the only way to get here is a user mode + * trap to a high address, which means that we're in 64-bit user code. + */ + + WARN_ON_ONCE(address != regs->ip); + + if (vsyscall_mode == NONE) { + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall attempted with vsyscall=none"); + return false; + } + + vsyscall_nr = addr_to_vsyscall_nr(address); + + trace_emulate_vsyscall(vsyscall_nr); + + if (vsyscall_nr < 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); + goto sigsegv; + } + + if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { + warn_bad_vsyscall(KERN_WARNING, regs, + "vsyscall with bad stack (exploit attempt?)"); + goto sigsegv; + } + + tsk = current; + + /* + * Check for access_ok violations and find the syscall nr. + * + * NULL is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, NULL means "don't write anything" not "write it at + * address 0". + */ + switch (vsyscall_nr) { + case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_gettimeofday; + break; + + case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_time; + break; + + case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) { + ret = -EFAULT; + goto check_fault; + } + + syscall_nr = __NR_getcpu; + break; + } + + /* + * Handle seccomp. regs->ip must be the original value. + * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. + * + * We could optimize the seccomp disabled case, but performance + * here doesn't matter. + */ + regs->orig_ax = syscall_nr; + regs->ax = -ENOSYS; + tmp = secure_computing(); + if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { + warn_bad_vsyscall(KERN_DEBUG, regs, + "seccomp tried to change syscall nr or ip"); + do_exit(SIGSYS); + } + regs->orig_ax = -1; + if (tmp) + goto do_ret; /* skip requested */ + + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + ret = -EFAULT; + switch (vsyscall_nr) { + case 0: + ret = sys_gettimeofday( + (struct timeval __user *)regs->di, + (struct timezone __user *)regs->si); + break; + + case 1: + ret = sys_time((time_t __user *)regs->di); + break; + + case 2: + ret = sys_getcpu((unsigned __user *)regs->di, + (unsigned __user *)regs->si, + NULL); + break; + } + + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + +check_fault: + if (ret == -EFAULT) { + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ + warn_bad_vsyscall(KERN_INFO, regs, + "vsyscall fault (exploit attempt?)"); + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ + } + + regs->ax = ret; + +do_ret: + /* Emulate a ret instruction. */ + regs->ip = caller; + regs->sp += 8; + return true; + +sigsegv: + force_sig(SIGSEGV, current); + return true; +} + +/* + * A pseudo VMA to allow ptrace access for the vsyscall page. This only + * covers the 64bit vsyscall page now. 32bit has a real VMA now and does + * not need special handling anymore: + */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; +static struct vm_area_struct gate_vma = { + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, +}; + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ +#ifdef CONFIG_IA32_EMULATION + if (!mm || mm->context.ia32_compat) + return NULL; +#endif + if (vsyscall_mode == NONE) + return NULL; + return &gate_vma; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} + +/* + * Use this when you have no reliable mm, typically from interrupt + * context. It is less reliable than using a task's mm and may give + * false positives. + */ +int in_gate_area_no_mm(unsigned long addr) +{ + return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; +} + +void __init map_vsyscall(void) +{ + extern char __vsyscall_page; + unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + + if (vsyscall_mode != NONE) + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, + vsyscall_mode == NATIVE + ? PAGE_KERNEL_VSYSCALL + : PAGE_KERNEL_VVAR); + + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_emu_64.S b/arch/x86/entry/vsyscall/vsyscall_emu_64.S new file mode 100644 index 000000000000..c9596a9af159 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_emu_64.S @@ -0,0 +1,37 @@ +/* + * vsyscall_emu_64.S: Vsyscall emulation page + * + * Copyright (c) 2011 Andy Lutomirski + * + * Subject to the GNU General Public License, version 2 + */ + +#include + +#include +#include +#include + +__PAGE_ALIGNED_DATA + .globl __vsyscall_page + .balign PAGE_SIZE, 0xcc + .type __vsyscall_page, @object +__vsyscall_page: + + mov $__NR_gettimeofday, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_time, %rax + syscall + ret + + .balign 1024, 0xcc + mov $__NR_getcpu, %rax + syscall + ret + + .balign 4096, 0xcc + + .size __vsyscall_page, 4096 diff --git a/arch/x86/entry/vsyscall/vsyscall_gtod.c b/arch/x86/entry/vsyscall/vsyscall_gtod.c new file mode 100644 index 000000000000..51e330416995 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_gtod.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2001 Andrea Arcangeli SuSE + * Copyright 2003 Andi Kleen, SuSE Labs. + * + * Modified for x86 32 bit architecture by + * Stefani Seibold + * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany + * + * Thanks to hpa@transmeta.com for some useful hint. + * Special thanks to Ingo Molnar for his early experience with + * a different vsyscall implementation for Linux/IA32 and for the name. + * + */ + +#include +#include +#include + +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); + +void update_vsyscall_tz(void) +{ + vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; + vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; +} + +void update_vsyscall(struct timekeeper *tk) +{ + struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; + + gtod_write_begin(vdata); + + /* copy vsyscall data */ + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; + + vdata->wall_time_sec = tk->xtime_sec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; + + vdata->monotonic_time_sec = tk->xtime_sec + + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + + ((u64)tk->wall_to_monotonic.tv_nsec + << tk->tkr_mono.shift); + while (vdata->monotonic_time_snsec >= + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { + vdata->monotonic_time_snsec -= + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; + vdata->monotonic_time_sec++; + } + + vdata->wall_time_coarse_sec = tk->xtime_sec; + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); + + vdata->monotonic_time_coarse_sec = + vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; + vdata->monotonic_time_coarse_nsec = + vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; + + while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { + vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; + vdata->monotonic_time_coarse_sec++; + } + + gtod_write_end(vdata); +} diff --git a/arch/x86/entry/vsyscall/vsyscall_trace.h b/arch/x86/entry/vsyscall/vsyscall_trace.h new file mode 100644 index 000000000000..9dd7359a38a8 --- /dev/null +++ b/arch/x86/entry/vsyscall/vsyscall_trace.h @@ -0,0 +1,29 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM vsyscall + +#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define __VSYSCALL_TRACE_H + +#include + +TRACE_EVENT(emulate_vsyscall, + + TP_PROTO(int nr), + + TP_ARGS(nr), + + TP_STRUCT__entry(__field(int, nr)), + + TP_fast_assign( + __entry->nr = nr; + ), + + TP_printk("nr = %d", __entry->nr) +); + +#endif + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../arch/x86/entry/vsyscall/ +#define TRACE_INCLUDE_FILE vsyscall_trace +#include diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 9d3ee054453d..01663ee5f1b7 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -31,9 +31,6 @@ obj-y += probe_roms.o obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o -obj-y += syscall_$(BITS).o vsyscall_gtod.o -obj-$(CONFIG_IA32_EMULATION) += syscall_32.o -obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c deleted file mode 100644 index 3777189c4a19..000000000000 --- a/arch/x86/kernel/syscall_32.c +++ /dev/null @@ -1,33 +0,0 @@ -/* System call table for i386. */ - -#include -#include -#include -#include - -#ifdef CONFIG_IA32_EMULATION -#define SYM(sym, compat) compat -#else -#define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_ia32_syscall_max __NR_syscall_max -#endif - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; -#include -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), - -typedef asmlinkage void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); - -__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c deleted file mode 100644 index 4ac730b37f0b..000000000000 --- a/arch/x86/kernel/syscall_64.c +++ /dev/null @@ -1,32 +0,0 @@ -/* System call table for x86-64. */ - -#include -#include -#include -#include -#include - -#define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) - -#ifdef CONFIG_X86_X32_ABI -# define __SYSCALL_X32(nr, sym, compat) __SYSCALL_64(nr, sym, compat) -#else -# define __SYSCALL_X32(nr, sym, compat) /* nothing */ -#endif - -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; -#include -#undef __SYSCALL_64 - -#define __SYSCALL_64(nr, sym, compat) [nr] = sym, - -extern void sys_ni_syscall(void); - -asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c deleted file mode 100644 index 2dcc6ff6fdcc..000000000000 --- a/arch/x86/kernel/vsyscall_64.c +++ /dev/null @@ -1,335 +0,0 @@ -/* - * Copyright (c) 2012-2014 Andy Lutomirski - * - * Based on the original implementation which is: - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Parts of the original code have been moved to arch/x86/vdso/vma.c - * - * This file implements vsyscall emulation. vsyscalls are a legacy ABI: - * Userspace can request certain kernel services by calling fixed - * addresses. This concept is problematic: - * - * - It interferes with ASLR. - * - It's awkward to write code that lives in kernel addresses but is - * callable by userspace at fixed addresses. - * - The whole concept is impossible for 32-bit compat userspace. - * - UML cannot easily virtualize a vsyscall. - * - * As of mid-2014, I believe that there is no new userspace code that - * will use a vsyscall if the vDSO is present. I hope that there will - * soon be no new userspace code that will ever use a vsyscall. - * - * The code in this file emulates vsyscalls when notified of a page - * fault to a vsyscall address. - */ - -#include -#include -#include -#include - -#include -#include -#include -#include - -#define CREATE_TRACE_POINTS -#include "vsyscall_trace.h" - -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; - -static int __init vsyscall_setup(char *str) -{ - if (str) { - if (!strcmp("emulate", str)) - vsyscall_mode = EMULATE; - else if (!strcmp("native", str)) - vsyscall_mode = NATIVE; - else if (!strcmp("none", str)) - vsyscall_mode = NONE; - else - return -EINVAL; - - return 0; - } - - return -EINVAL; -} -early_param("vsyscall", vsyscall_setup); - -static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, - const char *message) -{ - if (!show_unhandled_signals) - return; - - printk_ratelimited("%s%s[%d] %s ip:%lx cs:%lx sp:%lx ax:%lx si:%lx di:%lx\n", - level, current->comm, task_pid_nr(current), - message, regs->ip, regs->cs, - regs->sp, regs->ax, regs->si, regs->di); -} - -static int addr_to_vsyscall_nr(unsigned long addr) -{ - int nr; - - if ((addr & ~0xC00UL) != VSYSCALL_ADDR) - return -EINVAL; - - nr = (addr & 0xC00UL) >> 10; - if (nr >= 3) - return -EINVAL; - - return nr; -} - -static bool write_ok_or_segv(unsigned long ptr, size_t size) -{ - /* - * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. - */ - - if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { - siginfo_t info; - struct thread_struct *thread = ¤t->thread; - - thread->error_code = 6; /* user fault, no page, write */ - thread->cr2 = ptr; - thread->trap_nr = X86_TRAP_PF; - - memset(&info, 0, sizeof(info)); - info.si_signo = SIGSEGV; - info.si_errno = 0; - info.si_code = SEGV_MAPERR; - info.si_addr = (void __user *)ptr; - - force_sig_info(SIGSEGV, &info, current); - return false; - } else { - return true; - } -} - -bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) -{ - struct task_struct *tsk; - unsigned long caller; - int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_error; - long ret; - - /* - * No point in checking CS -- the only way to get here is a user mode - * trap to a high address, which means that we're in 64-bit user code. - */ - - WARN_ON_ONCE(address != regs->ip); - - if (vsyscall_mode == NONE) { - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall attempted with vsyscall=none"); - return false; - } - - vsyscall_nr = addr_to_vsyscall_nr(address); - - trace_emulate_vsyscall(vsyscall_nr); - - if (vsyscall_nr < 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "misaligned vsyscall (exploit attempt or buggy program) -- look up the vsyscall kernel parameter if you need a workaround"); - goto sigsegv; - } - - if (get_user(caller, (unsigned long __user *)regs->sp) != 0) { - warn_bad_vsyscall(KERN_WARNING, regs, - "vsyscall with bad stack (exploit attempt?)"); - goto sigsegv; - } - - tsk = current; - - /* - * Check for access_ok violations and find the syscall nr. - * - * NULL is a valid user pointer (in the access_ok sense) on 32-bit and - * 64-bit, so we don't need to special-case it here. For all the - * vsyscalls, NULL means "don't write anything" not "write it at - * address 0". - */ - switch (vsyscall_nr) { - case 0: - if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || - !write_ok_or_segv(regs->si, sizeof(struct timezone))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_gettimeofday; - break; - - case 1: - if (!write_ok_or_segv(regs->di, sizeof(time_t))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_time; - break; - - case 2: - if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || - !write_ok_or_segv(regs->si, sizeof(unsigned))) { - ret = -EFAULT; - goto check_fault; - } - - syscall_nr = __NR_getcpu; - break; - } - - /* - * Handle seccomp. regs->ip must be the original value. - * See seccomp_send_sigsys and Documentation/prctl/seccomp_filter.txt. - * - * We could optimize the seccomp disabled case, but performance - * here doesn't matter. - */ - regs->orig_ax = syscall_nr; - regs->ax = -ENOSYS; - tmp = secure_computing(); - if ((!tmp && regs->orig_ax != syscall_nr) || regs->ip != address) { - warn_bad_vsyscall(KERN_DEBUG, regs, - "seccomp tried to change syscall nr or ip"); - do_exit(SIGSYS); - } - regs->orig_ax = -1; - if (tmp) - goto do_ret; /* skip requested */ - - /* - * With a real vsyscall, page faults cause SIGSEGV. We want to - * preserve that behavior to make writing exploits harder. - */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; - - ret = -EFAULT; - switch (vsyscall_nr) { - case 0: - ret = sys_gettimeofday( - (struct timeval __user *)regs->di, - (struct timezone __user *)regs->si); - break; - - case 1: - ret = sys_time((time_t __user *)regs->di); - break; - - case 2: - ret = sys_getcpu((unsigned __user *)regs->di, - (unsigned __user *)regs->si, - NULL); - break; - } - - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; - -check_fault: - if (ret == -EFAULT) { - /* Bad news -- userspace fed a bad pointer to a vsyscall. */ - warn_bad_vsyscall(KERN_INFO, regs, - "vsyscall fault (exploit attempt?)"); - - /* - * If we failed to generate a signal for any reason, - * generate one here. (This should be impossible.) - */ - if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && - !sigismember(&tsk->pending.signal, SIGSEGV))) - goto sigsegv; - - return true; /* Don't emulate the ret. */ - } - - regs->ax = ret; - -do_ret: - /* Emulate a ret instruction. */ - regs->ip = caller; - regs->sp += 8; - return true; - -sigsegv: - force_sig(SIGSEGV, current); - return true; -} - -/* - * A pseudo VMA to allow ptrace access for the vsyscall page. This only - * covers the 64bit vsyscall page now. 32bit has a real VMA now and does - * not need special handling anymore: - */ -static const char *gate_vma_name(struct vm_area_struct *vma) -{ - return "[vsyscall]"; -} -static struct vm_operations_struct gate_vma_ops = { - .name = gate_vma_name, -}; -static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_ADDR, - .vm_end = VSYSCALL_ADDR + PAGE_SIZE, - .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC, - .vm_ops = &gate_vma_ops, -}; - -struct vm_area_struct *get_gate_vma(struct mm_struct *mm) -{ -#ifdef CONFIG_IA32_EMULATION - if (!mm || mm->context.ia32_compat) - return NULL; -#endif - if (vsyscall_mode == NONE) - return NULL; - return &gate_vma; -} - -int in_gate_area(struct mm_struct *mm, unsigned long addr) -{ - struct vm_area_struct *vma = get_gate_vma(mm); - - if (!vma) - return 0; - - return (addr >= vma->vm_start) && (addr < vma->vm_end); -} - -/* - * Use this when you have no reliable mm, typically from interrupt - * context. It is less reliable than using a task's mm and may give - * false positives. - */ -int in_gate_area_no_mm(unsigned long addr) -{ - return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; -} - -void __init map_vsyscall(void) -{ - extern char __vsyscall_page; - unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - - if (vsyscall_mode != NONE) - __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); - - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != - (unsigned long)VSYSCALL_ADDR); -} diff --git a/arch/x86/kernel/vsyscall_emu_64.S b/arch/x86/kernel/vsyscall_emu_64.S deleted file mode 100644 index c9596a9af159..000000000000 --- a/arch/x86/kernel/vsyscall_emu_64.S +++ /dev/null @@ -1,37 +0,0 @@ -/* - * vsyscall_emu_64.S: Vsyscall emulation page - * - * Copyright (c) 2011 Andy Lutomirski - * - * Subject to the GNU General Public License, version 2 - */ - -#include - -#include -#include -#include - -__PAGE_ALIGNED_DATA - .globl __vsyscall_page - .balign PAGE_SIZE, 0xcc - .type __vsyscall_page, @object -__vsyscall_page: - - mov $__NR_gettimeofday, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_time, %rax - syscall - ret - - .balign 1024, 0xcc - mov $__NR_getcpu, %rax - syscall - ret - - .balign 4096, 0xcc - - .size __vsyscall_page, 4096 diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c deleted file mode 100644 index 51e330416995..000000000000 --- a/arch/x86/kernel/vsyscall_gtod.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * Copyright (C) 2001 Andrea Arcangeli SuSE - * Copyright 2003 Andi Kleen, SuSE Labs. - * - * Modified for x86 32 bit architecture by - * Stefani Seibold - * sponsored by Rohde & Schwarz GmbH & Co. KG Munich/Germany - * - * Thanks to hpa@transmeta.com for some useful hint. - * Special thanks to Ingo Molnar for his early experience with - * a different vsyscall implementation for Linux/IA32 and for the name. - * - */ - -#include -#include -#include - -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); - -void update_vsyscall_tz(void) -{ - vsyscall_gtod_data.tz_minuteswest = sys_tz.tz_minuteswest; - vsyscall_gtod_data.tz_dsttime = sys_tz.tz_dsttime; -} - -void update_vsyscall(struct timekeeper *tk) -{ - struct vsyscall_gtod_data *vdata = &vsyscall_gtod_data; - - gtod_write_begin(vdata); - - /* copy vsyscall data */ - vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr_mono.cycle_last; - vdata->mask = tk->tkr_mono.mask; - vdata->mult = tk->tkr_mono.mult; - vdata->shift = tk->tkr_mono.shift; - - vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; - - vdata->monotonic_time_sec = tk->xtime_sec - + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec - + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr_mono.shift); - while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { - vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; - vdata->monotonic_time_sec++; - } - - vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> - tk->tkr_mono.shift); - - vdata->monotonic_time_coarse_sec = - vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_coarse_nsec = - vdata->wall_time_coarse_nsec + tk->wall_to_monotonic.tv_nsec; - - while (vdata->monotonic_time_coarse_nsec >= NSEC_PER_SEC) { - vdata->monotonic_time_coarse_nsec -= NSEC_PER_SEC; - vdata->monotonic_time_coarse_sec++; - } - - gtod_write_end(vdata); -} diff --git a/arch/x86/kernel/vsyscall_trace.h b/arch/x86/kernel/vsyscall_trace.h deleted file mode 100644 index a8b2edec54fe..000000000000 --- a/arch/x86/kernel/vsyscall_trace.h +++ /dev/null @@ -1,29 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM vsyscall - -#if !defined(__VSYSCALL_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) -#define __VSYSCALL_TRACE_H - -#include - -TRACE_EVENT(emulate_vsyscall, - - TP_PROTO(int nr), - - TP_ARGS(nr), - - TP_STRUCT__entry(__field(int, nr)), - - TP_fast_assign( - __entry->nr = nr; - ), - - TP_printk("nr = %d", __entry->nr) -); - -#endif - -#undef TRACE_INCLUDE_PATH -#define TRACE_INCLUDE_PATH ../../arch/x86/kernel -#define TRACE_INCLUDE_FILE vsyscall_trace -#include -- cgit v1.2.3 From 138bd56a2164c65f68042ba580d88ab70c036fd1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Jun 2015 14:11:49 +0200 Subject: x86/asm/entry/64/compat: Rename ia32entry.S -> entry_64_compat.S So we now have the following system entry code related files, which define the following system call instruction and other entry paths: entry_32.S # 32-bit binaries on 32-bit kernels entry_64.S # 64-bit binaries on 64-bit kernels entry_64_compat.S # 32-bit binaries on 64-bit kernels Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Josh Triplett Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/entry/Makefile | 2 +- arch/x86/entry/entry_64.S | 2 +- arch/x86/entry/entry_64_compat.S | 547 +++++++++++++++++++++++++++++++++++++++ arch/x86/entry/ia32entry.S | 547 --------------------------------------- 4 files changed, 549 insertions(+), 549 deletions(-) create mode 100644 arch/x86/entry/entry_64_compat.S delete mode 100644 arch/x86/entry/ia32entry.S (limited to 'arch/x86/entry/Makefile') diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index b93cce1c6bb2..7a144971db79 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -6,5 +6,5 @@ obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o obj-y += vdso/ obj-y += vsyscall/ -obj-$(CONFIG_IA32_EMULATION) += ia32entry.o syscall_32.o +obj-$(CONFIG_IA32_EMULATION) += entry_64_compat.o syscall_32.o diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 31770662b940..4cf3dd36aa0d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -516,7 +516,7 @@ ENTRY(ret_from_fork) /* * By the time we get here, we have no idea whether our pt_regs, * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. + * the slow path, or one of the 32-bit compat paths. * Use IRET code path to return, since it can safely handle * all of the above. */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S new file mode 100644 index 000000000000..9558dacf32b9 --- /dev/null +++ b/arch/x86/entry/entry_64_compat.S @@ -0,0 +1,547 @@ +/* + * Compatibility mode system call entry point for x86-64. + * + * Copyright 2000-2002 Andi Kleen, SuSE Labs. + */ +#include "calling.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +# define sysexit_audit ia32_ret_from_sys_call +# define sysretl_audit ia32_ret_from_sys_call +#endif + + .section .entry.text, "ax" + +#ifdef CONFIG_PARAVIRT +ENTRY(native_usergs_sysret32) + swapgs + sysretl +ENDPROC(native_usergs_sysret32) +#endif + +/* + * 32-bit SYSENTER instruction entry. + * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_sysenter_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %rbp /* pt_regs->sp */ + pushfq /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + cld + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * no need to do an access_ok check here because rbp has been + * 32-bit zero extended + */ + ASM_STAC +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + + /* + * Sysenter doesn't filter flags, so we need to clear NT + * ourselves. To save a few cycles, we can check whether + * NT was set instead of doing an unconditional popfq. + */ + testl $X86_EFLAGS_NT, EFLAGS(%rsp) + jnz sysenter_fix_flags +sysenter_flags_fixed: + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysenter_tracesys + +sysenter_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ +sysenter_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysexit_audit +sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. + */ + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RIP(%rsp), %ecx /* User %eip */ + RESTORE_RSI_RDI + xorl %edx, %edx /* Do not leak kernel information */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movl EFLAGS(%rsp), %r11d /* User eflags */ + TRACE_IRQS_ON + + /* + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. + */ + movl RSP(%rsp), %esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi, %r8d /* 5th arg: 4th syscall arg */ + movl %ecx, %r9d /* swap with edx */ + movl %edx, %ecx /* 4th arg: 3rd syscall arg */ + movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ + movl %ebx, %esi /* 2nd arg: 1st syscall arg */ + movl %eax, %edi /* 1st arg: syscall number */ + call __audit_syscall_entry + movl ORIG_RAX(%rsp), %eax /* reload syscall number */ + movl %ebx, %edi /* reload 1st syscall arg */ + movl RCX(%rsp), %esi /* reload 2nd syscall arg */ + movl RDX(%rsp), %edx /* reload 3rd syscall arg */ + movl RSI(%rsp), %ecx /* reload 4th syscall arg */ + movl RDI(%rsp), %r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax, %esi /* second arg, syscall return value */ + cmpl $-MAX_ERRNO, %eax /* is it an error ? */ + jbe 1f + movslq %eax, %rsi /* if error sign extend to 64 bits */ +1: setbe %al /* 1 if error, 0 if not */ + movzbl %al, %edi /* zero-extend that into %edi */ + call __audit_syscall_exit + movq RAX(%rsp), %rax /* reload syscall return value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz \exit + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_with_check + .endm + +sysenter_auditsys: + auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_fix_flags: + pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + popfq + jmp sysenter_flags_fixed + +sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz sysenter_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp sysenter_do_call +ENDPROC(ia32_sysenter_target) + +/* + * 32-bit SYSCALL instruction entry. + * + * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * + * Arguments: + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * + * This is purely a fast path. For anything complicated we use the int 0x80 + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ +ENTRY(ia32_cstar_target) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + SWAPGS_UNSAFE_STACK + movl %esp, %r8d + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack */ + pushq $__USER32_DS /* pt_regs->ss */ + pushq %r8 /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER32_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rbp /* pt_regs->cx */ + movl %ebp, %ecx + pushq $-ENOSYS /* pt_regs->ax */ + sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + + /* + * No need to do an access_ok check here because r8 has been + * 32-bit zero extended: + */ + ASM_STAC +1: movl (%r8), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz cstar_tracesys + +cstar_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + +cstar_dispatch: + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) + movq %rax, RAX(%rsp) +1: + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz sysretl_audit + +sysretl_from_sys_call: + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + movl RCX(%rsp), %ebp + RESTORE_RSI_RDI_RDX + movl RIP(%rsp), %ecx + movl EFLAGS(%rsp), %r11d + xorq %r10, %r10 + xorq %r9, %r9 + xorq %r8, %r8 + TRACE_IRQS_ON + movl RSP(%rsp), %esp + /* + * 64-bit->32-bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32-bit->32-bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + * + * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss + * descriptor is not reinitialized. This means that we must + * avoid SYSRET with SS == NULL, which could happen if we schedule, + * exit the kernel, and re-enter using an interrupt vector. (All + * interrupt entries on x86_64 set SS to NULL.) We prevent that + * from happening by reloading SS in __switch_to. + */ + USERGS_SYSRET32 + +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: + auditsys_entry_common + movl %ebp, %r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jz cstar_auditsys +#endif + SAVE_EXTRA_REGS + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + + /* Reload arg registers from stack. (see sysenter_tracesys) */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + + RESTORE_EXTRA_REGS + jmp cstar_do_call +END(ia32_cstar_target) + +ia32_badarg: + ASM_CLAC + movq $-EFAULT, %rax + jmp ia32_sysret + +ia32_ret_from_sys_call: + xorl %eax, %eax /* Do not leak kernel information */ + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + jmp int_ret_from_sys_call + +/* + * Emulated IA32 system calls via int 0x80. + * + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) + * + * Notes: + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). + * Arguments are zero extended. For system calls that want sign extension and + * take long arguments a wrapper is needed. Most calls can just be called + * directly. + * Assumes it is only called from user space and entered with interrupts off. + */ + +ENTRY(ia32_syscall) + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ + PARAVIRT_ADJUST_EXCEPTION_FRAME + SWAPGS + ENABLE_INTERRUPTS(CLBR_NONE) + + /* Zero-extending 32-bit regs, do not remove */ + movl %eax, %eax + + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq %rax /* pt_regs->orig_ax */ + pushq %rdi /* pt_regs->di */ + pushq %rsi /* pt_regs->si */ + pushq %rdx /* pt_regs->dx */ + pushq %rcx /* pt_regs->cx */ + pushq $-ENOSYS /* pt_regs->ax */ + pushq $0 /* pt_regs->r8 */ + pushq $0 /* pt_regs->r9 */ + pushq $0 /* pt_regs->r10 */ + pushq $0 /* pt_regs->r11 */ + cld + sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_tracesys + +ia32_do_call: + /* 32-bit syscall -> 64-bit C ABI argument conversion */ + movl %edi, %r8d /* arg5 */ + movl %ebp, %r9d /* arg6 */ + xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ + movl %ebx, %edi /* arg1 */ + movl %edx, %edx /* arg3 (zero extension) */ + cmpq $(IA32_NR_syscalls-1), %rax + ja 1f + + call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ + +ia32_sysret: + movq %rax, RAX(%rsp) +1: + jmp int_ret_from_sys_call + +ia32_tracesys: + SAVE_EXTRA_REGS + movq %rsp, %rdi /* &pt_regs -> arg1 */ + call syscall_trace_enter + /* + * Reload arg registers from stack in case ptrace changed them. + * Don't reload %eax because syscall_trace_enter() returned + * the %rax value we should see. But do truncate it to 32 bits. + * If it's -1 to make us punt the syscall, then (u32)-1 is still + * an appropriately invalid value. + */ + movl RCX(%rsp), %ecx + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi + movl %eax, %eax /* zero extension */ + RESTORE_EXTRA_REGS + jmp ia32_do_call +END(ia32_syscall) + + .macro PTREGSCALL label, func + ALIGN +GLOBAL(\label) + leaq \func(%rip), %rax + jmp ia32_ptregs_common + .endm + + PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn + PTREGSCALL stub32_sigreturn, sys32_sigreturn + PTREGSCALL stub32_fork, sys_fork + PTREGSCALL stub32_vfork, sys_vfork + + ALIGN +GLOBAL(stub32_clone) + leaq sys_clone(%rip), %rax + /* + * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). + * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). + * + * The native 64-bit kernel's sys_clone() implements the latter, + * so we need to swap arguments here before calling it: + */ + xchg %r8, %rcx + jmp ia32_ptregs_common + + ALIGN +ia32_ptregs_common: + SAVE_EXTRA_REGS 8 + call *%rax + RESTORE_EXTRA_REGS 8 + ret +END(ia32_ptregs_common) diff --git a/arch/x86/entry/ia32entry.S b/arch/x86/entry/ia32entry.S deleted file mode 100644 index 9558dacf32b9..000000000000 --- a/arch/x86/entry/ia32entry.S +++ /dev/null @@ -1,547 +0,0 @@ -/* - * Compatibility mode system call entry point for x86-64. - * - * Copyright 2000-2002 Andi Kleen, SuSE Labs. - */ -#include "calling.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call -# define sysretl_audit ia32_ret_from_sys_call -#endif - - .section .entry.text, "ax" - -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - -/* - * 32-bit SYSENTER instruction entry. - * - * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. - * IF and VM in rflags are cleared (IOW: interrupts are off). - * SYSENTER does not save anything on the stack, - * and does not save old rip (!!!) and rflags. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp user stack - * 0(%ebp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_sysenter_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp - movl %eax, %eax - - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - cld - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - - /* - * no need to do an access_ok check here because rbp has been - * 32-bit zero extended - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - - /* - * Sysenter doesn't filter flags, so we need to clear NT - * ourselves. To save a few cycles, we can check whether - * NT was set instead of doing an unconditional popfq. - */ - testl $X86_EFLAGS_NT, EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp), %ecx /* User %eip */ - RESTORE_RSI_RDI - xorl %edx, %edx /* Do not leak kernel information */ - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 - movl EFLAGS(%rsp), %r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp), %esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - movl %esi, %r8d /* 5th arg: 4th syscall arg */ - movl %ecx, %r9d /* swap with edx */ - movl %edx, %ecx /* 4th arg: 3rd syscall arg */ - movl %r9d, %edx /* 3rd arg: 2nd syscall arg */ - movl %ebx, %esi /* 2nd arg: 1st syscall arg */ - movl %eax, %edi /* 1st arg: syscall number */ - call __audit_syscall_entry - movl ORIG_RAX(%rsp), %eax /* reload syscall number */ - movl %ebx, %edi /* reload 1st syscall arg */ - movl RCX(%rsp), %esi /* reload 2nd syscall arg */ - movl RDX(%rsp), %edx /* reload 3rd syscall arg */ - movl RSI(%rsp), %ecx /* reload 4th syscall arg */ - movl RDI(%rsp), %r8d /* reload 5th syscall arg */ - .endm - - .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax, %esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al, %edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movq RAX(%rsp), %rax /* reload syscall return value */ - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_with_check - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - -sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) - popfq - jmp sysenter_flags_fixed - -sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp sysenter_do_call -ENDPROC(ia32_sysenter_target) - -/* - * 32-bit SYSCALL instruction entry. - * - * 32-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, - * then loads new ss, cs, and rip from previously programmed MSRs. - * rflags gets masked by a value from another MSR (so CLD and CLAC - * are not needed). SYSCALL does not save anything on the stack - * and does not change rsp. - * - * Note: rflags saving+masking-with-MSR happens only in Long mode - * (in legacy 32-bit mode, IF, RF and VM bits are cleared and that's it). - * Don't get confused: rflags saving+masking depends on Long Mode Active bit - * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes - * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). - * - * Arguments: - * eax system call number - * ecx return address - * ebx arg1 - * ebp arg2 (note: not saved in the stack frame, should not be touched) - * edx arg3 - * esi arg4 - * edi arg5 - * esp user stack - * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ -ENTRY(ia32_cstar_target) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - SWAPGS_UNSAFE_STACK - movl %esp, %r8d - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - - /* Construct struct pt_regs on stack */ - pushq $__USER32_DS /* pt_regs->ss */ - pushq %r8 /* pt_regs->sp */ - pushq %r11 /* pt_regs->flags */ - pushq $__USER32_CS /* pt_regs->cs */ - pushq %rcx /* pt_regs->ip */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp, %ecx - pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - - /* - * No need to do an access_ok check here because r8 has been - * 32-bit zero extended: - */ - ASM_STAC -1: movl (%r8), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit - -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RCX(%rsp), %ebp - RESTORE_RSI_RDI_RDX - movl RIP(%rsp), %ecx - movl EFLAGS(%rsp), %r11d - xorq %r10, %r10 - xorq %r9, %r9 - xorq %r8, %r8 - TRACE_IRQS_ON - movl RSP(%rsp), %esp - /* - * 64-bit->32-bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32-bit->32-bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - -cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp cstar_do_call -END(ia32_cstar_target) - -ia32_badarg: - ASM_CLAC - movq $-EFAULT, %rax - jmp ia32_sysret - -ia32_ret_from_sys_call: - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call - -/* - * Emulated IA32 system calls via int 0x80. - * - * Arguments: - * eax system call number - * ebx arg1 - * ecx arg2 - * edx arg3 - * esi arg4 - * edi arg5 - * ebp arg6 (note: not saved in the stack frame, should not be touched) - * - * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except eax must be saved (but ptrace may violate that). - * Arguments are zero extended. For system calls that want sign extension and - * take long arguments a wrapper is needed. Most calls can just be called - * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ - -ENTRY(ia32_syscall) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME - SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax - - /* Construct struct pt_regs on stack (iret frame is already on stack) */ - pushq %rax /* pt_regs->orig_ax */ - pushq %rdi /* pt_regs->di */ - pushq %rsi /* pt_regs->si */ - pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ - pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 */ - pushq $0 /* pt_regs->r9 */ - pushq $0 /* pt_regs->r10 */ - pushq $0 /* pt_regs->r11 */ - cld - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - -ia32_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) /* RIP relative */ - -ia32_sysret: - movq %rax, RAX(%rsp) -1: - jmp int_ret_from_sys_call - -ia32_tracesys: - SAVE_EXTRA_REGS - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - /* - * Reload arg registers from stack in case ptrace changed them. - * Don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. But do truncate it to 32 bits. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. - */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - RESTORE_EXTRA_REGS - jmp ia32_do_call -END(ia32_syscall) - - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip), %rax - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - - ALIGN -GLOBAL(stub32_clone) - leaq sys_clone(%rip), %rax - /* - * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). - * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). - * - * The native 64-bit kernel's sys_clone() implements the latter, - * so we need to swap arguments here before calling it: - */ - xchg %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) -- cgit v1.2.3