diff options
53 files changed, 1158 insertions, 650 deletions
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index c584a51add15..afe68ddbe6a4 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -12,6 +12,8 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... +ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks +... unused hole ... ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 25d2c6f7325e..956c7702471e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -909,10 +909,27 @@ config VM86 default y depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run 16-bit legacy - code on X86 processors. It also may be needed by software like - XFree86 to initialize some video cards via BIOS. Disabling this - option saves about 6k. + This option is required by programs like DOSEMU to run + 16-bit real mode legacy code on x86 processors. It also may + be needed by software like XFree86 to initialize some video + cards via BIOS. Disabling this option saves about 6K. + +config X86_16BIT + bool "Enable support for 16-bit segments" if EXPERT + default y + ---help--- + This option is required by programs like Wine to run 16-bit + protected mode legacy code on x86 processors. 
Disabling + this option saves about 300 bytes on i386, or around 6K text + plus 16K runtime memory on x86-64. + +config X86_ESPFIX32 + def_bool y + depends on X86_16BIT && X86_32 + +config X86_ESPFIX64 + def_bool y + depends on X86_16BIT && X86_64 config TOSHIBA tristate "Toshiba Laptop support" diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 220675795e08..f9e181aaba97 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -383,8 +383,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, } else { /* Return stub is in 32bit vsyscall page */ if (current->mm->context.vdso) - restorer = VDSO32_SYMBOL(current->mm->context.vdso, - sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; else restorer = &frame->retcode; } @@ -462,8 +462,8 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; else - restorer = VDSO32_SYMBOL(current->mm->context.vdso, - rt_sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_rt_sigreturn; put_user_ex(ptr_to_compat(restorer), &frame->pretcode); /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2c71182d30ef..1a055c81d864 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -75,7 +75,12 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #include <asm/vdso.h> -extern unsigned int vdso_enabled; +#ifdef CONFIG_X86_64 +extern unsigned int vdso64_enabled; +#endif +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +extern unsigned int vdso32_enabled; +#endif /* * This is used to ensure we don't load something for the wrong architecture. 
@@ -269,9 +274,9 @@ extern int force_personality32; struct task_struct; -#define ARCH_DLINFO_IA32(vdso_enabled) \ +#define ARCH_DLINFO_IA32 \ do { \ - if (vdso_enabled) { \ + if (vdso32_enabled) { \ NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \ NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \ } \ @@ -281,7 +286,7 @@ do { \ #define STACK_RND_MASK (0x7ff) -#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled) +#define ARCH_DLINFO ARCH_DLINFO_IA32 /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ @@ -292,16 +297,17 @@ do { \ #define ARCH_DLINFO \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ - (unsigned long)current->mm->context.vdso); \ + (unsigned long __force)current->mm->context.vdso); \ } while (0) +/* As a historical oddity, the x32 and x86_64 vDSOs are controlled together. */ #define ARCH_DLINFO_X32 \ do { \ - if (vdso_enabled) \ + if (vdso64_enabled) \ NEW_AUX_ENT(AT_SYSINFO_EHDR, \ - (unsigned long)current->mm->context.vdso); \ + (unsigned long __force)current->mm->context.vdso); \ } while (0) #define AT_SYSINFO 32 @@ -310,7 +316,7 @@ do { \ if (test_thread_flag(TIF_X32)) \ ARCH_DLINFO_X32; \ else \ - ARCH_DLINFO_IA32(sysctl_vsyscall32) + ARCH_DLINFO_IA32 #define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000) @@ -319,18 +325,17 @@ else \ #define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso) #define VDSO_ENTRY \ - ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall)) + ((unsigned long)current->mm->context.vdso + \ + selected_vdso32->sym___kernel_vsyscall) struct linux_binprm; #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 extern int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); -extern int x32_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); - -extern int syscall32_setup_pages(struct linux_binprm *, int exstack); -#define compat_arch_setup_additional_pages syscall32_setup_pages +extern int compat_arch_setup_additional_pages(struct 
linux_binprm *bprm, + int uses_interp); +#define compat_arch_setup_additional_pages compat_arch_setup_additional_pages extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define arch_randomize_brk arch_randomize_brk diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h new file mode 100644 index 000000000000..99efebb2f69d --- /dev/null +++ b/arch/x86/include/asm/espfix.h @@ -0,0 +1,16 @@ +#ifndef _ASM_X86_ESPFIX_H +#define _ASM_X86_ESPFIX_H + +#ifdef CONFIG_X86_64 + +#include <asm/percpu.h> + +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +extern void init_espfix_bsp(void); +extern void init_espfix_ap(void); + +#endif /* CONFIG_X86_64 */ + +#endif /* _ASM_X86_ESPFIX_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 43f482a0db37..b0910f97a3ea 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -24,7 +24,7 @@ #include <linux/threads.h> #include <asm/kmap_types.h> #else -#include <asm/vsyscall.h> +#include <uapi/asm/vsyscall.h> #endif /* @@ -41,7 +41,8 @@ extern unsigned long __FIXADDR_TOP; #define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP) #else -#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE) +#define FIXADDR_TOP (round_up(VSYSCALL_ADDR + PAGE_SIZE, 1<<PMD_SHIFT) - \ + PAGE_SIZE) #endif @@ -68,11 +69,7 @@ enum fixed_addresses { #ifdef CONFIG_X86_32 FIX_HOLE, #else - VSYSCALL_LAST_PAGE, - VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE - + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1, - VVAR_PAGE, - VSYSCALL_HPET, + VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, #ifdef CONFIG_PARAVIRT_CLOCK PVCLOCK_FIXMAP_BEGIN, PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 5f55e6962769..876e74e8eec7 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -18,7 +18,7 @@ typedef struct { 
#endif struct mutex lock; - void *vdso; + void __user *vdso; } mm_context_t; #ifdef CONFIG_SMP diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index c883bf726398..7166e25ecb57 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -61,6 +61,8 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) #define EARLY_DYNAMIC_PAGE_TABLES 64 diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h index 6fd3fd769796..a90f8972dad5 100644 --- a/arch/x86/include/asm/proto.h +++ b/arch/x86/include/asm/proto.h @@ -12,8 +12,6 @@ void ia32_syscall(void); void ia32_cstar_target(void); void ia32_sysenter_target(void); -void syscall32_cpu_init(void); - void x86_configure_nx(void); void x86_report_nx(void); diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 9264f04a4c55..ff4e7b236e21 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -59,6 +59,8 @@ static inline void x86_ce4100_early_setup(void) { } #ifndef _SETUP +#include <asm/espfix.h> + /* * This is set up by the setup-routine at boot-time */ diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index d1dc55404ff1..30be253dd283 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -3,63 +3,51 @@ #include <asm/page_types.h> #include <linux/linkage.h> +#include <linux/init.h> -#ifdef __ASSEMBLER__ +#ifndef __ASSEMBLER__ -#define DEFINE_VDSO_IMAGE(symname, filename) \ -__PAGE_ALIGNED_DATA ; \ - .globl symname##_start, symname##_end ; \ - .align PAGE_SIZE ; \ - symname##_start: ; \ - .incbin filename ; \ - symname##_end: ; \ - .align PAGE_SIZE /* extra data here leaks to userspace. 
*/ ; \ - \ -.previous ; \ - \ - .globl symname##_pages ; \ - .bss ; \ - .align 8 ; \ - .type symname##_pages, @object ; \ - symname##_pages: ; \ - .zero (symname##_end - symname##_start + PAGE_SIZE - 1) / PAGE_SIZE * (BITS_PER_LONG / 8) ; \ - .size symname##_pages, .-symname##_pages +#include <linux/mm_types.h> -#else +struct vdso_image { + void *data; + unsigned long size; /* Always a multiple of PAGE_SIZE */ -#define DECLARE_VDSO_IMAGE(symname) \ - extern char symname##_start[], symname##_end[]; \ - extern struct page *symname##_pages[] + /* text_mapping.pages is big enough for data/size page pointers */ + struct vm_special_mapping text_mapping; -#if defined CONFIG_X86_32 || defined CONFIG_COMPAT + unsigned long alt, alt_len; -#include <asm/vdso32.h> + unsigned long sym_end_mapping; /* Total size of the mapping */ -DECLARE_VDSO_IMAGE(vdso32_int80); -#ifdef CONFIG_COMPAT -DECLARE_VDSO_IMAGE(vdso32_syscall); + unsigned long sym_vvar_page; + unsigned long sym_hpet_page; + unsigned long sym_VDSO32_NOTE_MASK; + unsigned long sym___kernel_sigreturn; + unsigned long sym___kernel_rt_sigreturn; + unsigned long sym___kernel_vsyscall; + unsigned long sym_VDSO32_SYSENTER_RETURN; +}; + +#ifdef CONFIG_X86_64 +extern const struct vdso_image vdso_image_64; +#endif + +#ifdef CONFIG_X86_X32 +extern const struct vdso_image vdso_image_x32; #endif -DECLARE_VDSO_IMAGE(vdso32_sysenter); -/* - * Given a pointer to the vDSO image, find the pointer to VDSO32_name - * as that symbol is defined in the vDSO sources or linker script. 
- */ -#define VDSO32_SYMBOL(base, name) \ -({ \ - extern const char VDSO32_##name[]; \ - (void __user *)(VDSO32_##name + (unsigned long)(base)); \ -}) +#if defined CONFIG_X86_32 || defined CONFIG_COMPAT +extern const struct vdso_image vdso_image_32_int80; +#ifdef CONFIG_COMPAT +extern const struct vdso_image vdso_image_32_syscall; #endif +extern const struct vdso_image vdso_image_32_sysenter; -/* - * These symbols are defined with the addresses in the vsyscall page. - * See vsyscall-sigreturn.S. - */ -extern void __user __kernel_sigreturn; -extern void __user __kernel_rt_sigreturn; +extern const struct vdso_image *selected_vdso32; +#endif -void __init patch_vdso32(void *vdso, size_t len); +extern void __init init_vdso_image(const struct vdso_image *image); #endif /* __ASSEMBLER__ */ diff --git a/arch/x86/include/asm/vdso32.h b/arch/x86/include/asm/vdso32.h deleted file mode 100644 index 7efb7018406e..000000000000 --- a/arch/x86/include/asm/vdso32.h +++ /dev/null @@ -1,11 +0,0 @@ -#ifndef _ASM_X86_VDSO32_H -#define _ASM_X86_VDSO32_H - -#define VDSO_BASE_PAGE 0 -#define VDSO_VVAR_PAGE 1 -#define VDSO_HPET_PAGE 2 -#define VDSO_PAGES 3 -#define VDSO_PREV_PAGES 2 -#define VDSO_OFFSET(x) ((x) * PAGE_SIZE) - -#endif diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h index 081d909bc495..5d2b9ad2c6d2 100644 --- a/arch/x86/include/asm/vvar.h +++ b/arch/x86/include/asm/vvar.h @@ -29,31 +29,13 @@ #else -#ifdef BUILD_VDSO32 +extern char __vvar_page; #define DECLARE_VVAR(offset, type, name) \ extern type vvar_ ## name __attribute__((visibility("hidden"))); #define VVAR(name) (vvar_ ## name) -#else - -extern char __vvar_page; - -/* Base address of vvars. This is not ABI. 
*/ -#ifdef CONFIG_X86_64 -#define VVAR_ADDRESS (-10*1024*1024 - 4096) -#else -#define VVAR_ADDRESS (&__vvar_page) -#endif - -#define DECLARE_VVAR(offset, type, name) \ - static type const * const vvaraddr_ ## name = \ - (void *)(VVAR_ADDRESS + (offset)); - -#define VVAR(name) (*vvaraddr_ ## name) -#endif - #define DEFINE_VVAR(type, name) \ type name \ __attribute__((section(".vvar_" #name), aligned(16))) __visible diff --git a/arch/x86/include/uapi/asm/vsyscall.h b/arch/x86/include/uapi/asm/vsyscall.h index 85dc1b3825ab..b97dd6e263d2 100644 --- a/arch/x86/include/uapi/asm/vsyscall.h +++ b/arch/x86/include/uapi/asm/vsyscall.h @@ -7,11 +7,6 @@ enum vsyscall_num { __NR_vgetcpu, }; -#define VSYSCALL_START (-10UL << 20) -#define VSYSCALL_SIZE 1024 -#define VSYSCALL_END (-2UL << 20) -#define VSYSCALL_MAPPED_PAGES 1 -#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr)) - +#define VSYSCALL_ADDR (-10UL << 20) #endif /* _UAPI_ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f4d96000d33a..491ef3e59850 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o obj-$(CONFIG_X86_64) += vsyscall_64.o obj-$(CONFIG_X86_64) += vsyscall_emu_64.o +obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a135239badb7..2cbbf88d8f2c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -20,6 +20,7 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <asm/sections.h> +#include <asm/vsyscall.h> #include <linux/topology.h> #include <linux/cpumask.h> #include <asm/pgtable.h> @@ -953,6 +954,38 @@ static void vgetcpu_set_mode(void) else vgetcpu_mode = VGETCPU_LSL; } + +/* May not be __init: 
called during resume */ +static void syscall32_cpu_init(void) +{ + /* Load these always in case some future AMD CPU supports + SYSENTER from compat mode too. */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); + + wrmsrl(MSR_CSTAR, ia32_cstar_target); +} +#endif + +#ifdef CONFIG_X86_32 +void enable_sep_cpu(void) +{ + int cpu = get_cpu(); + struct tss_struct *tss = &per_cpu(init_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) { + put_cpu(); + return; + } + + tss->x86_tss.ss1 = __KERNEL_CS; + tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; + wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + put_cpu(); +} #endif void __init identify_boot_cpu(void) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a2a4f4697889..98313ffaae6a 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -527,6 +527,7 @@ syscall_exit: restore_all: TRACE_IRQS_IRET restore_all_notrace: +#ifdef CONFIG_X86_ESPFIX32 movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS # Warning: PT_OLDSS(%esp) contains the wrong/random values if we # are returning to the kernel. @@ -537,6 +538,7 @@ restore_all_notrace: cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax CFI_REMEMBER_STATE je ldt_ss # returning to user-space with LDT SS +#endif restore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code irq_return: @@ -549,13 +551,9 @@ ENTRY(iret_exc) .previous _ASM_EXTABLE(irq_return,iret_exc) +#ifdef CONFIG_X86_ESPFIX32 CFI_RESTORE_STATE ldt_ss: - larl PT_OLDSS(%esp), %eax - jnz restore_nocheck - testl $0x00400000, %eax # returning to 32bit stack? 
- jnz restore_nocheck # allright, normal return - #ifdef CONFIG_PARAVIRT /* * The kernel can't run on a non-flat stack if paravirt mode @@ -597,6 +595,7 @@ ldt_ss: lss (%esp), %esp /* switch to espfix segment */ CFI_ADJUST_CFA_OFFSET -8 jmp restore_nocheck +#endif CFI_ENDPROC ENDPROC(system_call) @@ -704,6 +703,7 @@ END(syscall_badsys) * the high word of the segment base from the GDT and swiches to the * normal stack and adjusts ESP with the matching offset. */ +#ifdef CONFIG_X86_ESPFIX32 /* fixup the stack */ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ @@ -713,8 +713,10 @@ END(syscall_badsys) pushl_cfi %eax lss (%esp), %esp /* switch to the normal stack segment */ CFI_ADJUST_CFA_OFFSET -8 +#endif .endm .macro UNWIND_ESPFIX_STACK +#ifdef CONFIG_X86_ESPFIX32 movl %ss, %eax /* see if on espfix stack */ cmpw $__ESPFIX_SS, %ax @@ -725,6 +727,7 @@ END(syscall_badsys) /* switch to normal stack */ FIXUP_ESPFIX_STACK 27: +#endif .endm /* @@ -1355,11 +1358,13 @@ END(debug) ENTRY(nmi) RING0_INT_FRAME ASM_CLAC +#ifdef CONFIG_X86_ESPFIX32 pushl_cfi %eax movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl_cfi %eax je nmi_espfix_stack +#endif cmpl $ia32_sysenter_target,(%esp) je nmi_stack_fixup pushl_cfi %eax @@ -1399,6 +1404,7 @@ nmi_debug_stack_check: FIX_STACK 24, nmi_stack_correct, 1 jmp nmi_stack_correct +#ifdef CONFIG_X86_ESPFIX32 nmi_espfix_stack: /* We have a RING0_INT_FRAME here. * @@ -1420,6 +1426,7 @@ nmi_espfix_stack: lss 12+4(%esp), %esp # back to espfix stack CFI_ADJUST_CFA_OFFSET -24 jmp irq_return +#endif CFI_ENDPROC END(nmi) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1e96c3628bf2..da0b9bdcc32e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -58,6 +58,7 @@ #include <asm/asm.h> #include <asm/context_tracking.h> #include <asm/smap.h> +#include <asm/pgtable_types.h> #include <linux/err.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. 
*/ @@ -1040,8 +1041,18 @@ restore_args: RESTORE_ARGS 1,8,1 irq_return: + /* + * Are we returning to a stack segment from the LDT? Note: in + * 64-bit mode SS:RSP on the exception stack is always valid. + */ +#ifdef CONFIG_X86_ESPFIX64 + testb $4,(SS-RIP)(%rsp) + jnz irq_return_ldt +#endif + +irq_return_iret: INTERRUPT_RETURN - _ASM_EXTABLE(irq_return, bad_iret) + _ASM_EXTABLE(irq_return_iret, bad_iret) #ifdef CONFIG_PARAVIRT ENTRY(native_iret) @@ -1049,6 +1060,32 @@ ENTRY(native_iret) _ASM_EXTABLE(native_iret, bad_iret) #endif +#ifdef CONFIG_X86_ESPFIX64 +irq_return_ldt: + pushq_cfi %rax + pushq_cfi %rdi + SWAPGS + movq PER_CPU_VAR(espfix_waddr),%rdi + movq %rax,(0*8)(%rdi) /* RAX */ + movq (2*8)(%rsp),%rax /* RIP */ + movq %rax,(1*8)(%rdi) + movq (3*8)(%rsp),%rax /* CS */ + movq %rax,(2*8)(%rdi) + movq (4*8)(%rsp),%rax /* RFLAGS */ + movq %rax,(3*8)(%rdi) + movq (6*8)(%rsp),%rax /* SS */ + movq %rax,(5*8)(%rdi) + movq (5*8)(%rsp),%rax /* RSP */ + movq %rax,(4*8)(%rdi) + andl $0xffff0000,%eax + popq_cfi %rdi + orq PER_CPU_VAR(espfix_stack),%rax + SWAPGS + movq %rax,%rsp + popq_cfi %rax + jmp irq_return_iret +#endif + .section .fixup,"ax" bad_iret: /* @@ -1110,9 +1147,45 @@ ENTRY(retint_kernel) call preempt_schedule_irq jmp exit_intr #endif - CFI_ENDPROC END(common_interrupt) + + /* + * If IRET takes a fault on the espfix stack, then we + * end up promoting it to a doublefault. In that case, + * modify the stack to make it look like we just entered + * the #GP handler from user space, similar to bad_iret. + */ +#ifdef CONFIG_X86_ESPFIX64 + ALIGN +__do_double_fault: + XCPT_FRAME 1 RDI+8 + movq RSP(%rdi),%rax /* Trap on the espfix stack? */ + sarq $PGDIR_SHIFT,%rax + cmpl $ESPFIX_PGD_ENTRY,%eax + jne do_double_fault /* No, just deliver the fault */ + cmpl $__KERNEL_CS,CS(%rdi) + jne do_double_fault + movq RIP(%rdi),%rax + cmpq $irq_return_iret,%rax +#ifdef CONFIG_PARAVIRT + je 1f + cmpq $native_iret,%rax +#endif + jne do_double_fault /* This shouldn't happen... 
*/ +1: + movq PER_CPU_VAR(kernel_stack),%rax + subq $(6*8-KERNEL_STACK_OFFSET),%rax /* Reset to original stack */ + movq %rax,RSP(%rdi) + movq $0,(%rax) /* Missing (lost) #GP error code */ + movq $general_protection,RIP(%rdi) + retq + CFI_ENDPROC +END(__do_double_fault) +#else +# define __do_double_fault do_double_fault +#endif + /* * End of kprobes section */ @@ -1314,7 +1387,7 @@ zeroentry overflow do_overflow zeroentry bounds do_bounds zeroentry invalid_op do_invalid_op zeroentry device_not_available do_device_not_available -paranoiderrorentry double_fault do_double_fault +paranoiderrorentry double_fault __do_double_fault zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun errorentry invalid_TSS do_invalid_TSS errorentry segment_not_present do_segment_not_present @@ -1601,7 +1674,7 @@ error_sti: */ error_kernelspace: incl %ebx - leaq irq_return(%rip),%rcx + leaq irq_return_iret(%rip),%rcx cmpq %rcx,RIP+8(%rsp) je error_swapgs movl %ecx,%eax /* zero extend */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c new file mode 100644 index 000000000000..6afbb16e9b79 --- /dev/null +++ b/arch/x86/kernel/espfix_64.c @@ -0,0 +1,209 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright 2014 Intel Corporation; author: H. Peter Anvin + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * ----------------------------------------------------------------------- */ + +/* + * The IRET instruction, when returning to a 16-bit segment, only + * restores the bottom 16 bits of the user space stack pointer. This + * causes some 16-bit software to break, but it also leaks kernel state + * to user space. + * + * This works around this by creating percpu "ministacks", each of which + * is mapped 2^16 times 64K apart. When we detect that the return SS is + * on the LDT, we copy the IRET frame to the ministack and use the + * relevant alias to return to userspace. The ministacks are mapped + * readonly, so if the IRET faults we promote #GP to #DF which is an IST + * vector and thus has its own stack; we then do the fixup in the #DF + * handler. + * + * This file sets up the ministacks and the related page tables. The + * actual ministack invocation is in entry_64.S. + */ + +#include <linux/init.h> +#include <linux/init_task.h> +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/gfp.h> +#include <linux/random.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> +#include <asm/setup.h> +#include <asm/espfix.h> + +/* + * Note: we only need 6*8 = 48 bytes for the espfix stack, but round + * it up to a cache line to avoid unnecessary sharing. + */ +#define ESPFIX_STACK_SIZE (8*8UL) +#define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) + +/* There is address space for how many espfix pages? */ +#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) + +#define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) +#if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS +# error "Need more than one PGD for the ESPFIX hack" +#endif + +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) + +/* This contains the *bottom* address of the espfix stack */ +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); +DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); + +/* Initialization mutex - should this be a spinlock? 
*/ +static DEFINE_MUTEX(espfix_init_mutex); + +/* Page allocation bitmap - each page serves ESPFIX_STACKS_PER_PAGE CPUs */ +#define ESPFIX_MAX_PAGES DIV_ROUND_UP(CONFIG_NR_CPUS, ESPFIX_STACKS_PER_PAGE) +static void *espfix_pages[ESPFIX_MAX_PAGES]; + +static __page_aligned_bss pud_t espfix_pud_page[PTRS_PER_PUD] + __aligned(PAGE_SIZE); + +static unsigned int page_random, slot_random; + +/* + * This returns the bottom address of the espfix stack for a specific CPU. + * The math allows for a non-power-of-two ESPFIX_STACK_SIZE, in which case + * we have to account for some amount of padding at the end of each page. + */ +static inline unsigned long espfix_base_addr(unsigned int cpu) +{ + unsigned long page, slot; + unsigned long addr; + + page = (cpu / ESPFIX_STACKS_PER_PAGE) ^ page_random; + slot = (cpu + slot_random) % ESPFIX_STACKS_PER_PAGE; + addr = (page << PAGE_SHIFT) + (slot * ESPFIX_STACK_SIZE); + addr = (addr & 0xffffUL) | ((addr & ~0xffffUL) << 16); + addr += ESPFIX_BASE_ADDR; + return addr; +} + +#define PTE_STRIDE (65536/PAGE_SIZE) +#define ESPFIX_PTE_CLONES (PTRS_PER_PTE/PTE_STRIDE) +#define ESPFIX_PMD_CLONES PTRS_PER_PMD +#define ESPFIX_PUD_CLONES (65536/(ESPFIX_PTE_CLONES*ESPFIX_PMD_CLONES)) + +#define PGTABLE_PROT ((_KERNPG_TABLE & ~_PAGE_RW) | _PAGE_NX) + +static void init_espfix_random(void) +{ + unsigned long rand; + + /* + * This is run before the entropy pools are initialized, + * but this is hopefully better than nothing. 
+ */ + if (!arch_get_random_long(&rand)) { + /* The constant is an arbitrary large prime */ + rdtscll(rand); + rand *= 0xc345c6b72fd16123UL; + } + + slot_random = rand % ESPFIX_STACKS_PER_PAGE; + page_random = (rand / ESPFIX_STACKS_PER_PAGE) + & (ESPFIX_PAGE_SPACE - 1); +} + +void __init init_espfix_bsp(void) +{ + pgd_t *pgd_p; + pteval_t ptemask; + + ptemask = __supported_pte_mask; + + /* Install the espfix pud into the kernel page directory */ + pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + + /* Randomize the locations */ + init_espfix_random(); + + /* The rest is the same as for any other processor */ + init_espfix_ap(); +} + +void init_espfix_ap(void) +{ + unsigned int cpu, page; + unsigned long addr; + pud_t pud, *pud_p; + pmd_t pmd, *pmd_p; + pte_t pte, *pte_p; + int n; + void *stack_page; + pteval_t ptemask; + + /* We only have to do this once... */ + if (likely(this_cpu_read(espfix_stack))) + return; /* Already initialized */ + + cpu = smp_processor_id(); + addr = espfix_base_addr(cpu); + page = cpu/ESPFIX_STACKS_PER_PAGE; + + /* Did another CPU already set this up? */ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (likely(stack_page)) + goto done; + + mutex_lock(&espfix_init_mutex); + + /* Did we race on the lock? 
*/ + stack_page = ACCESS_ONCE(espfix_pages[page]); + if (stack_page) + goto unlock_done; + + ptemask = __supported_pte_mask; + + pud_p = &espfix_pud_page[pud_index(addr)]; + pud = *pud_p; + if (!pud_present(pud)) { + pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PUD_CLONES; n++) + set_pud(&pud_p[n], pud); + } + + pmd_p = pmd_offset(&pud, addr); + pmd = *pmd_p; + if (!pmd_present(pmd)) { + pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); + paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PMD_CLONES; n++) + set_pmd(&pmd_p[n], pmd); + } + + pte_p = pte_offset_kernel(&pmd, addr); + stack_page = (void *)__get_free_page(GFP_KERNEL); + pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); + paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT); + for (n = 0; n < ESPFIX_PTE_CLONES; n++) + set_pte(&pte_p[n*PTE_STRIDE], pte); + + /* Job is done for this CPU and any CPU which shares this page */ + ACCESS_ONCE(espfix_pages[page]) = stack_page; + +unlock_done: + mutex_unlock(&espfix_init_mutex); +done: + this_cpu_write(espfix_stack, addr); + this_cpu_write(espfix_waddr, (unsigned long)stack_page + + (addr & ~PAGE_MASK)); +} diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4177bfbc80b0..a9777bb5b5a0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -74,9 +74,6 @@ static inline void hpet_writel(unsigned int d, unsigned int a) static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); -#ifdef CONFIG_X86_64 - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VVAR_NOCACHE); -#endif } static inline void hpet_clear_mapping(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index dcbbaa165bde..c37886d759cc 100644 --- a/arch/x86/kernel/ldt.c +++ 
b/arch/x86/kernel/ldt.c @@ -20,8 +20,6 @@ #include <asm/mmu_context.h> #include <asm/syscalls.h> -int sysctl_ldt16 = 0; - #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) { @@ -231,16 +229,10 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } } - /* - * On x86-64 we do not support 16-bit segments due to - * IRET leaking the high bits of the kernel stack address. - */ -#ifdef CONFIG_X86_64 - if (!ldt_info.seg_32bit && !sysctl_ldt16) { + if (!IS_ENABLED(CONFIG_X86_16BIT) && !ldt_info.seg_32bit) { error = -EINVAL; goto out_unlock; } -#endif fill_ldt(&ldt, &ldt_info); if (oldmode) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 9e5de6813e1f..a0da58db43a8 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -298,7 +298,8 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, } if (current->mm->context.vdso) - restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; else restorer = &frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -361,7 +362,8 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, save_altstack_ex(&frame->uc.uc_stack, regs->sp); /* Set up to return from userspace. 
*/ - restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 34826934d4a7..5d93ac1b72db 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -244,6 +244,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(); +#endif + + /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8b3b3eb3cead..ea5b5709aa76 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -91,7 +91,7 @@ static int addr_to_vsyscall_nr(unsigned long addr) { int nr; - if ((addr & ~0xC00UL) != VSYSCALL_START) + if ((addr & ~0xC00UL) != VSYSCALL_ADDR) return -EINVAL; nr = (addr & 0xC00UL) >> 10; @@ -330,24 +330,17 @@ void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - unsigned long physaddr_vvar_page = __pa_symbol(&__vvar_page); - __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_vsyscall, + __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, vsyscall_mode == NATIVE ? 
PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_FIRST_PAGE) != - (unsigned long)VSYSCALL_START); - - __set_fixmap(VVAR_PAGE, physaddr_vvar_page, PAGE_KERNEL_VVAR); - BUILD_BUG_ON((unsigned long)__fix_to_virt(VVAR_PAGE) != - (unsigned long)VVAR_ADDRESS); + BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != + (unsigned long)VSYSCALL_ADDR); } static int __init vsyscall_init(void) { - BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)); - cpu_notifier_register_begin(); on_each_cpu(cpu_vsyscall_init, NULL, 1); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 20621d753d5f..167ffcac16ed 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -30,12 +30,14 @@ struct pg_state { unsigned long start_address; unsigned long current_address; const struct addr_marker *marker; + unsigned long lines; bool to_dmesg; }; struct addr_marker { unsigned long start_address; const char *name; + unsigned long max_lines; }; /* indices for address_markers; keep sync'd w/ address_markers below */ @@ -46,6 +48,7 @@ enum address_markers_idx { LOW_KERNEL_NR, VMALLOC_START_NR, VMEMMAP_START_NR, + ESPFIX_START_NR, HIGH_KERNEL_NR, MODULES_VADDR_NR, MODULES_END_NR, @@ -68,6 +71,7 @@ static struct addr_marker address_markers[] = { { PAGE_OFFSET, "Low Kernel Mapping" }, { VMALLOC_START, "vmalloc() Area" }, { VMEMMAP_START, "Vmemmap" }, + { ESPFIX_BASE_ADDR, "ESPfix Area", 16 }, { __START_KERNEL_map, "High Kernel Mapping" }, { MODULES_VADDR, "Modules" }, { MODULES_END, "End Modules" }, @@ -182,7 +186,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, pgprot_t new_prot, int level) { pgprotval_t prot, cur; - static const char units[] = "KMGTPE"; + static const char units[] = "BKMGTPE"; /* * If we have a "break" in the series, we need to flush the state that @@ -197,6 +201,7 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_prot = 
new_prot; st->level = level; st->marker = address_markers; + st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); } else if (prot != cur || level != st->level || @@ -208,17 +213,24 @@ static void note_page(struct seq_file *m, struct pg_state *st, /* * Now print the actual finished series */ - pt_dump_seq_printf(m, st->to_dmesg, "0x%0*lx-0x%0*lx ", - width, st->start_address, - width, st->current_address); - - delta = (st->current_address - st->start_address) >> 10; - while (!(delta & 1023) && unit[1]) { - delta >>= 10; - unit++; + if (!st->marker->max_lines || + st->lines < st->marker->max_lines) { + pt_dump_seq_printf(m, st->to_dmesg, + "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); + + delta = st->current_address - st->start_address; + while (!(delta & 1023) && unit[1]) { + delta >>= 10; + unit++; + } + pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", + delta, *unit); + printk_prot(m, st->current_prot, st->level, + st->to_dmesg); } - pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ", delta, *unit); - printk_prot(m, st->current_prot, st->level, st->to_dmesg); + st->lines++; /* * We print markers for special areas of address space, @@ -226,7 +238,17 @@ static void note_page(struct seq_file *m, struct pg_state *st, * This helps in the interpretation. */ if (st->current_address >= st->marker[1].start_address) { + if (st->marker->max_lines && + st->lines > st->marker->max_lines) { + unsigned long nskip = + st->lines - st->marker->max_lines; + pt_dump_seq_printf(m, st->to_dmesg, + "... %lu entr%s skipped ... \n", + nskip, + nskip == 1 ? "y" : "ies"); + } st->marker++; + st->lines = 0; pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n", st->marker->name); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 8e5722992677..858b47b5221b 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,7 +18,8 @@ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... 
*/ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ -#include <asm/fixmap.h> /* VSYSCALL_START */ +#include <asm/fixmap.h> /* VSYSCALL_ADDR */ +#include <asm/vsyscall.h> /* emulate_vsyscall */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -771,7 +772,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, * emulation. */ if (unlikely((error_code & PF_INSTR) && - ((address & ~0xfff) == VSYSCALL_START))) { + ((address & ~0xfff) == VSYSCALL_ADDR))) { if (emulate_vsyscall(regs, address)) return; } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f35c66c5959a..bdcde58ca9ed 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1055,8 +1055,8 @@ void __init mem_init(void) after_bootmem = 1; /* Register memory areas for /proc/kcore */ - kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START, - VSYSCALL_END - VSYSCALL_START, KCORE_OTHER); + kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, + PAGE_SIZE, KCORE_OTHER); mem_init_print_info(NULL); } @@ -1185,11 +1185,19 @@ int kern_addr_valid(unsigned long addr) * covers the 64bit vsyscall page now. 
32bit has a real VMA now and does * not need special handling anymore: */ +static const char *gate_vma_name(struct vm_area_struct *vma) +{ + return "[vsyscall]"; +} +static struct vm_operations_struct gate_vma_ops = { + .name = gate_vma_name, +}; static struct vm_area_struct gate_vma = { - .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE), + .vm_start = VSYSCALL_ADDR, + .vm_end = VSYSCALL_ADDR + PAGE_SIZE, .vm_page_prot = PAGE_READONLY_EXEC, - .vm_flags = VM_READ | VM_EXEC + .vm_flags = VM_READ | VM_EXEC, + .vm_ops = &gate_vma_ops, }; struct vm_area_struct *get_gate_vma(struct mm_struct *mm) @@ -1218,16 +1226,7 @@ int in_gate_area(struct mm_struct *mm, unsigned long addr) */ int in_gate_area_no_mm(unsigned long addr) { - return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); -} - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - if (vma == &gate_vma) - return "[vsyscall]"; - return NULL; + return (addr & PAGE_MASK) == VSYSCALL_ADDR; } #ifdef CONFIG_X86_UV diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 597ac155c91c..6ef98c55a899 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -355,6 +355,12 @@ void __init early_ioremap_init(void) { pmd_t *pmd; +#ifdef CONFIG_X86_64 + BUILD_BUG_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#else + WARN_ON((fix_to_virt(0) + PAGE_SIZE) & ((1 << PMD_SHIFT) - 1)); +#endif + early_ioremap_setup(); pmd = early_ioremap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index c96314abd144..5f8bdda1d1ba 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -449,9 +449,9 @@ void __init reserve_top_address(unsigned long reserve) { #ifdef CONFIG_X86_32 BUG_ON(fixmaps_set > 0); - printk(KERN_INFO "Reserving virtual address space above 0x%08x\n", - (int)-reserve); - __FIXADDR_TOP = -reserve - PAGE_SIZE; 
+ __FIXADDR_TOP = round_down(-reserve, 1 << PMD_SHIFT) - PAGE_SIZE; + printk(KERN_INFO "Reserving virtual address space above 0x%08lx (rounded to 0x%08lx)\n", + -reserve, __FIXADDR_TOP + PAGE_SIZE); #endif } diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index af91901babb8..916cda4cd5b4 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -12,7 +12,7 @@ #include <asm/page.h> #include <linux/init.h> -unsigned int __read_mostly vdso_enabled = 1; +static unsigned int __read_mostly vdso_enabled = 1; unsigned long um_vdso_addr; extern unsigned long task_size; diff --git a/arch/x86/vdso/.gitignore b/arch/x86/vdso/.gitignore index 3282874bc61d..aae8ffdd5880 100644 --- a/arch/x86/vdso/.gitignore +++ b/arch/x86/vdso/.gitignore @@ -1,8 +1,7 @@ vdso.lds -vdso-syms.lds vdsox32.lds -vdsox32-syms.lds -vdso32-syms.lds vdso32-syscall-syms.lds vdso32-sysenter-syms.lds vdso32-int80-syms.lds +vdso-image-*.c +vdso2c diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index c580d1210ffe..895d4b16b7e3 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -24,15 +24,30 @@ vobj64s := $(filter-out $(vobjx32s-compat),$(vobjs-y)) # files to link into kernel obj-y += vma.o -obj-$(VDSO64-y) += vdso.o -obj-$(VDSOX32-y) += vdsox32.o -obj-$(VDSO32-y) += vdso32.o vdso32-setup.o + +# vDSO images to build +vdso_img-$(VDSO64-y) += 64 +vdso_img-$(VDSOX32-y) += x32 +vdso_img-$(VDSO32-y) += 32-int80 +vdso_img-$(CONFIG_COMPAT) += 32-syscall +vdso_img-$(VDSO32-y) += 32-sysenter + +obj-$(VDSO32-y) += vdso32-setup.o vobjs := $(foreach F,$(vobj64s),$(obj)/$F) $(obj)/vdso.o: $(obj)/vdso.so -targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) +targets += vdso.lds $(vobjs-y) + +# Build the vDSO image C files and link them in. 
+vdso_img_objs := $(vdso_img-y:%=vdso-image-%.o) +vdso_img_cfiles := $(vdso_img-y:%=vdso-image-%.c) +vdso_img_sodbg := $(vdso_img-y:%=vdso%.so.dbg) +obj-y += $(vdso_img_objs) +targets += $(vdso_img_cfiles) +targets += $(vdso_img_sodbg) +.SECONDARY: $(vdso_img-y:%=$(obj)/vdso-image-%.c) export CPPFLAGS_vdso.lds += -P -C @@ -41,14 +56,18 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 \ $(DISABLE_LTO) -$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so - -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE +$(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE $(call if_changed,vdso) -$(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg FORCE - $(call if_changed,objcopy) +hostprogs-y += vdso2c + +quiet_cmd_vdso2c = VDSO2C $@ +define cmd_vdso2c + $(obj)/vdso2c $< $@ +endef + +$(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE + $(call if_changed,vdso2c) # # Don't omit frame pointers for ease of userspace debugging, but do @@ -68,22 +87,6 @@ CFLAGS_REMOVE_vclock_gettime.o = -pg CFLAGS_REMOVE_vgetcpu.o = -pg CFLAGS_REMOVE_vvar.o = -pg -targets += vdso-syms.lds -obj-$(VDSO64-y) += vdso-syms.lds - -# -# Match symbols in the DSO that look like VDSO*; produce a file of constants. -# -sed-vdsosym := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' -quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ -endef - -$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE - $(call if_changed,vdsosym) - # # X32 processes use x32 vDSO to access 64bit kernel data. # @@ -94,9 +97,6 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE # so that it can reach 64bit address space with 64bit pointers. 
# -targets += vdsox32-syms.lds -obj-$(VDSOX32-y) += vdsox32-syms.lds - CPPFLAGS_vdsox32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdsox32.lds = -Wl,-m,elf32_x86_64 \ -Wl,-soname=linux-vdso.so.1 \ @@ -113,9 +113,7 @@ quiet_cmd_x32 = X32 $@ $(obj)/%-x32.o: $(obj)/%.o FORCE $(call if_changed,x32) -targets += vdsox32.so vdsox32.so.dbg vdsox32.lds $(vobjx32s-y) - -$(obj)/vdsox32.o: $(src)/vdsox32.S $(obj)/vdsox32.so +targets += vdsox32.lds $(vobjx32s-y) $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) @@ -123,7 +121,6 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE # # Build multiple 32-bit vDSO images to choose from at boot time. # -obj-$(VDSO32-y) += vdso32-syms.lds vdso32.so-$(VDSO32-y) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter @@ -138,10 +135,8 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += $(vdso32-images) $(vdso32-images:=.dbg) targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) - -extra-y += $(vdso32-images) +targets += vdso32/vclock_gettime.o $(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) @@ -166,27 +161,6 @@ $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/%.o $(call if_changed,vdso) -# Make vdso32-*-syms.lds from each image, and then make sure they match. -# The only difference should be that some do not define VDSO32_SYSENTER_RETURN. 
- -targets += vdso32-syms.lds $(vdso32.so-y:%=vdso32-%-syms.lds) - -quiet_cmd_vdso32sym = VDSOSYM $@ -define cmd_vdso32sym - if LC_ALL=C sort -u $(filter-out FORCE,$^) > $(@D)/.tmp_$(@F) && \ - $(foreach H,$(filter-out FORCE,$^),\ - if grep -q VDSO32_SYSENTER_RETURN $H; \ - then diff -u $(@D)/.tmp_$(@F) $H; \ - else sed /VDSO32_SYSENTER_RETURN/d $(@D)/.tmp_$(@F) | \ - diff -u - $H; fi &&) : ;\ - then mv -f $(@D)/.tmp_$(@F) $@; \ - else rm -f $(@D)/.tmp_$(@F); exit 1; \ - fi -endef - -$(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE - $(call if_changed,vdso32sym) - # # The DSO images are built using a special linker script. # @@ -197,7 +171,7 @@ quiet_cmd_vdso = VDSO $@ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ - $(LTO_CFLAGS) + -Wl,-Bsymbolic $(LTO_CFLAGS) GCOV_PROFILE := n # diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 16d686171e9a..b2e4f493e5b0 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -30,9 +30,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); #ifdef CONFIG_HPET_TIMER -static inline u32 read_hpet_counter(const volatile void *addr) +extern u8 hpet_page + __attribute__((visibility("hidden"))); + +static notrace cycle_t vread_hpet(void) { - return *(const volatile u32 *) (addr + HPET_COUNTER); + return *(const volatile u32 *)(&hpet_page + HPET_COUNTER); } #endif @@ -43,11 +46,6 @@ static inline u32 read_hpet_counter(const volatile void *addr) #include <asm/fixmap.h> #include <asm/pvclock.h> -static notrace cycle_t vread_hpet(void) -{ - return read_hpet_counter((const void *)fix_to_virt(VSYSCALL_HPET)); -} - notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -137,16 +135,6 @@ static notrace cycle_t vread_pvclock(int *mode) #else -extern u8 hpet_page - 
__attribute__((visibility("hidden"))); - -#ifdef CONFIG_HPET_TIMER -static notrace cycle_t vread_hpet(void) -{ - return read_hpet_counter((const void *)(&hpet_page)); -} -#endif - notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -154,7 +142,7 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) asm( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" - "call VDSO32_vsyscall \n" + "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret) : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) @@ -169,7 +157,7 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) asm( "mov %%ebx, %%edx \n" "mov %2, %%ebx \n" - "call VDSO32_vsyscall \n" + "call __kernel_vsyscall \n" "mov %%edx, %%ebx \n" : "=a" (ret) : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 9df017ab2285..2ec72f651ebf 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -1,3 +1,5 @@ +#include <asm/vdso.h> + /* * Linker script for vDSO. This is an ELF shared object prelinked to * its virtual address, and with only one read-only segment. @@ -6,20 +8,6 @@ SECTIONS { -#ifdef BUILD_VDSO32 -#include <asm/vdso32.h> - - hpet_page = . - VDSO_OFFSET(VDSO_HPET_PAGE); - - vvar = . - VDSO_OFFSET(VDSO_VVAR_PAGE); - - /* Place all vvars at the offsets in asm/vvar.h. */ -#define EMIT_VVAR(name, offset) vvar_ ## name = vvar + offset; -#define __VVAR_KERNEL_LDS -#include <asm/vvar.h> -#undef __VVAR_KERNEL_LDS -#undef EMIT_VVAR -#endif . = SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -60,10 +48,30 @@ SECTIONS .text : { *(.text*) } :text =0x90909090, /* - * The comma above works around a bug in gold: - * https://sourceware.org/bugzilla/show_bug.cgi?id=16804 + * The remainder of the vDSO consists of special pages that are + * shared between the kernel and userspace. 
It needs to be at the + * end so that it doesn't overlap the mapping of the actual + * vDSO image. */ + . = ALIGN(PAGE_SIZE); + vvar_page = .; + + /* Place all vvars at the offsets in asm/vvar.h. */ +#define EMIT_VVAR(name, offset) vvar_ ## name = vvar_page + offset; +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS +#undef EMIT_VVAR + + . = vvar_page + PAGE_SIZE; + + hpet_page = .; + . = . + PAGE_SIZE; + + . = ALIGN(PAGE_SIZE); + end_mapping = .; + /DISCARD/ : { *(.discard) *(.discard.*) diff --git a/arch/x86/vdso/vdso.S b/arch/x86/vdso/vdso.S deleted file mode 100644 index be3f23b09af5..000000000000 --- a/arch/x86/vdso/vdso.S +++ /dev/null @@ -1,3 +0,0 @@ -#include <asm/vdso.h> - -DEFINE_VDSO_IMAGE(vdso, "arch/x86/vdso/vdso.so") diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index b96b2677cad8..75e3404c83b1 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -1,14 +1,11 @@ /* * Linker script for 64-bit vDSO. * We #include the file to define the layout details. - * Here we only choose the prelinked virtual address. * * This file defines the version script giving the user-exported symbols in - * the DSO. We can define local symbols here called VDSO* to make their - * values visible using the asm-x86/vdso.h macros from the kernel proper. + * the DSO. */ -#define VDSO_PRELINK 0xffffffffff700000 #include "vdso-layout.lds.S" /* @@ -28,5 +25,3 @@ VERSION { local: *; }; } - -VDSO64_PRELINK = VDSO_PRELINK; diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c new file mode 100644 index 000000000000..81edd1ec9df8 --- /dev/null +++ b/arch/x86/vdso/vdso2c.c @@ -0,0 +1,156 @@ +#include <inttypes.h> +#include <stdint.h> +#include <unistd.h> +#include <stdarg.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <fcntl.h> +#include <err.h> + +#include <sys/mman.h> +#include <sys/types.h> + +#include <linux/elf.h> +#include <linux/types.h> + +/* Symbols that we need in vdso2c. 
*/ +enum { + sym_vvar_page, + sym_hpet_page, + sym_end_mapping, +}; + +const int special_pages[] = { + sym_vvar_page, + sym_hpet_page, +}; + +char const * const required_syms[] = { + [sym_vvar_page] = "vvar_page", + [sym_hpet_page] = "hpet_page", + [sym_end_mapping] = "end_mapping", + "VDSO32_NOTE_MASK", + "VDSO32_SYSENTER_RETURN", + "__kernel_vsyscall", + "__kernel_sigreturn", + "__kernel_rt_sigreturn", +}; + +__attribute__((format(printf, 1, 2))) __attribute__((noreturn)) +static void fail(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + fprintf(stderr, "Error: "); + vfprintf(stderr, format, ap); + exit(1); + va_end(ap); +} + +#define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) + +#define BITS 64 +#define GOFUNC go64 +#define Elf_Ehdr Elf64_Ehdr +#define Elf_Shdr Elf64_Shdr +#define Elf_Phdr Elf64_Phdr +#define Elf_Sym Elf64_Sym +#define Elf_Dyn Elf64_Dyn +#include "vdso2c.h" +#undef BITS +#undef GOFUNC +#undef Elf_Ehdr +#undef Elf_Shdr +#undef Elf_Phdr +#undef Elf_Sym +#undef Elf_Dyn + +#define BITS 32 +#define GOFUNC go32 +#define Elf_Ehdr Elf32_Ehdr +#define Elf_Shdr Elf32_Shdr +#define Elf_Phdr Elf32_Phdr +#define Elf_Sym Elf32_Sym +#define Elf_Dyn Elf32_Dyn +#include "vdso2c.h" +#undef BITS +#undef GOFUNC +#undef Elf_Ehdr +#undef Elf_Shdr +#undef Elf_Phdr +#undef Elf_Sym +#undef Elf_Dyn + +static int go(void *addr, size_t len, FILE *outfile, const char *name) +{ + Elf64_Ehdr *hdr = (Elf64_Ehdr *)addr; + + if (hdr->e_ident[EI_CLASS] == ELFCLASS64) { + return go64(addr, len, outfile, name); + } else if (hdr->e_ident[EI_CLASS] == ELFCLASS32) { + return go32(addr, len, outfile, name); + } else { + fprintf(stderr, "Error: unknown ELF class\n"); + return 1; + } +} + +int main(int argc, char **argv) +{ + int fd; + off_t len; + void *addr; + FILE *outfile; + int ret; + char *name, *tmp; + int namelen; + + if (argc != 3) { + printf("Usage: vdso2c INPUT OUTPUT\n"); + return 1; + } + + /* + * Figure out the struct name. 
If we're writing to a .so file, + * generate raw output instead. + */ + name = strdup(argv[2]); + namelen = strlen(name); + if (namelen >= 3 && !strcmp(name + namelen - 3, ".so")) { + name = NULL; + } else { + tmp = strrchr(name, '/'); + if (tmp) + name = tmp + 1; + tmp = strchr(name, '.'); + if (tmp) + *tmp = '\0'; + for (tmp = name; *tmp; tmp++) + if (*tmp == '-') + *tmp = '_'; + } + + fd = open(argv[1], O_RDONLY); + if (fd == -1) + err(1, "%s", argv[1]); + + len = lseek(fd, 0, SEEK_END); + if (len == (off_t)-1) + err(1, "lseek"); + + addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) + err(1, "mmap"); + + outfile = fopen(argv[2], "w"); + if (!outfile) + err(1, "%s", argv[2]); + + ret = go(addr, (size_t)len, outfile, name); + + munmap(addr, len); + fclose(outfile); + + return ret; +} diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h new file mode 100644 index 000000000000..3dcc61e796e9 --- /dev/null +++ b/arch/x86/vdso/vdso2c.h @@ -0,0 +1,157 @@ +/* + * This file is included twice from vdso2c.c. It generates code for 32-bit + * and 64-bit vDSOs. We need both for 64-bit builds, since 32-bit vDSOs + * are built for 32-bit userspace. + */ + +static int GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) +{ + int found_load = 0; + unsigned long load_size = -1; /* Work around bogus warning */ + unsigned long data_size; + Elf_Ehdr *hdr = (Elf_Ehdr *)addr; + int i; + unsigned long j; + Elf_Shdr *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + *alt_sec = NULL; + Elf_Dyn *dyn = 0, *dyn_end = 0; + const char *secstrings; + uint64_t syms[NSYMS] = {}; + + Elf_Phdr *pt = (Elf_Phdr *)(addr + hdr->e_phoff); + + /* Walk the segment table. 
*/ + for (i = 0; i < hdr->e_phnum; i++) { + if (pt[i].p_type == PT_LOAD) { + if (found_load) + fail("multiple PT_LOAD segs\n"); + + if (pt[i].p_offset != 0 || pt[i].p_vaddr != 0) + fail("PT_LOAD in wrong place\n"); + + if (pt[i].p_memsz != pt[i].p_filesz) + fail("cannot handle memsz != filesz\n"); + + load_size = pt[i].p_memsz; + found_load = 1; + } else if (pt[i].p_type == PT_DYNAMIC) { + dyn = addr + pt[i].p_offset; + dyn_end = addr + pt[i].p_offset + pt[i].p_memsz; + } + } + if (!found_load) + fail("no PT_LOAD seg\n"); + data_size = (load_size + 4095) / 4096 * 4096; + + /* Walk the dynamic table */ + for (i = 0; dyn + i < dyn_end && dyn[i].d_tag != DT_NULL; i++) { + if (dyn[i].d_tag == DT_REL || dyn[i].d_tag == DT_RELSZ || + dyn[i].d_tag == DT_RELENT || dyn[i].d_tag == DT_TEXTREL) + fail("vdso image contains dynamic relocations\n"); + } + + /* Walk the section table */ + secstrings_hdr = addr + hdr->e_shoff + hdr->e_shentsize*hdr->e_shstrndx; + secstrings = addr + secstrings_hdr->sh_offset; + for (i = 0; i < hdr->e_shnum; i++) { + Elf_Shdr *sh = addr + hdr->e_shoff + hdr->e_shentsize * i; + if (sh->sh_type == SHT_SYMTAB) + symtab_hdr = sh; + + if (!strcmp(secstrings + sh->sh_name, ".altinstructions")) + alt_sec = sh; + } + + if (!symtab_hdr) { + fail("no symbol table\n"); + return 1; + } + + strtab_hdr = addr + hdr->e_shoff + + hdr->e_shentsize * symtab_hdr->sh_link; + + /* Walk the symbol table */ + for (i = 0; i < symtab_hdr->sh_size / symtab_hdr->sh_entsize; i++) { + int k; + Elf_Sym *sym = addr + symtab_hdr->sh_offset + + symtab_hdr->sh_entsize * i; + const char *name = addr + strtab_hdr->sh_offset + sym->st_name; + for (k = 0; k < NSYMS; k++) { + if (!strcmp(name, required_syms[k])) { + if (syms[k]) { + fail("duplicate symbol %s\n", + required_syms[k]); + } + syms[k] = sym->st_value; + } + } + } + + /* Validate mapping addresses. 
*/ + for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { + if (!syms[i]) + continue; /* The mapping isn't used; ignore it. */ + + if (syms[i] % 4096) + fail("%s must be a multiple of 4096\n", + required_syms[i]); + if (syms[i] < data_size) + fail("%s must be after the text mapping\n", + required_syms[i]); + if (syms[sym_end_mapping] < syms[i] + 4096) + fail("%s overruns end_mapping\n", required_syms[i]); + } + if (syms[sym_end_mapping] % 4096) + fail("end_mapping must be a multiple of 4096\n"); + + /* Remove sections. */ + hdr->e_shoff = 0; + hdr->e_shentsize = 0; + hdr->e_shnum = 0; + hdr->e_shstrndx = SHN_UNDEF; + + if (!name) { + fwrite(addr, load_size, 1, outfile); + return 0; + } + + fprintf(outfile, "/* AUTOMATICALLY GENERATED -- DO NOT EDIT */\n\n"); + fprintf(outfile, "#include <linux/linkage.h>\n"); + fprintf(outfile, "#include <asm/page_types.h>\n"); + fprintf(outfile, "#include <asm/vdso.h>\n"); + fprintf(outfile, "\n"); + fprintf(outfile, + "static unsigned char raw_data[%lu] __page_aligned_data = {", + data_size); + for (j = 0; j < load_size; j++) { + if (j % 10 == 0) + fprintf(outfile, "\n\t"); + fprintf(outfile, "0x%02X, ", (int)((unsigned char *)addr)[j]); + } + fprintf(outfile, "\n};\n\n"); + + fprintf(outfile, "static struct page *pages[%lu];\n\n", + data_size / 4096); + + fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "\t.data = raw_data,\n"); + fprintf(outfile, "\t.size = %lu,\n", data_size); + fprintf(outfile, "\t.text_mapping = {\n"); + fprintf(outfile, "\t\t.name = \"[vdso]\",\n"); + fprintf(outfile, "\t\t.pages = pages,\n"); + fprintf(outfile, "\t},\n"); + if (alt_sec) { + fprintf(outfile, "\t.alt = %lu,\n", + (unsigned long)alt_sec->sh_offset); + fprintf(outfile, "\t.alt_len = %lu,\n", + (unsigned long)alt_sec->sh_size); + } + for (i = 0; i < NSYMS; i++) { + if (syms[i]) + fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n", + required_syms[i], syms[i]); + } + fprintf(outfile, "};\n"); + + 
return 0; +} diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 310c5f0dbef1..e4f7781ee162 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -8,27 +8,12 @@ #include <linux/init.h> #include <linux/smp.h> -#include <linux/thread_info.h> -#include <linux/sched.h> -#include <linux/gfp.h> -#include <linux/string.h> -#include <linux/elf.h> -#include <linux/mm.h> -#include <linux/err.h> -#include <linux/module.h> -#include <linux/slab.h> +#include <linux/kernel.h> +#include <linux/mm_types.h> #include <asm/cpufeature.h> -#include <asm/msr.h> -#include <asm/pgtable.h> -#include <asm/unistd.h> -#include <asm/elf.h> -#include <asm/tlbflush.h> +#include <asm/processor.h> #include <asm/vdso.h> -#include <asm/proto.h> -#include <asm/fixmap.h> -#include <asm/hpet.h> -#include <asm/vvar.h> #ifdef CONFIG_COMPAT_VDSO #define VDSO_DEFAULT 0 @@ -36,23 +21,17 @@ #define VDSO_DEFAULT 1 #endif -#ifdef CONFIG_X86_64 -#define vdso_enabled sysctl_vsyscall32 -#define arch_setup_additional_pages syscall32_setup_pages -extern int sysctl_ldt16; -#endif - /* * Should the kernel map a VDSO page into processes and pass its * address down to glibc upon exec()? */ -unsigned int __read_mostly vdso_enabled = VDSO_DEFAULT; +unsigned int __read_mostly vdso32_enabled = VDSO_DEFAULT; -static int __init vdso_setup(char *s) +static int __init vdso32_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso32_enabled = simple_strtoul(s, NULL, 0); - if (vdso_enabled > 1) + if (vdso32_enabled > 1) pr_warn("vdso32 values other than 0 and 1 are no longer allowed; vdso disabled\n"); return 1; @@ -63,178 +42,45 @@ static int __init vdso_setup(char *s) * behavior on both 64-bit and 32-bit kernels. * On 32-bit kernels, vdso=[012] means the same thing. 
*/ -__setup("vdso32=", vdso_setup); +__setup("vdso32=", vdso32_setup); #ifdef CONFIG_X86_32 -__setup_param("vdso=", vdso32_setup, vdso_setup, 0); - -EXPORT_SYMBOL_GPL(vdso_enabled); +__setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -static struct page **vdso32_pages; -static unsigned vdso32_size; - #ifdef CONFIG_X86_64 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) #define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) -/* May not be __init: called during resume */ -void syscall32_cpu_init(void) -{ - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} - #else /* CONFIG_X86_32 */ #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) #define vdso32_syscall() (0) -void enable_sep_cpu(void) -{ - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); - - if (!boot_cpu_has(X86_FEATURE_SEP)) { - put_cpu(); - return; - } - - tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); - put_cpu(); -} - #endif /* CONFIG_X86_64 */ +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +const struct vdso_image *selected_vdso32; +#endif + int __init sysenter_setup(void) { - char *vdso32_start, *vdso32_end; - int npages, i; - #ifdef CONFIG_COMPAT - if (vdso32_syscall()) { - vdso32_start = vdso32_syscall_start; - vdso32_end = vdso32_syscall_end; - vdso32_pages = vdso32_syscall_pages; - } else + if (vdso32_syscall()) + selected_vdso32 = &vdso_image_32_syscall; + else #endif - if (vdso32_sysenter()) { - vdso32_start = vdso32_sysenter_start; - vdso32_end = 
vdso32_sysenter_end; - vdso32_pages = vdso32_sysenter_pages; - } else { - vdso32_start = vdso32_int80_start; - vdso32_end = vdso32_int80_end; - vdso32_pages = vdso32_int80_pages; - } - - npages = ((vdso32_end - vdso32_start) + PAGE_SIZE - 1) / PAGE_SIZE; - vdso32_size = npages << PAGE_SHIFT; - for (i = 0; i < npages; i++) - vdso32_pages[i] = virt_to_page(vdso32_start + i*PAGE_SIZE); + if (vdso32_sysenter()) + selected_vdso32 = &vdso_image_32_sysenter; + else + selected_vdso32 = &vdso_image_32_int80; - patch_vdso32(vdso32_start, vdso32_size); + init_vdso_image(selected_vdso32); return 0; } -/* Setup a VMA at program startup for the vsyscall page */ -int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) -{ - struct mm_struct *mm = current->mm; - unsigned long addr; - int ret = 0; - struct vm_area_struct *vma; - static struct page *no_pages[] = {NULL}; - -#ifdef CONFIG_X86_X32_ABI - if (test_thread_flag(TIF_X32)) - return x32_setup_additional_pages(bprm, uses_interp); -#endif - - if (vdso_enabled != 1) /* Other values all mean "disabled" */ - return 0; - - down_write(&mm->mmap_sem); - - addr = get_unmapped_area(NULL, 0, vdso32_size + VDSO_OFFSET(VDSO_PREV_PAGES), 0, 0); - if (IS_ERR_VALUE(addr)) { - ret = addr; - goto up_fail; - } - - addr += VDSO_OFFSET(VDSO_PREV_PAGES); - - current->mm->context.vdso = (void *)addr; - - /* - * MAYWRITE to allow gdb to COW and set breakpoints - */ - ret = install_special_mapping(mm, - addr, - vdso32_size, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - vdso32_pages); - - if (ret) - goto up_fail; - - vma = _install_special_mapping(mm, - addr - VDSO_OFFSET(VDSO_PREV_PAGES), - VDSO_OFFSET(VDSO_PREV_PAGES), - VM_READ, - no_pages); - - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto up_fail; - } - - ret = remap_pfn_range(vma, - addr - VDSO_OFFSET(VDSO_VVAR_PAGE), - __pa_symbol(&__vvar_page) >> PAGE_SHIFT, - PAGE_SIZE, - PAGE_READONLY); - - if (ret) - goto up_fail; - -#ifdef CONFIG_HPET_TIMER - if 
(hpet_address) { - ret = io_remap_pfn_range(vma, - addr - VDSO_OFFSET(VDSO_HPET_PAGE), - hpet_address >> PAGE_SHIFT, - PAGE_SIZE, - pgprot_noncached(PAGE_READONLY)); - - if (ret) - goto up_fail; - } -#endif - - current_thread_info()->sysenter_return = - VDSO32_SYMBOL(addr, SYSENTER_RETURN); - - up_fail: - if (ret) - current->mm->context.vdso = NULL; - - up_write(&mm->mmap_sem); - - return ret; -} - #ifdef CONFIG_X86_64 subsys_initcall(sysenter_setup); @@ -246,14 +92,7 @@ subsys_initcall(sysenter_setup); static struct ctl_table abi_table2[] = { { .procname = "vsyscall32", - .data = &sysctl_vsyscall32, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, - { - .procname = "ldt16", - .data = &sysctl_ldt16, + .data = &vdso32_enabled, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -280,13 +119,6 @@ __initcall(ia32_binfmt_init); #else /* CONFIG_X86_32 */ -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso) - return "[vdso]"; - return NULL; -} - struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { return NULL; diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S deleted file mode 100644 index 018bcd9f97b4..000000000000 --- a/arch/x86/vdso/vdso32.S +++ /dev/null @@ -1,9 +0,0 @@ -#include <asm/vdso.h> - -DEFINE_VDSO_IMAGE(vdso32_int80, "arch/x86/vdso/vdso32-int80.so") - -#ifdef CONFIG_COMPAT -DEFINE_VDSO_IMAGE(vdso32_syscall, "arch/x86/vdso/vdso32-syscall.so") -#endif - -DEFINE_VDSO_IMAGE(vdso32_sysenter, "arch/x86/vdso/vdso32-sysenter.so") diff --git a/arch/x86/vdso/vdso32/vdso32.lds.S b/arch/x86/vdso/vdso32/vdso32.lds.S index aadb8b9994cd..31056cf294bf 100644 --- a/arch/x86/vdso/vdso32/vdso32.lds.S +++ b/arch/x86/vdso/vdso32/vdso32.lds.S @@ -1,17 +1,14 @@ /* * Linker script for 32-bit vDSO. * We #include the file to define the layout details. - * Here we only choose the prelinked virtual address. 
* * This file defines the version script giving the user-exported symbols in - * the DSO. We can define local symbols here called VDSO* to make their - * values visible using the asm-x86/vdso.h macros from the kernel proper. + * the DSO. */ #include <asm/page.h> #define BUILD_VDSO32 -#define VDSO_PRELINK 0 #include "../vdso-layout.lds.S" @@ -38,13 +35,3 @@ VERSION local: *; }; } - -/* - * Symbols we define here called VDSO* get their values into vdso32-syms.h. - */ -VDSO32_vsyscall = __kernel_vsyscall; -VDSO32_sigreturn = __kernel_sigreturn; -VDSO32_rt_sigreturn = __kernel_rt_sigreturn; -VDSO32_clock_gettime = clock_gettime; -VDSO32_gettimeofday = gettimeofday; -VDSO32_time = time; diff --git a/arch/x86/vdso/vdsox32.S b/arch/x86/vdso/vdsox32.S deleted file mode 100644 index f4aa34e7f370..000000000000 --- a/arch/x86/vdso/vdsox32.S +++ /dev/null @@ -1,3 +0,0 @@ -#include <asm/vdso.h> - -DEFINE_VDSO_IMAGE(vdsox32, "arch/x86/vdso/vdsox32.so") diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S index 62272aa2ae0a..46b991b578a8 100644 --- a/arch/x86/vdso/vdsox32.lds.S +++ b/arch/x86/vdso/vdsox32.lds.S @@ -1,14 +1,11 @@ /* * Linker script for x32 vDSO. * We #include the file to define the layout details. - * Here we only choose the prelinked virtual address. * * This file defines the version script giving the user-exported symbols in - * the DSO. We can define local symbols here called VDSO* to make their - * values visible using the asm-x86/vdso.h macros from the kernel proper. + * the DSO. 
*/ -#define VDSO_PRELINK 0 #include "vdso-layout.lds.S" /* @@ -24,5 +21,3 @@ VERSION { local: *; }; } - -VDSOX32_PRELINK = VDSO_PRELINK; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 1ad102613127..e1513c47872a 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -15,115 +15,51 @@ #include <asm/proto.h> #include <asm/vdso.h> #include <asm/page.h> +#include <asm/hpet.h> #if defined(CONFIG_X86_64) -unsigned int __read_mostly vdso_enabled = 1; +unsigned int __read_mostly vdso64_enabled = 1; -DECLARE_VDSO_IMAGE(vdso); extern unsigned short vdso_sync_cpuid; -static unsigned vdso_size; - -#ifdef CONFIG_X86_X32_ABI -DECLARE_VDSO_IMAGE(vdsox32); -static unsigned vdsox32_size; -#endif #endif -#if defined(CONFIG_X86_32) || defined(CONFIG_X86_X32_ABI) || \ - defined(CONFIG_COMPAT) -void __init patch_vdso32(void *vdso, size_t len) +void __init init_vdso_image(const struct vdso_image *image) { - Elf32_Ehdr *hdr = vdso; - Elf32_Shdr *sechdrs, *alt_sec = 0; - char *secstrings; - void *alt_data; int i; + int npages = (image->size) / PAGE_SIZE; - BUG_ON(len < sizeof(Elf32_Ehdr)); - BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); - - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (i = 1; i < hdr->e_shnum; i++) { - Elf32_Shdr *shdr = &sechdrs[i]; - if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { - alt_sec = shdr; - goto found; - } - } - - /* If we get here, it's probably a bug. 
*/ - pr_warning("patch_vdso32: .altinstructions not found\n"); - return; /* nothing to patch */ + BUG_ON(image->size % PAGE_SIZE != 0); + for (i = 0; i < npages; i++) + image->text_mapping.pages[i] = + virt_to_page(image->data + i*PAGE_SIZE); -found: - alt_data = (void *)hdr + alt_sec->sh_offset; - apply_alternatives(alt_data, alt_data + alt_sec->sh_size); + apply_alternatives((struct alt_instr *)(image->data + image->alt), + (struct alt_instr *)(image->data + image->alt + + image->alt_len)); } -#endif #if defined(CONFIG_X86_64) -static void __init patch_vdso64(void *vdso, size_t len) -{ - Elf64_Ehdr *hdr = vdso; - Elf64_Shdr *sechdrs, *alt_sec = 0; - char *secstrings; - void *alt_data; - int i; - - BUG_ON(len < sizeof(Elf64_Ehdr)); - BUG_ON(memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0); - - sechdrs = (void *)hdr + hdr->e_shoff; - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - - for (i = 1; i < hdr->e_shnum; i++) { - Elf64_Shdr *shdr = &sechdrs[i]; - if (!strcmp(secstrings + shdr->sh_name, ".altinstructions")) { - alt_sec = shdr; - goto found; - } - } - - /* If we get here, it's probably a bug. 
*/ - pr_warning("patch_vdso64: .altinstructions not found\n"); - return; /* nothing to patch */ - -found: - alt_data = (void *)hdr + alt_sec->sh_offset; - apply_alternatives(alt_data, alt_data + alt_sec->sh_size); -} - static int __init init_vdso(void) { - int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; - int i; - - patch_vdso64(vdso_start, vdso_end - vdso_start); - - vdso_size = npages << PAGE_SHIFT; - for (i = 0; i < npages; i++) - vdso_pages[i] = virt_to_page(vdso_start + i*PAGE_SIZE); + init_vdso_image(&vdso_image_64); #ifdef CONFIG_X86_X32_ABI - patch_vdso32(vdsox32_start, vdsox32_end - vdsox32_start); - npages = (vdsox32_end - vdsox32_start + PAGE_SIZE - 1) / PAGE_SIZE; - vdsox32_size = npages << PAGE_SHIFT; - for (i = 0; i < npages; i++) - vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE); + init_vdso_image(&vdso_image_x32); #endif return 0; } subsys_initcall(init_vdso); +#endif struct linux_binprm; /* Put the vdso above the (randomized) stack with another randomized offset. This way there is no hole in the middle of address space. To save memory make sure it is still in the same PTE as the stack top. - This doesn't give that many random bits */ + This doesn't give that many random bits. + + Only used for the 64-bit and x32 vdsos. */ static unsigned long vdso_addr(unsigned long start, unsigned len) { unsigned long addr, end; @@ -149,61 +85,149 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) return addr; } -/* Setup a VMA at program startup for the vsyscall page. 
- Not called for compat tasks */ -static int setup_additional_pages(struct linux_binprm *bprm, - int uses_interp, - struct page **pages, - unsigned size) +static int map_vdso(const struct vdso_image *image, bool calculate_addr) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; unsigned long addr; - int ret; - - if (!vdso_enabled) - return 0; + int ret = 0; + static struct page *no_pages[] = {NULL}; + static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .pages = no_pages, + }; + + if (calculate_addr) { + addr = vdso_addr(current->mm->start_stack, + image->sym_end_mapping); + } else { + addr = 0; + } down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, size); - addr = get_unmapped_area(NULL, addr, size, 0, 0); + + addr = get_unmapped_area(NULL, addr, image->sym_end_mapping, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - current->mm->context.vdso = (void *)addr; + current->mm->context.vdso = (void __user *)addr; - ret = install_special_mapping(mm, addr, size, - VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - pages); - if (ret) { - current->mm->context.vdso = NULL; + /* + * MAYWRITE to allow gdb to COW and set breakpoints + */ + vma = _install_special_mapping(mm, + addr, + image->size, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, + &image->text_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto up_fail; } + vma = _install_special_mapping(mm, + addr + image->size, + image->sym_end_mapping - image->size, + VM_READ, + &vvar_mapping); + + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto up_fail; + } + + if (image->sym_vvar_page) + ret = remap_pfn_range(vma, + addr + image->sym_vvar_page, + __pa_symbol(&__vvar_page) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + +#ifdef CONFIG_HPET_TIMER + if (hpet_address && image->sym_hpet_page) { + ret = io_remap_pfn_range(vma, + addr + image->sym_hpet_page, + hpet_address >> PAGE_SHIFT, + PAGE_SIZE, + 
pgprot_noncached(PAGE_READONLY)); + + if (ret) + goto up_fail; + } +#endif + up_fail: + if (ret) + current->mm->context.vdso = NULL; + up_write(&mm->mmap_sem); return ret; } +#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +static int load_vdso32(void) +{ + int ret; + + if (vdso32_enabled != 1) /* Other values all mean "disabled" */ + return 0; + + ret = map_vdso(selected_vdso32, false); + if (ret) + return ret; + + if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) + current_thread_info()->sysenter_return = + current->mm->context.vdso + + selected_vdso32->sym_VDSO32_SYSENTER_RETURN; + + return 0; +} +#endif + +#ifdef CONFIG_X86_64 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages(bprm, uses_interp, vdso_pages, - vdso_size); + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_64, true); } +#ifdef CONFIG_COMPAT +int compat_arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp) +{ #ifdef CONFIG_X86_X32_ABI -int x32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) + if (test_thread_flag(TIF_X32)) { + if (!vdso64_enabled) + return 0; + + return map_vdso(&vdso_image_x32, true); + } +#endif + + return load_vdso32(); +} +#endif +#else +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - return setup_additional_pages(bprm, uses_interp, vdsox32_pages, - vdsox32_size); + return load_vdso32(); } #endif +#ifdef CONFIG_X86_64 static __init int vdso_setup(char *s) { - vdso_enabled = simple_strtoul(s, NULL, 0); + vdso64_enabled = simple_strtoul(s, NULL, 0); return 0; } __setup("vdso=", vdso_setup); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 86e02eabb640..3060568248d3 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1494,7 +1494,7 @@ static int xen_pgd_alloc(struct mm_struct *mm) page->private = (unsigned long)user_pgd; if (user_pgd != NULL) { - user_pgd[pgd_index(VSYSCALL_START)] = + 
user_pgd[pgd_index(VSYSCALL_ADDR)] = __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); ret = 0; } @@ -2062,8 +2062,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) case FIX_KMAP_BEGIN ... FIX_KMAP_END: # endif #else - case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: - case VVAR_PAGE: + case VSYSCALL_PAGE: #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: @@ -2104,8 +2103,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #ifdef CONFIG_X86_64 /* Replicate changes to map the vsyscall page into the user pagetable vsyscall mapping. */ - if ((idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) || - idx == VVAR_PAGE) { + if (idx == VSYSCALL_PAGE) { unsigned long vaddr = __fix_to_virt(idx); set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 0982233b9b84..7225a9557ee2 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -516,10 +516,17 @@ char * __init xen_memory_setup(void) static void __init fiddle_vdso(void) { #ifdef CONFIG_X86_32 + /* + * This could be called before selected_vdso32 is initialized, so + * just fiddle with both possible images. vdso_image_32_syscall + * can't be selected, since it only exists on 64-bit systems. + */ u32 *mask; - mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); + mask = vdso_image_32_int80.data + + vdso_image_32_int80.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; - mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); + mask = vdso_image_32_sysenter.data + + vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index aa3cb626671e..df9ea4186d75 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1108,6 +1108,14 @@ static bool always_dump_vma(struct vm_area_struct *vma) /* Any vsyscall mappings? 
*/ if (vma == get_gate_vma(vma->vm_mm)) return true; + + /* + * Assume that all vmas with a .name op should always be dumped. + * If this changes, a new vm_ops field can easily be added. + */ + if (vma->vm_ops && vma->vm_ops->name && vma->vm_ops->name(vma)) + return true; + /* * arch_vma_name() returns non-NULL for special architecture mappings, * such as vDSO sections. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 442177b1119a..9b2f5d62ce63 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -300,6 +300,12 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } + if (vma->vm_ops && vma->vm_ops->name) { + name = vma->vm_ops->name(vma); + if (name) + goto done; + } + name = arch_vma_name(vma); if (!name) { pid_t tid; diff --git a/include/linux/mm.h b/include/linux/mm.h index d6777060449f..fb27946baf54 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -239,6 +239,12 @@ struct vm_operations_struct { */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); + + /* Called by the /proc/PID/maps code to ask the vma whether it + * has a special name. Returning non-NULL will also cause this + * vma to be dumped unconditionally. */ + const char *(*name)(struct vm_area_struct *vma); + #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy @@ -1778,7 +1784,9 @@ extern struct file *get_mm_exe_file(struct mm_struct *mm); extern int may_expand_vm(struct mm_struct *mm, unsigned long npages); extern struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, - unsigned long flags, struct page **pages); + unsigned long flags, + const struct vm_special_mapping *spec); +/* This is an obsolete alternative to _install_special_mapping. 
*/ extern int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long flags, struct page **pages); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20cbe57..22c6f4e16d10 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -510,4 +510,10 @@ static inline void clear_tlb_flush_pending(struct mm_struct *mm) } #endif +struct vm_special_mapping +{ + const char *name; + struct page **pages; +}; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/init/main.c b/init/main.c index 48655ceb66f4..eb0ea86aefde 100644 --- a/init/main.c +++ b/init/main.c @@ -617,6 +617,10 @@ asmlinkage __visible void __init start_kernel(void) if (efi_enabled(EFI_RUNTIME_SERVICES)) efi_enter_virtual_mode(); #endif +#ifdef CONFIG_X86_ESPFIX64 + /* Should be run before the first non-init thread is created */ + init_espfix_bsp(); +#endif thread_info_cache_init(); cred_init(); fork_init(totalram_pages); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 74f5b580fe34..420d77afa8fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1418,8 +1418,13 @@ static struct ctl_table vm_table[] = { (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) { .procname = "vdso_enabled", +#ifdef CONFIG_X86_32 + .data = &vdso32_enabled, + .maxlen = sizeof(vdso32_enabled), +#else .data = &vdso_enabled, .maxlen = sizeof(vdso_enabled), +#endif .mode = 0644, .proc_handler = proc_dointvec, .extra1 = &zero, diff --git a/mm/mmap.c b/mm/mmap.c index b1202cf81f4b..52bbc9514d9d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2872,6 +2872,31 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) return 1; } +static int special_mapping_fault(struct vm_area_struct *vma, + struct vm_fault *vmf); + +/* + * Having a close hook prevents vma merging regardless of flags. 
+ */ +static void special_mapping_close(struct vm_area_struct *vma) +{ +} + +static const char *special_mapping_name(struct vm_area_struct *vma) +{ + return ((struct vm_special_mapping *)vma->vm_private_data)->name; +} + +static const struct vm_operations_struct special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, + .name = special_mapping_name, +}; + +static const struct vm_operations_struct legacy_special_mapping_vmops = { + .close = special_mapping_close, + .fault = special_mapping_fault, +}; static int special_mapping_fault(struct vm_area_struct *vma, struct vm_fault *vmf) @@ -2887,7 +2912,13 @@ static int special_mapping_fault(struct vm_area_struct *vma, */ pgoff = vmf->pgoff - vma->vm_pgoff; - for (pages = vma->vm_private_data; pgoff && *pages; ++pages) + if (vma->vm_ops == &legacy_special_mapping_vmops) + pages = vma->vm_private_data; + else + pages = ((struct vm_special_mapping *)vma->vm_private_data)-> + pages; + + for (; pgoff && *pages; ++pages) pgoff--; if (*pages) { @@ -2900,30 +2931,11 @@ static int special_mapping_fault(struct vm_area_struct *vma, return VM_FAULT_SIGBUS; } -/* - * Having a close hook prevents vma merging regardless of flags. - */ -static void special_mapping_close(struct vm_area_struct *vma) -{ -} - -static const struct vm_operations_struct special_mapping_vmops = { - .close = special_mapping_close, - .fault = special_mapping_fault, -}; - -/* - * Called with mm->mmap_sem held for writing. - * Insert a new vma covering the given region, with the given flags. - * Its pages are supplied by the given array of struct page *. - * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. - * The region past the last page supplied will always produce SIGBUS. - * The array pointer and the pages it points to are assumed to stay alive - * for as long as this mapping might exist. 
- */ -struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, - unsigned long addr, unsigned long len, - unsigned long vm_flags, struct page **pages) +static struct vm_area_struct *__install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_operations_struct *ops, + void *priv) { int ret; struct vm_area_struct *vma; @@ -2940,8 +2952,8 @@ struct vm_area_struct *_install_special_mapping(struct mm_struct *mm, vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - vma->vm_ops = &special_mapping_vmops; - vma->vm_private_data = pages; + vma->vm_ops = ops; + vma->vm_private_data = priv; ret = insert_vm_struct(mm, vma); if (ret) @@ -2958,12 +2970,31 @@ out: return ERR_PTR(ret); } +/* + * Called with mm->mmap_sem held for writing. + * Insert a new vma covering the given region, with the given flags. + * Its pages are supplied by the given array of struct page *. + * The array can be shorter than len >> PAGE_SHIFT if it's null-terminated. + * The region past the last page supplied will always produce SIGBUS. + * The array pointer and the pages it points to are assumed to stay alive + * for as long as this mapping might exist. + */ +struct vm_area_struct *_install_special_mapping( + struct mm_struct *mm, + unsigned long addr, unsigned long len, + unsigned long vm_flags, const struct vm_special_mapping *spec) +{ + return __install_special_mapping(mm, addr, len, vm_flags, + &special_mapping_vmops, (void *)spec); +} + int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { - struct vm_area_struct *vma = _install_special_mapping(mm, - addr, len, vm_flags, pages); + struct vm_area_struct *vma = __install_special_mapping( + mm, addr, len, vm_flags, &legacy_special_mapping_vmops, + (void *)pages); if (IS_ERR(vma)) return PTR_ERR(vma); |