| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-12-23 11:53:04 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-12-23 11:53:04 -0800 |
| commit | caf9a82657b313106aae8f4a35936c116a152299 | |
| tree | 525b164e34122b052ad06f56e6f88ed846471a58 /arch/x86 | |
| parent | 9c294ec08408ed90c0f2d994a7979366675e3734 | |
| parent | f6c4fd506cb626e4346aa81688f255e593a7c5a0 | |
Merge branch 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 PTI preparatory patches from Thomas Gleixner:
 "Today's Advent calendar window contains twenty-four easy-to-digest
  patches. The original plan was to have twenty-three, matching the date,
  but a late fixup made that moot.
   - Move the cpu_entry_area mapping out of the fixmap into a separate
     address space. That's necessary because the fixmap becomes too big
     with NR_CPUS=8192, which already caused subtle and hard-to-diagnose
     failures (a rough size sketch follows after this message).
     The topmost patch is fresh from today and cures a brain slip of
     that tall grumpy German greybeard, who ignored the intricacies of
     32-bit wraparounds.
   - Limit the number of CPUs on 32-bit to 64. That's still insanely big,
     but at least it's small enough to prevent address space issues with
     the cpu_entry_area map, which had already been observed and debugged
     with the fixmap code.
   - A few TLB flush fixes in various places, plus documentation of which
     TLB-flush functions should be used for what.
   - Rename the SYSENTER stack to CPU_ENTRY_AREA stack, as it is used for
     more than SYSENTER now and keeping the old name makes backtraces
     confusing.
   - Prevent LDT inheritance on exec() by moving the LDT duplication to
     arch_dup_mmap(), which is only invoked on fork().
   - Make vsyscall more robust.
   - A few fixes and cleanups of the debug_pagetables code: check
     _PAGE_PRESENT instead of checking the PTE for 0, and clean up the
     C89 initialization of the address hint array, which was already out
     of sync with the index enums.
   - Move the ESPFIX init to a different place to prepare for PTI.
   - Several code moves with no functional change to make PTI
     integration simpler and header files less convoluted.
   - Documentation fixes and clarifications"
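
The first item above is easiest to appreciate with a rough size calculation. The sketch below is plain userspace C, not kernel code: it multiplies a per-CPU footprint of 40 pages (the figure the series reserves via CPU_ENTRY_AREA_PAGES on 32-bit, used here purely as an illustration; the exact 64-bit footprint depends on sizeof(struct cpu_entry_area)) by the maximal NR_CPUS to show why the fixmap could no longer hold the cpu_entry_area mappings.

```c
/*
 * Back-of-the-envelope sketch: why a per-CPU entry area no longer fits
 * in the fixmap once NR_CPUS reaches 8192.  The 40-pages-per-CPU figure
 * is taken from the CPU_ENTRY_AREA_PAGES define the series adds for
 * 32-bit and is only illustrative for 64-bit.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long page_size     = 4096;
	const unsigned long pages_per_cpu = 40;    /* illustrative, see above */
	const unsigned long nr_cpus       = 8192;  /* MAXSMP on x86_64 */

	unsigned long bytes = nr_cpus * pages_per_cpu * page_size;

	printf("cpu_entry_area mappings: ~%lu MiB of virtual space\n",
	       bytes >> 20);	/* prints 1280 MiB */
	return 0;
}
```

With the new 32-bit limit of 64 CPUs the same arithmetic yields about 10 MiB, which is why the second item above keeps the 32-bit layout workable.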
* 'x86-pti-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits)
  x86/cpu_entry_area: Prevent wraparound in setup_cpu_entry_area_ptes() on 32bit
  init: Invoke init_espfix_bsp() from mm_init()
  x86/cpu_entry_area: Move it out of the fixmap
  x86/cpu_entry_area: Move it to a separate unit
  x86/mm: Create asm/invpcid.h
  x86/mm: Put MMU to hardware ASID translation in one place
  x86/mm: Remove hard-coded ASID limit checks
  x86/mm: Move the CR3 construction functions to tlbflush.h
  x86/mm: Add comments to clarify which TLB-flush functions are supposed to flush what
  x86/mm: Remove superfluous barriers
  x86/mm: Use __flush_tlb_one() for kernel memory
  x86/microcode: Dont abuse the TLB-flush interface
  x86/uv: Use the right TLB-flush API
  x86/entry: Rename SYSENTER_stack to CPU_ENTRY_AREA_entry_stack
  x86/doc: Remove obvious weirdnesses from the x86 MM layout documentation
  x86/mm/64: Improve the memory map documentation
  x86/ldt: Prevent LDT inheritance on exec
  x86/ldt: Rework locking
  arch, mm: Allow arch_dup_mmap() to fail
  x86/vsyscall/64: Warn and fail vsyscall emulation in NATIVE mode
  ...
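
Several of the commits above ("Move the CR3 construction functions to tlbflush.h", "Remove hard-coded ASID limit checks", "Put MMU to hardware ASID translation in one place") revolve around one small encoding: kernel ASID n is stored in the CR3 PCID field as n + 1, so PCID-unaware code that saves and restores CR3 with PCID 0 can never alias a live ASID. The following is a hedged userspace model of that encoding, not the kernel implementation; the helper names mirror the patch, the bit-63 CR3_NOFLUSH value and the hypothetical pgd address are assumptions for illustration.

```c
/*
 * Userspace model of the CR3 construction the series consolidates in
 * tlbflush.h.  Only the bit layout is modelled; nothing here touches
 * a real CR3.
 */
#include <stdint.h>
#include <stdio.h>

#define CR3_HW_ASID_BITS	12
#define MAX_ASID_AVAILABLE	((1 << CR3_HW_ASID_BITS) - 2)
#define CR3_NOFLUSH		(1ULL << 63)	/* "don't flush" hint bit */

static uint16_t kern_pcid(uint16_t asid)
{
	/* ASID 0 becomes PCID 1, so a CR3 saved/restored by
	 * PCID-unaware code (PCID 0) never collides with a live ASID. */
	return asid + 1;
}

static uint64_t build_cr3(uint64_t pgd_pa, uint16_t asid)
{
	return pgd_pa | kern_pcid(asid);
}

static uint64_t build_cr3_noflush(uint64_t pgd_pa, uint16_t asid)
{
	return build_cr3(pgd_pa, asid) | CR3_NOFLUSH;
}

int main(void)
{
	uint64_t pgd_pa = 0x1234000;	/* hypothetical page-aligned PA */

	printf("cr3         = %#llx\n",
	       (unsigned long long)build_cr3(pgd_pa, 5));
	printf("cr3 noflush = %#llx\n",
	       (unsigned long long)build_cr3_noflush(pgd_pa, 5));
	printf("max asid    = %d\n", MAX_ASID_AVAILABLE);
	return 0;
}
```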
Diffstat (limited to 'arch/x86')
35 files changed, 593 insertions, 426 deletions
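
As a reading aid for the diff that follows: the new asm/invpcid.h funnels every helper into a single __invpcid() that hands the CPU a 16-byte {PCID, address} descriptor plus a type selector. Below is a hedged userspace model of just that descriptor layout and the four type constants; it deliberately does not execute INVPCID, which is a privileged instruction, and the sample address is made up.

```c
/*
 * Model of the INVPCID descriptor used by the new asm/invpcid.h
 * helpers: d[0] carries the PCID, d[1] the linear address, and the
 * operation type is passed separately (in a register in the real
 * inline asm).  Data layout only -- no INVPCID is executed here.
 */
#include <stdint.h>
#include <stdio.h>

#define INVPCID_TYPE_INDIV_ADDR		0
#define INVPCID_TYPE_SINGLE_CTXT	1
#define INVPCID_TYPE_ALL_INCL_GLOBAL	2
#define INVPCID_TYPE_ALL_NON_GLOBAL	3

struct invpcid_desc { uint64_t d[2]; };

static struct invpcid_desc invpcid_desc(uint64_t pcid, uint64_t addr)
{
	struct invpcid_desc desc = { { pcid, addr } };
	return desc;
}

int main(void)
{
	/* e.g. "flush one address in one PCID" */
	struct invpcid_desc d = invpcid_desc(6, 0xffffffff81000000ULL);

	printf("type=%d pcid=%llu addr=%#llx\n", INVPCID_TYPE_INDIV_ADDR,
	       (unsigned long long)d.d[0], (unsigned long long)d.d[1]);
	return 0;
}
```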
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8eed3f94bfc7..d4fc98c50378 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -926,7 +926,8 @@ config MAXSMP  config NR_CPUS  	int "Maximum number of CPUs" if SMP && !MAXSMP  	range 2 8 if SMP && X86_32 && !X86_BIGSMP -	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK +	range 2 64 if SMP && X86_32 && X86_BIGSMP +	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64  	range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64  	default "1" if !SMP  	default "8192" if MAXSMP diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bd8b57a5c874..ace8f321a5a1 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -942,9 +942,9 @@ ENTRY(debug)  	/* Are we currently on the SYSENTER stack? */  	movl	PER_CPU_VAR(cpu_entry_area), %ecx -	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */ -	cmpl	$SIZEOF_SYSENTER_stack, %ecx +	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx +	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */ +	cmpl	$SIZEOF_entry_stack, %ecx  	jb	.Ldebug_from_sysenter_stack  	TRACE_IRQS_OFF @@ -986,9 +986,9 @@ ENTRY(nmi)  	/* Are we currently on the SYSENTER stack? */  	movl	PER_CPU_VAR(cpu_entry_area), %ecx -	addl	$CPU_ENTRY_AREA_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx -	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */ -	cmpl	$SIZEOF_SYSENTER_stack, %ecx +	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx +	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */ +	cmpl	$SIZEOF_entry_stack, %ecx  	jb	.Lnmi_from_sysenter_stack  	/* Not on SYSENTER stack. */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 423885bee398..3d19c830e1b1 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -158,8 +158,8 @@ END(native_usergs_sysret64)  	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)  /* The top word of the SYSENTER stack is hot and is usable as scratch space. */ -#define RSP_SCRATCH	CPU_ENTRY_AREA_SYSENTER_stack + \ -			SIZEOF_SYSENTER_stack - 8 + CPU_ENTRY_AREA +#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \ +			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA  ENTRY(entry_SYSCALL_64_trampoline)  	UNWIND_HINT_EMPTY diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index f279ba2643dc..1faf40f2dda9 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -37,6 +37,7 @@  #include <asm/unistd.h>  #include <asm/fixmap.h>  #include <asm/traps.h> +#include <asm/paravirt.h>  #define CREATE_TRACE_POINTS  #include "vsyscall_trace.h" @@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)  	WARN_ON_ONCE(address != regs->ip); +	/* This should be unreachable in NATIVE mode. */ +	if (WARN_ON(vsyscall_mode == NATIVE)) +		return false; +  	if (vsyscall_mode == NONE) {  		warn_bad_vsyscall(KERN_INFO, regs,  				  "vsyscall attempted with vsyscall=none"); @@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)  	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;  } +/* + * The VSYSCALL page is the only user-accessible page in the kernel address + * range.  Normally, the kernel page tables can have _PAGE_USER clear, but + * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls + * are enabled. 
+ * + * Some day we may create a "minimal" vsyscall mode in which we emulate + * vsyscalls but leave the page not present.  If so, we skip calling + * this. + */ +static void __init set_vsyscall_pgtable_user_bits(void) +{ +	pgd_t *pgd; +	p4d_t *p4d; +	pud_t *pud; +	pmd_t *pmd; + +	pgd = pgd_offset_k(VSYSCALL_ADDR); +	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); +	p4d = p4d_offset(pgd, VSYSCALL_ADDR); +#if CONFIG_PGTABLE_LEVELS >= 5 +	p4d->p4d |= _PAGE_USER; +#endif +	pud = pud_offset(p4d, VSYSCALL_ADDR); +	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); +	pmd = pmd_offset(pud, VSYSCALL_ADDR); +	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); +} +  void __init map_vsyscall(void)  {  	extern char __vsyscall_page;  	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); -	if (vsyscall_mode != NONE) +	if (vsyscall_mode != NONE) {  		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,  			     vsyscall_mode == NATIVE  			     ? PAGE_KERNEL_VSYSCALL  			     : PAGE_KERNEL_VVAR); +		set_vsyscall_pgtable_user_bits(); +	}  	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=  		     (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h new file mode 100644 index 000000000000..2fbc69a0916e --- /dev/null +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ASM_X86_CPU_ENTRY_AREA_H +#define _ASM_X86_CPU_ENTRY_AREA_H + +#include <linux/percpu-defs.h> +#include <asm/processor.h> + +/* + * cpu_entry_area is a percpu region that contains things needed by the CPU + * and early entry/exit code.  Real types aren't used for all fields here + * to avoid circular header dependencies. + * + * Every field is a virtual alias of some other allocated backing store. + * There is no direct allocation of a struct cpu_entry_area. + */ +struct cpu_entry_area { +	char gdt[PAGE_SIZE]; + +	/* +	 * The GDT is just below entry_stack and thus serves (on x86_64) as +	 * a a read-only guard page. +	 */ +	struct entry_stack_page entry_stack_page; + +	/* +	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because +	 * we need task switches to work, and task switches write to the TSS. +	 */ +	struct tss_struct tss; + +	char entry_trampoline[PAGE_SIZE]; + +#ifdef CONFIG_X86_64 +	/* +	 * Exception stacks used for IST entries. +	 * +	 * In the future, this should have a separate slot for each stack +	 * with guard pages between them. 
+	 */ +	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +#endif +}; + +#define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_TOT_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS) + +DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + +extern void setup_cpu_entry_areas(void); +extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); + +#define	CPU_ENTRY_AREA_RO_IDT		CPU_ENTRY_AREA_BASE +#define CPU_ENTRY_AREA_PER_CPU		(CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) + +#define CPU_ENTRY_AREA_RO_IDT_VADDR	((void *)CPU_ENTRY_AREA_RO_IDT) + +#define CPU_ENTRY_AREA_MAP_SIZE			\ +	(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + +extern struct cpu_entry_area *get_cpu_entry_area(int cpu); + +static inline struct entry_stack *cpu_entry_stack(int cpu) +{ +	return &get_cpu_entry_area(cpu)->entry_stack_page.stack; +} + +#endif diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index aab4fe9f49f8..ec8be07c0cda 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -7,6 +7,7 @@  #include <asm/mmu.h>  #include <asm/fixmap.h>  #include <asm/irq_vectors.h> +#include <asm/cpu_entry_area.h>  #include <linux/smp.h>  #include <linux/percpu.h> diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 0211029076ea..6777480d8a42 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -2,7 +2,7 @@  #ifndef _ASM_X86_ESPFIX_H  #define _ASM_X86_ESPFIX_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_ESPFIX64  #include <asm/percpu.h> @@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);  extern void init_espfix_bsp(void);  extern void init_espfix_ap(int cpu); - -#endif /* CONFIG_X86_64 */ +#else +static inline void init_espfix_ap(int cpu) { } +#endif  #endif /* _ASM_X86_ESPFIX_H */ diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 94fc4fa14127..64c4a30e0d39 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -45,46 +45,6 @@ extern unsigned long __FIXADDR_TOP;  #endif  /* - * cpu_entry_area is a percpu region in the fixmap that contains things - * needed by the CPU and early entry/exit code.  Real types aren't used - * for all fields here to avoid circular header dependencies. - * - * Every field is a virtual alias of some other allocated backing store. - * There is no direct allocation of a struct cpu_entry_area. - */ -struct cpu_entry_area { -	char gdt[PAGE_SIZE]; - -	/* -	 * The GDT is just below SYSENTER_stack and thus serves (on x86_64) as -	 * a a read-only guard page. -	 */ -	struct SYSENTER_stack_page SYSENTER_stack_page; - -	/* -	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because -	 * we need task switches to work, and task switches write to the TSS. -	 */ -	struct tss_struct tss; - -	char entry_trampoline[PAGE_SIZE]; - -#ifdef CONFIG_X86_64 -	/* -	 * Exception stacks used for IST entries. -	 * -	 * In the future, this should have a separate slot for each stack -	 * with guard pages between them. -	 */ -	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; -#endif -}; - -#define CPU_ENTRY_AREA_PAGES (sizeof(struct cpu_entry_area) / PAGE_SIZE) - -extern void setup_cpu_entry_areas(void); - -/*   * Here we define all the compile-time 'special' virtual   * addresses. 
The point is to have a constant address at   * compile time, but to set the physical address only @@ -123,7 +83,6 @@ enum fixed_addresses {  	FIX_IO_APIC_BASE_0,  	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,  #endif -	FIX_RO_IDT,	/* Virtual mapping for read-only IDT */  #ifdef CONFIG_X86_32  	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */  	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, @@ -139,9 +98,6 @@ enum fixed_addresses {  #ifdef	CONFIG_X86_INTEL_MID  	FIX_LNW_VRTC,  #endif -	/* Fixmap entries to remap the GDTs, one per processor. */ -	FIX_CPU_ENTRY_AREA_TOP, -	FIX_CPU_ENTRY_AREA_BOTTOM = FIX_CPU_ENTRY_AREA_TOP + (CPU_ENTRY_AREA_PAGES * NR_CPUS) - 1,  #ifdef CONFIG_ACPI_APEI_GHES  	/* Used for GHES mapping from assorted contexts */ @@ -182,7 +138,7 @@ enum fixed_addresses {  extern void reserve_top_address(unsigned long reserve);  #define FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT) -#define FIXADDR_START		(FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)  extern int fixmaps_set; @@ -230,30 +186,5 @@ void __init *early_memremap_decrypted_wp(resource_size_t phys_addr,  void __early_set_fixmap(enum fixed_addresses idx,  			phys_addr_t phys, pgprot_t flags); -static inline unsigned int __get_cpu_entry_area_page_index(int cpu, int page) -{ -	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); - -	return FIX_CPU_ENTRY_AREA_BOTTOM - cpu*CPU_ENTRY_AREA_PAGES - page; -} - -#define __get_cpu_entry_area_offset_index(cpu, offset) ({		\ -	BUILD_BUG_ON(offset % PAGE_SIZE != 0);				\ -	__get_cpu_entry_area_page_index(cpu, offset / PAGE_SIZE);	\ -	}) - -#define get_cpu_entry_area_index(cpu, field)				\ -	__get_cpu_entry_area_offset_index((cpu), offsetof(struct cpu_entry_area, field)) - -static inline struct cpu_entry_area *get_cpu_entry_area(int cpu) -{ -	return (struct cpu_entry_area *)__fix_to_virt(__get_cpu_entry_area_page_index(cpu, 0)); -} - -static inline struct SYSENTER_stack *cpu_SYSENTER_stack(int cpu) -{ -	return &get_cpu_entry_area(cpu)->SYSENTER_stack_page.stack; -} -  #endif /* !__ASSEMBLY__ */  #endif /* _ASM_X86_FIXMAP_H */ diff --git a/arch/x86/include/asm/invpcid.h b/arch/x86/include/asm/invpcid.h new file mode 100644 index 000000000000..989cfa86de85 --- /dev/null +++ b/arch/x86/include/asm/invpcid.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INVPCID +#define _ASM_X86_INVPCID + +static inline void __invpcid(unsigned long pcid, unsigned long addr, +			     unsigned long type) +{ +	struct { u64 d[2]; } desc = { { pcid, addr } }; + +	/* +	 * The memory clobber is because the whole point is to invalidate +	 * stale TLB entries and, especially if we're flushing global +	 * mappings, we don't want the compiler to reorder any subsequent +	 * memory accesses before the TLB flush. +	 * +	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and +	 * invpcid (%rcx), %rax in long mode. +	 */ +	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" +		      : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR		0 +#define INVPCID_TYPE_SINGLE_CTXT	1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL	2 +#define INVPCID_TYPE_ALL_NON_GLOBAL	3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, +				     unsigned long addr) +{ +	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. 
*/ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ +	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ +	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ +	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + +#endif /* _ASM_X86_INVPCID */ diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 9ea26f167497..5ff3e8af2c20 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -3,6 +3,7 @@  #define _ASM_X86_MMU_H  #include <linux/spinlock.h> +#include <linux/rwsem.h>  #include <linux/mutex.h>  #include <linux/atomic.h> @@ -27,7 +28,8 @@ typedef struct {  	atomic64_t tlb_gen;  #ifdef CONFIG_MODIFY_LDT_SYSCALL -	struct ldt_struct *ldt; +	struct rw_semaphore	ldt_usr_sem; +	struct ldt_struct	*ldt;  #endif  #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6d16d15d09a0..5ede7cae1d67 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -57,11 +57,17 @@ struct ldt_struct {  /*   * Used for LDT copy/destruction.   */ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm); +static inline void init_new_context_ldt(struct mm_struct *mm) +{ +	mm->context.ldt = NULL; +	init_rwsem(&mm->context.ldt_usr_sem); +} +int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);  void destroy_context_ldt(struct mm_struct *mm);  #else	/* CONFIG_MODIFY_LDT_SYSCALL */ -static inline int init_new_context_ldt(struct task_struct *tsk, -				       struct mm_struct *mm) +static inline void init_new_context_ldt(struct mm_struct *mm) { } +static inline int ldt_dup_context(struct mm_struct *oldmm, +				  struct mm_struct *mm)  {  	return 0;  } @@ -132,18 +138,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);  static inline int init_new_context(struct task_struct *tsk,  				   struct mm_struct *mm)  { +	mutex_init(&mm->context.lock); +  	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);  	atomic64_set(&mm->context.tlb_gen, 0); -	#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS  	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {  		/* pkey 0 is the default and always allocated */  		mm->context.pkey_allocation_map = 0x1;  		/* -1 means unallocated or invalid */  		mm->context.execute_only_pkey = -1;  	} -	#endif -	return init_new_context_ldt(tsk, mm); +#endif +	init_new_context_ldt(mm); +	return 0;  }  static inline void destroy_context(struct mm_struct *mm)  { @@ -176,10 +185,10 @@ do {						\  } while (0)  #endif -static inline void arch_dup_mmap(struct mm_struct *oldmm, -				 struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)  {  	paravirt_arch_dup_mmap(oldmm, mm); +	return ldt_dup_context(oldmm, mm);  }  static inline void arch_exit_mmap(struct mm_struct *mm) @@ -282,33 +291,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,  }  /* - * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID - * bits.  This serves two purposes.  It prevents a nasty situation in - * which PCID-unaware code saves CR3, loads some other value (with PCID - * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if - * the saved ASID was nonzero.  
It also means that any bugs involving - * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger - * deterministically. - */ - -static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid) -{ -	if (static_cpu_has(X86_FEATURE_PCID)) { -		VM_WARN_ON_ONCE(asid > 4094); -		return __sme_pa(mm->pgd) | (asid + 1); -	} else { -		VM_WARN_ON_ONCE(asid != 0); -		return __sme_pa(mm->pgd); -	} -} - -static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid) -{ -	VM_WARN_ON_ONCE(asid > 4094); -	return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH; -} - -/*   * This can be used from process context to figure out what the value of   * CR3 is without needing to do a (slow) __read_cr3().   * @@ -317,7 +299,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)   */  static inline unsigned long __get_current_cr3_fast(void)  { -	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm), +	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,  		this_cpu_read(cpu_tlbstate.loaded_mm_asid));  	/* For now, be very restrictive about when this can be called. */ diff --git a/arch/x86/include/asm/pgtable_32_types.h b/arch/x86/include/asm/pgtable_32_types.h index f2ca9b28fd68..ce245b0cdfca 100644 --- a/arch/x86/include/asm/pgtable_32_types.h +++ b/arch/x86/include/asm/pgtable_32_types.h @@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */  #define LAST_PKMAP 1024  #endif -#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))	\ -		    & PMD_MASK) +/* + * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c + * to avoid include recursion hell + */ +#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 40) + +#define CPU_ENTRY_AREA_BASE				\ +	((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK) + +#define PKMAP_BASE		\ +	((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)  #ifdef CONFIG_HIGHMEM  # define VMALLOC_END	(PKMAP_BASE - 2 * PAGE_SIZE)  #else -# define VMALLOC_END	(FIXADDR_START - 2 * PAGE_SIZE) +# define VMALLOC_END	(CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)  #endif  #define MODULES_VADDR	VMALLOC_START diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6d5f45dcd4a1..3d27831bc58d 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -76,32 +76,41 @@ typedef struct { pteval_t pte; } pte_t;  #define PGDIR_MASK	(~(PGDIR_SIZE - 1))  /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ -#define MAXMEM		_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#define MAXMEM			_AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +  #ifdef CONFIG_X86_5LEVEL -#define VMALLOC_SIZE_TB _AC(16384, UL) -#define __VMALLOC_BASE	_AC(0xff92000000000000, UL) -#define __VMEMMAP_BASE	_AC(0xffd4000000000000, UL) +# define VMALLOC_SIZE_TB	_AC(16384, UL) +# define __VMALLOC_BASE		_AC(0xff92000000000000, UL) +# define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)  #else -#define VMALLOC_SIZE_TB	_AC(32, UL) -#define __VMALLOC_BASE	_AC(0xffffc90000000000, UL) -#define __VMEMMAP_BASE	_AC(0xffffea0000000000, UL) +# define VMALLOC_SIZE_TB	_AC(32, UL) +# define __VMALLOC_BASE		_AC(0xffffc90000000000, UL) +# define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)  #endif +  #ifdef CONFIG_RANDOMIZE_MEMORY -#define VMALLOC_START	vmalloc_base -#define VMEMMAP_START	vmemmap_base +# define VMALLOC_START		vmalloc_base +# define VMEMMAP_START		vmemmap_base  #else -#define VMALLOC_START	__VMALLOC_BASE -#define VMEMMAP_START	__VMEMMAP_BASE +# define VMALLOC_START		__VMALLOC_BASE +# define VMEMMAP_START		__VMEMMAP_BASE  #endif /* CONFIG_RANDOMIZE_MEMORY */ -#define VMALLOC_END	(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) -#define MODULES_VADDR    (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + +#define VMALLOC_END		(VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) + +#define MODULES_VADDR		(__START_KERNEL_map + KERNEL_IMAGE_SIZE)  /* The module sections ends with the start of the fixmap */ -#define MODULES_END   __fix_to_virt(__end_of_fixed_addresses + 1) -#define MODULES_LEN   (MODULES_END - MODULES_VADDR) -#define ESPFIX_PGD_ENTRY _AC(-2, UL) -#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define EFI_VA_START	 ( -4 * (_AC(1, UL) << 30)) -#define EFI_VA_END	 (-68 * (_AC(1, UL) << 30)) +#define MODULES_END		__fix_to_virt(__end_of_fixed_addresses + 1) +#define MODULES_LEN		(MODULES_END - MODULES_VADDR) + +#define ESPFIX_PGD_ENTRY	_AC(-2, UL) +#define ESPFIX_BASE_ADDR	(ESPFIX_PGD_ENTRY << P4D_SHIFT) + +#define CPU_ENTRY_AREA_PGD	_AC(-3, UL) +#define CPU_ENTRY_AREA_BASE	(CPU_ENTRY_AREA_PGD << P4D_SHIFT) + +#define EFI_VA_START		( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END		(-68 * (_AC(1, UL) << 30))  #define EARLY_DYNAMIC_PAGE_TABLES	64 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1f2434ee9f80..cad8dab266bc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -337,12 +337,12 @@ struct x86_hw_tss {  #define IO_BITMAP_OFFSET		(offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss))  #define INVALID_IO_BITMAP_OFFSET	0x8000 -struct SYSENTER_stack { +struct entry_stack {  	unsigned long		words[64];  }; -struct SYSENTER_stack_page { -	struct SYSENTER_stack stack; +struct entry_stack_page { +	struct entry_stack stack;  } __aligned(PAGE_SIZE);  struct tss_struct { diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f8062bfd43a0..f73706878772 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -16,7 +16,7 @@ enum stack_type {  	STACK_TYPE_TASK,  	STACK_TYPE_IRQ,  	STACK_TYPE_SOFTIRQ, -	STACK_TYPE_SYSENTER, +	STACK_TYPE_ENTRY,  	STACK_TYPE_EXCEPTION,  	STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1,  }; @@ -29,7 +29,7 @@ struct stack_info {  bool in_task_stack(unsigned long *stack, struct task_struct *task,  		   struct stack_info *info); -bool in_sysenter_stack(unsigned long *stack, struct stack_info *info); +bool 
in_entry_stack(unsigned long *stack, struct stack_info *info);  int get_stack_info(unsigned long *stack, struct task_struct *task,  		   struct stack_info *info, unsigned long *visit_mask); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 877b5c1a1b12..e1884cf35257 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -9,70 +9,66 @@  #include <asm/cpufeature.h>  #include <asm/special_insns.h>  #include <asm/smp.h> +#include <asm/invpcid.h> -static inline void __invpcid(unsigned long pcid, unsigned long addr, -			     unsigned long type) +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)  { -	struct { u64 d[2]; } desc = { { pcid, addr } }; -  	/* -	 * The memory clobber is because the whole point is to invalidate -	 * stale TLB entries and, especially if we're flushing global -	 * mappings, we don't want the compiler to reorder any subsequent -	 * memory accesses before the TLB flush. -	 * -	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and -	 * invpcid (%rcx), %rax in long mode. +	 * Bump the generation count.  This also serves as a full barrier +	 * that synchronizes with switch_mm(): callers are required to order +	 * their read of mm_cpumask after their writes to the paging +	 * structures.  	 */ -	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" -		      : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +	return atomic64_inc_return(&mm->context.tlb_gen);  } -#define INVPCID_TYPE_INDIV_ADDR		0 -#define INVPCID_TYPE_SINGLE_CTXT	1 -#define INVPCID_TYPE_ALL_INCL_GLOBAL	2 -#define INVPCID_TYPE_ALL_NON_GLOBAL	3 +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS		12 +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#define PTI_CONSUMED_ASID_BITS		0 -/* Flush all mappings for a given pcid and addr, not including globals. */ -static inline void invpcid_flush_one(unsigned long pcid, -				     unsigned long addr) -{ -	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); -} +#define CR3_AVAIL_ASID_BITS (CR3_HW_ASID_BITS - PTI_CONSUMED_ASID_BITS) +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid.  -1 below to account + * for them being zero-based.  Another -1 is because ASID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_ASID_BITS) - 2) -/* Flush all mappings for a given PCID, not including globals. */ -static inline void invpcid_flush_single_context(unsigned long pcid) +static inline u16 kern_pcid(u16 asid)  { -	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); +	/* +	 * If PCID is on, ASID-aware code paths put the ASID+1 into the +	 * PCID bits.  This serves two purposes.  It prevents a nasty +	 * situation in which PCID-unaware code saves CR3, loads some other +	 * value (with PCID == 0), and then restores CR3, thus corrupting +	 * the TLB for ASID 0 if the saved ASID was nonzero.  It also means +	 * that any bugs involving loading a PCID-enabled CR3 with +	 * CR4.PCIDE off will trigger deterministically. +	 */ +	return asid + 1;  } -/* Flush all mappings, including globals, for all PCIDs. 
*/ -static inline void invpcid_flush_all(void) +struct pgd_t; +static inline unsigned long build_cr3(pgd_t *pgd, u16 asid)  { -	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +	if (static_cpu_has(X86_FEATURE_PCID)) { +		return __sme_pa(pgd) | kern_pcid(asid); +	} else { +		VM_WARN_ON_ONCE(asid != 0); +		return __sme_pa(pgd); +	}  } -/* Flush all mappings for all PCIDs except globals. */ -static inline void invpcid_flush_all_nonglobals(void) +static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)  { -	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); -} - -static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) -{ -	u64 new_tlb_gen; - -	/* -	 * Bump the generation count.  This also serves as a full barrier -	 * that synchronizes with switch_mm(): callers are required to order -	 * their read of mm_cpumask after their writes to the paging -	 * structures. -	 */ -	smp_mb__before_atomic(); -	new_tlb_gen = atomic64_inc_return(&mm->context.tlb_gen); -	smp_mb__after_atomic(); - -	return new_tlb_gen; +	VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); +	VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); +	return __sme_pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH;  }  #ifdef CONFIG_PARAVIRT @@ -237,6 +233,9 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)  extern void initialize_tlbstate_and_flush(void); +/* + * flush the entire current user mapping + */  static inline void __native_flush_tlb(void)  {  	/* @@ -249,20 +248,12 @@ static inline void __native_flush_tlb(void)  	preempt_enable();  } -static inline void __native_flush_tlb_global_irq_disabled(void) -{ -	unsigned long cr4; - -	cr4 = this_cpu_read(cpu_tlbstate.cr4); -	/* clear PGE */ -	native_write_cr4(cr4 & ~X86_CR4_PGE); -	/* write old PGE again and flush TLBs */ -	native_write_cr4(cr4); -} - +/* + * flush everything + */  static inline void __native_flush_tlb_global(void)  { -	unsigned long flags; +	unsigned long cr4, flags;  	if (static_cpu_has(X86_FEATURE_INVPCID)) {  		/* @@ -280,22 +271,36 @@ static inline void __native_flush_tlb_global(void)  	 */  	raw_local_irq_save(flags); -	__native_flush_tlb_global_irq_disabled(); +	cr4 = this_cpu_read(cpu_tlbstate.cr4); +	/* toggle PGE */ +	native_write_cr4(cr4 ^ X86_CR4_PGE); +	/* write old PGE again and flush TLBs */ +	native_write_cr4(cr4);  	raw_local_irq_restore(flags);  } +/* + * flush one page in the user mapping + */  static inline void __native_flush_tlb_single(unsigned long addr)  {  	asm volatile("invlpg (%0)" ::"r" (addr) : "memory");  } +/* + * flush everything + */  static inline void __flush_tlb_all(void)  { -	if (boot_cpu_has(X86_FEATURE_PGE)) +	if (boot_cpu_has(X86_FEATURE_PGE)) {  		__flush_tlb_global(); -	else +	} else { +		/* +		 * !PGE -> !PCID (setup_pcid()), thus every flush is total. 
+		 */  		__flush_tlb(); +	}  	/*  	 * Note: if we somehow had PCID but not PGE, then this wouldn't work -- @@ -306,6 +311,9 @@ static inline void __flush_tlb_all(void)  	 */  } +/* + * flush one page in the kernel mapping + */  static inline void __flush_tlb_one(unsigned long addr)  {  	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index cd360a5e0dca..676b7cf4b62b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -97,6 +97,6 @@ void common(void) {  	/* Layout info for cpu_entry_area */  	OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss);  	OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); -	OFFSET(CPU_ENTRY_AREA_SYSENTER_stack, cpu_entry_area, SYSENTER_stack_page); -	DEFINE(SIZEOF_SYSENTER_stack, sizeof(struct SYSENTER_stack)); +	OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); +	DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack));  } diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 7d20d9c0b3d6..fa1261eefa16 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -48,7 +48,7 @@ void foo(void)  	/* Offset from the sysenter stack to tss.sp0 */  	DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - -	       offsetofend(struct cpu_entry_area, SYSENTER_stack_page.stack)); +	       offsetofend(struct cpu_entry_area, entry_stack_page.stack));  #ifdef CONFIG_CC_STACKPROTECTOR  	BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7416da3ec4df..c9757f07d738 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -506,102 +506,8 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {  	  [0 ... N_EXCEPTION_STACKS - 1]	= EXCEPTION_STKSZ,  	  [DEBUG_STACK - 1]			= DEBUG_STKSZ  }; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks -	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); -#endif - -static DEFINE_PER_CPU_PAGE_ALIGNED(struct SYSENTER_stack_page, -				   SYSENTER_stack_storage); - -static void __init -set_percpu_fixmap_pages(int idx, void *ptr, int pages, pgprot_t prot) -{ -	for ( ; pages; pages--, idx--, ptr += PAGE_SIZE) -		__set_fixmap(idx, per_cpu_ptr_to_phys(ptr), prot); -} - -/* Setup the fixmap mappings only once per-processor */ -static void __init setup_cpu_entry_area(int cpu) -{ -#ifdef CONFIG_X86_64 -	extern char _entry_trampoline[]; - -	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ -	pgprot_t gdt_prot = PAGE_KERNEL_RO; -	pgprot_t tss_prot = PAGE_KERNEL_RO; -#else -	/* -	 * On native 32-bit systems, the GDT cannot be read-only because -	 * our double fault handler uses a task gate, and entering through -	 * a task gate needs to change an available TSS to busy.  If the -	 * GDT is read-only, that will triple fault.  The TSS cannot be -	 * read-only because the CPU writes to it on task switches. -	 * -	 * On Xen PV, the GDT must be read-only because the hypervisor -	 * requires it. -	 */ -	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? 
-		PAGE_KERNEL_RO : PAGE_KERNEL; -	pgprot_t tss_prot = PAGE_KERNEL; -#endif - -	__set_fixmap(get_cpu_entry_area_index(cpu, gdt), get_cpu_gdt_paddr(cpu), gdt_prot); -	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, SYSENTER_stack_page), -				per_cpu_ptr(&SYSENTER_stack_storage, cpu), 1, -				PAGE_KERNEL); - -	/* -	 * The Intel SDM says (Volume 3, 7.2.1): -	 * -	 *  Avoid placing a page boundary in the part of the TSS that the -	 *  processor reads during a task switch (the first 104 bytes). The -	 *  processor may not correctly perform address translations if a -	 *  boundary occurs in this area. During a task switch, the processor -	 *  reads and writes into the first 104 bytes of each TSS (using -	 *  contiguous physical addresses beginning with the physical address -	 *  of the first byte of the TSS). So, after TSS access begins, if -	 *  part of the 104 bytes is not physically contiguous, the processor -	 *  will access incorrect information without generating a page-fault -	 *  exception. -	 * -	 * There are also a lot of errata involving the TSS spanning a page -	 * boundary.  Assert that we're not doing that. -	 */ -	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ -		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); -	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); -	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, tss), -				&per_cpu(cpu_tss_rw, cpu), -				sizeof(struct tss_struct) / PAGE_SIZE, -				tss_prot); - -#ifdef CONFIG_X86_32 -	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu);  #endif -#ifdef CONFIG_X86_64 -	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); -	BUILD_BUG_ON(sizeof(exception_stacks) != -		     sizeof(((struct cpu_entry_area *)0)->exception_stacks)); -	set_percpu_fixmap_pages(get_cpu_entry_area_index(cpu, exception_stacks), -				&per_cpu(exception_stacks, cpu), -				sizeof(exception_stacks) / PAGE_SIZE, -				PAGE_KERNEL); - -	__set_fixmap(get_cpu_entry_area_index(cpu, entry_trampoline), -		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); -#endif -} - -void __init setup_cpu_entry_areas(void) -{ -	unsigned int cpu; - -	for_each_possible_cpu(cpu) -		setup_cpu_entry_area(cpu); -} -  /* Load the original GDT from the per-cpu structure */  void load_direct_gdt(int cpu)  { @@ -1348,7 +1254,7 @@ void enable_sep_cpu(void)  	tss->x86_tss.ss1 = __KERNEL_CS;  	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); -	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1), 0); +	wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0);  	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0);  	put_cpu(); @@ -1465,7 +1371,7 @@ void syscall_init(void)  	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).  	 
*/  	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); -	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); +	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1));  	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);  #else  	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1680,7 +1586,7 @@ void cpu_init(void)  	 */  	set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);  	load_TR_desc(); -	load_sp0((unsigned long)(cpu_SYSENTER_stack(cpu) + 1)); +	load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1));  	load_mm_ldt(&init_mm); diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 7dbcb7adf797..8ccdca6d3f9e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -565,15 +565,6 @@ static void print_ucode(struct ucode_cpu_info *uci)  }  #else -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ -	__native_flush_tlb_global_irq_disabled(); -} -  static inline void print_ucode(struct ucode_cpu_info *uci)  {  	struct microcode_intel *mc; @@ -602,10 +593,6 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early)  	if (rev != mc->hdr.rev)  		return -1; -#ifdef CONFIG_X86_64 -	/* Flush global tlb. This is precaution. */ -	flush_tlb_early(); -#endif  	uci->cpu_sig.rev = rev;  	if (early) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index bbd6d986e2d0..36b17e0febe8 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,6 +18,7 @@  #include <linux/nmi.h>  #include <linux/sysfs.h> +#include <asm/cpu_entry_area.h>  #include <asm/stacktrace.h>  #include <asm/unwind.h> @@ -43,9 +44,9 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task,  	return true;  } -bool in_sysenter_stack(unsigned long *stack, struct stack_info *info) +bool in_entry_stack(unsigned long *stack, struct stack_info *info)  { -	struct SYSENTER_stack *ss = cpu_SYSENTER_stack(smp_processor_id()); +	struct entry_stack *ss = cpu_entry_stack(smp_processor_id());  	void *begin = ss;  	void *end = ss + 1; @@ -53,7 +54,7 @@ bool in_sysenter_stack(unsigned long *stack, struct stack_info *info)  	if ((void *)stack < begin || (void *)stack >= end)  		return false; -	info->type	= STACK_TYPE_SYSENTER; +	info->type	= STACK_TYPE_ENTRY;  	info->begin	= begin;  	info->end	= end;  	info->next_sp	= NULL; @@ -111,13 +112,13 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,  	 * - task stack  	 * - interrupt stack  	 * - HW exception stacks (double fault, nmi, debug, mce) -	 * - SYSENTER stack +	 * - entry stack  	 *  	 * x86-32 can have up to four stacks:  	 * - task stack  	 * - softirq stack  	 * - hardirq stack -	 * - SYSENTER stack +	 * - entry stack  	 */  	for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) {  		const char *stack_name; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5ff13a6b3680..04170f63e3a1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -26,8 +26,8 @@ const char *stack_type_name(enum stack_type type)  	if (type == STACK_TYPE_SOFTIRQ)  		return "SOFTIRQ"; -	if (type == STACK_TYPE_SYSENTER) -		return "SYSENTER"; +	if (type == STACK_TYPE_ENTRY) +		return "ENTRY_TRAMPOLINE";  	return NULL;  } @@ -96,7 +96,7 @@ int get_stack_info(unsigned long 
*stack, struct task_struct *task,  	if (task != current)  		goto unknown; -	if (in_sysenter_stack(stack, info)) +	if (in_entry_stack(stack, info))  		goto recursion_check;  	if (in_hardirq_stack(stack, info)) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index abc828f8c297..563e28d14f2c 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -37,8 +37,14 @@ const char *stack_type_name(enum stack_type type)  	if (type == STACK_TYPE_IRQ)  		return "IRQ"; -	if (type == STACK_TYPE_SYSENTER) -		return "SYSENTER"; +	if (type == STACK_TYPE_ENTRY) { +		/* +		 * On 64-bit, we have a generic entry stack that we +		 * use for all the kernel entry points, including +		 * SYSENTER. +		 */ +		return "ENTRY_TRAMPOLINE"; +	}  	if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST)  		return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -118,7 +124,7 @@ int get_stack_info(unsigned long *stack, struct task_struct *task,  	if (in_irq_stack(stack, info))  		goto recursion_check; -	if (in_sysenter_stack(stack, info)) +	if (in_entry_stack(stack, info))  		goto recursion_check;  	goto unknown; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1c1eae961340..a6b5d62f45a7 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -5,6 +5,11 @@   * Copyright (C) 2002 Andi Kleen   *   * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + *	contex.ldt_usr_sem + *	  mmap_sem + *	    context.lock   */  #include <linux/errno.h> @@ -42,7 +47,7 @@ static void refresh_ldt_segments(void)  #endif  } -/* context.lock is held for us, so we don't need any locking. */ +/* context.lock is held by the task which issued the smp function call */  static void flush_ldt(void *__mm)  {  	struct mm_struct *mm = __mm; @@ -99,15 +104,17 @@ static void finalize_ldt_struct(struct ldt_struct *ldt)  	paravirt_alloc_ldt(ldt->entries, ldt->nr_entries);  } -/* context.lock is held */ -static void install_ldt(struct mm_struct *current_mm, -			struct ldt_struct *ldt) +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt)  { +	mutex_lock(&mm->context.lock); +  	/* Synchronizes with READ_ONCE in load_mm_ldt. */ -	smp_store_release(¤t_mm->context.ldt, ldt); +	smp_store_release(&mm->context.ldt, ldt); -	/* Activate the LDT for all CPUs using current_mm. */ -	on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); +	/* Activate the LDT for all CPUs using currents mm. */ +	on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); + +	mutex_unlock(&mm->context.lock);  }  static void free_ldt_struct(struct ldt_struct *ldt) @@ -124,27 +131,20 @@ static void free_ldt_struct(struct ldt_struct *ldt)  }  /* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. + * Called on fork from arch_dup_mmap(). Just copy the current LDT state, + * the new task is not running, so nothing can be installed.   
*/ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)  {  	struct ldt_struct *new_ldt; -	struct mm_struct *old_mm;  	int retval = 0; -	mutex_init(&mm->context.lock); -	old_mm = current->mm; -	if (!old_mm) { -		mm->context.ldt = NULL; +	if (!old_mm)  		return 0; -	}  	mutex_lock(&old_mm->context.lock); -	if (!old_mm->context.ldt) { -		mm->context.ldt = NULL; +	if (!old_mm->context.ldt)  		goto out_unlock; -	}  	new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries);  	if (!new_ldt) { @@ -180,7 +180,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)  	unsigned long entries_size;  	int retval; -	mutex_lock(&mm->context.lock); +	down_read(&mm->context.ldt_usr_sem);  	if (!mm->context.ldt) {  		retval = 0; @@ -209,7 +209,7 @@ static int read_ldt(void __user *ptr, unsigned long bytecount)  	retval = bytecount;  out_unlock: -	mutex_unlock(&mm->context.lock); +	up_read(&mm->context.ldt_usr_sem);  	return retval;  } @@ -269,7 +269,8 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)  			ldt.avl = 0;  	} -	mutex_lock(&mm->context.lock); +	if (down_write_killable(&mm->context.ldt_usr_sem)) +		return -EINTR;  	old_ldt       = mm->context.ldt;  	old_nr_entries = old_ldt ? old_ldt->nr_entries : 0; @@ -291,7 +292,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)  	error = 0;  out_unlock: -	mutex_unlock(&mm->context.lock); +	up_write(&mm->context.ldt_usr_sem);  out:  	return error;  } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 35cb20994e32..c5970efa8557 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -932,12 +932,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle,  	initial_code = (unsigned long)start_secondary;  	initial_stack  = idle->thread.sp; -	/* -	 * Enable the espfix hack for this CPU -	*/ -#ifdef CONFIG_X86_ESPFIX64 +	/* Enable the espfix hack for this CPU */  	init_espfix_ap(cpu); -#endif  	/* So we see what's up */  	announce_cpu(cpu, apicid); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e98f8b66a460..f69dbd47d733 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -51,6 +51,7 @@  #include <asm/traps.h>  #include <asm/desc.h>  #include <asm/fpu/internal.h> +#include <asm/cpu_entry_area.h>  #include <asm/mce.h>  #include <asm/fixmap.h>  #include <asm/mach_traps.h> @@ -951,8 +952,9 @@ void __init trap_init(void)  	 * "sidt" instruction will not leak the location of the kernel, and  	 * to defend the IDT against arbitrary memory write vulnerabilities.  	 
* It will be reloaded in cpu_init() */ -	__set_fixmap(FIX_RO_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); -	idt_descr.address = fix_to_virt(FIX_RO_IDT); +	cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table), +		    PAGE_KERNEL_RO); +	idt_descr.address = CPU_ENTRY_AREA_RO_IDT;  	/*  	 * Should be a barrier for any external CPU state: diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 8e13b8cc6bed..52195ee3f6d5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -10,7 +10,7 @@ CFLAGS_REMOVE_mem_encrypt.o	= -pg  endif  obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ -	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o +	    pat.o pgtable.o physaddr.o setup_nx.o tlb.o cpu_entry_area.o  # Make sure __phys_addr has no stackprotector  nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c new file mode 100644 index 000000000000..fe814fd5e014 --- /dev/null +++ b/arch/x86/mm/cpu_entry_area.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/spinlock.h> +#include <linux/percpu.h> + +#include <asm/cpu_entry_area.h> +#include <asm/pgtable.h> +#include <asm/fixmap.h> +#include <asm/desc.h> + +static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); + +#ifdef CONFIG_X86_64 +static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +#endif + +struct cpu_entry_area *get_cpu_entry_area(int cpu) +{ +	unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE; +	BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0); + +	return (struct cpu_entry_area *) va; +} +EXPORT_SYMBOL(get_cpu_entry_area); + +void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags) +{ +	unsigned long va = (unsigned long) cea_vaddr; + +	set_pte_vaddr(va, pfn_pte(pa >> PAGE_SHIFT, flags)); +} + +static void __init +cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) +{ +	for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE) +		cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); +} + +/* Setup the fixmap mappings only once per-processor */ +static void __init setup_cpu_entry_area(int cpu) +{ +#ifdef CONFIG_X86_64 +	extern char _entry_trampoline[]; + +	/* On 64-bit systems, we use a read-only fixmap GDT and TSS. */ +	pgprot_t gdt_prot = PAGE_KERNEL_RO; +	pgprot_t tss_prot = PAGE_KERNEL_RO; +#else +	/* +	 * On native 32-bit systems, the GDT cannot be read-only because +	 * our double fault handler uses a task gate, and entering through +	 * a task gate needs to change an available TSS to busy.  If the +	 * GDT is read-only, that will triple fault.  The TSS cannot be +	 * read-only because the CPU writes to it on task switches. +	 * +	 * On Xen PV, the GDT must be read-only because the hypervisor +	 * requires it. +	 */ +	pgprot_t gdt_prot = boot_cpu_has(X86_FEATURE_XENPV) ? +		PAGE_KERNEL_RO : PAGE_KERNEL; +	pgprot_t tss_prot = PAGE_KERNEL; +#endif + +	cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), +		    gdt_prot); + +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, +			     per_cpu_ptr(&entry_stack_storage, cpu), 1, +			     PAGE_KERNEL); + +	/* +	 * The Intel SDM says (Volume 3, 7.2.1): +	 * +	 *  Avoid placing a page boundary in the part of the TSS that the +	 *  processor reads during a task switch (the first 104 bytes). 
The +	 *  processor may not correctly perform address translations if a +	 *  boundary occurs in this area. During a task switch, the processor +	 *  reads and writes into the first 104 bytes of each TSS (using +	 *  contiguous physical addresses beginning with the physical address +	 *  of the first byte of the TSS). So, after TSS access begins, if +	 *  part of the 104 bytes is not physically contiguous, the processor +	 *  will access incorrect information without generating a page-fault +	 *  exception. +	 * +	 * There are also a lot of errata involving the TSS spanning a page +	 * boundary.  Assert that we're not doing that. +	 */ +	BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ +		      offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); +	BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, +			     &per_cpu(cpu_tss_rw, cpu), +			     sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); + +#ifdef CONFIG_X86_32 +	per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); +#endif + +#ifdef CONFIG_X86_64 +	BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); +	BUILD_BUG_ON(sizeof(exception_stacks) != +		     sizeof(((struct cpu_entry_area *)0)->exception_stacks)); +	cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, +			     &per_cpu(exception_stacks, cpu), +			     sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); + +	cea_set_pte(&get_cpu_entry_area(cpu)->entry_trampoline, +		     __pa_symbol(_entry_trampoline), PAGE_KERNEL_RX); +#endif +} + +static __init void setup_cpu_entry_area_ptes(void) +{ +#ifdef CONFIG_X86_32 +	unsigned long start, end; + +	BUILD_BUG_ON(CPU_ENTRY_AREA_PAGES * PAGE_SIZE < CPU_ENTRY_AREA_MAP_SIZE); +	BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK); + +	start = CPU_ENTRY_AREA_BASE; +	end = start + CPU_ENTRY_AREA_MAP_SIZE; + +	/* Careful here: start + PMD_SIZE might wrap around */ +	for (; start < end && start >= CPU_ENTRY_AREA_BASE; start += PMD_SIZE) +		populate_extra_pte(start); +#endif +} + +void __init setup_cpu_entry_areas(void) +{ +	unsigned int cpu; + +	setup_cpu_entry_area_ptes(); + +	for_each_possible_cpu(cpu) +		setup_cpu_entry_area(cpu); +} diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 5e3ac6fe6c9e..43dedbfb7257 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -44,10 +44,12 @@ struct addr_marker {  	unsigned long max_lines;  }; -/* indices for address_markers; keep sync'd w/ address_markers below */ +/* Address space markers hints */ + +#ifdef CONFIG_X86_64 +  enum address_markers_idx {  	USER_SPACE_NR = 0, -#ifdef CONFIG_X86_64  	KERNEL_SPACE_NR,  	LOW_KERNEL_NR,  	VMALLOC_START_NR, @@ -56,56 +58,74 @@ enum address_markers_idx {  	KASAN_SHADOW_START_NR,  	KASAN_SHADOW_END_NR,  #endif -# ifdef CONFIG_X86_ESPFIX64 +	CPU_ENTRY_AREA_NR, +#ifdef CONFIG_X86_ESPFIX64  	ESPFIX_START_NR, -# endif +#endif +#ifdef CONFIG_EFI +	EFI_END_NR, +#endif  	HIGH_KERNEL_NR,  	MODULES_VADDR_NR,  	MODULES_END_NR, -#else +	FIXADDR_START_NR, +	END_OF_SPACE_NR, +}; + +static struct addr_marker address_markers[] = { +	[USER_SPACE_NR]		= { 0,			"User Space" }, +	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" }, +	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" }, +	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" }, +	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" }, +#ifdef CONFIG_KASAN +	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" }, +	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" }, +#endif +	
[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" }, +#ifdef CONFIG_X86_ESPFIX64 +	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 }, +#endif +#ifdef CONFIG_EFI +	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" }, +#endif +	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" }, +	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" }, +	[MODULES_END_NR]	= { MODULES_END,	"End Modules" }, +	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" }, +	[END_OF_SPACE_NR]	= { -1,			NULL } +}; + +#else /* CONFIG_X86_64 */ + +enum address_markers_idx { +	USER_SPACE_NR = 0,  	KERNEL_SPACE_NR,  	VMALLOC_START_NR,  	VMALLOC_END_NR, -# ifdef CONFIG_HIGHMEM +#ifdef CONFIG_HIGHMEM  	PKMAP_BASE_NR, -# endif -	FIXADDR_START_NR,  #endif +	CPU_ENTRY_AREA_NR, +	FIXADDR_START_NR, +	END_OF_SPACE_NR,  }; -/* Address space markers hints */  static struct addr_marker address_markers[] = { -	{ 0, "User Space" }, -#ifdef CONFIG_X86_64 -	{ 0x8000000000000000UL, "Kernel Space" }, -	{ 0/* PAGE_OFFSET */,   "Low Kernel Mapping" }, -	{ 0/* VMALLOC_START */, "vmalloc() Area" }, -	{ 0/* VMEMMAP_START */, "Vmemmap" }, -#ifdef CONFIG_KASAN -	{ KASAN_SHADOW_START,	"KASAN shadow" }, -	{ KASAN_SHADOW_END,	"KASAN shadow end" }, +	[USER_SPACE_NR]		= { 0,			"User Space" }, +	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" }, +	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" }, +	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" }, +#ifdef CONFIG_HIGHMEM +	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },  #endif -# ifdef CONFIG_X86_ESPFIX64 -	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 }, -# endif -# ifdef CONFIG_EFI -	{ EFI_VA_END,		"EFI Runtime Services" }, -# endif -	{ __START_KERNEL_map,   "High Kernel Mapping" }, -	{ MODULES_VADDR,        "Modules" }, -	{ MODULES_END,          "End Modules" }, -#else -	{ PAGE_OFFSET,          "Kernel Mapping" }, -	{ 0/* VMALLOC_START */, "vmalloc() Area" }, -	{ 0/*VMALLOC_END*/,     "vmalloc() End" }, -# ifdef CONFIG_HIGHMEM -	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" }, -# endif -	{ 0/*FIXADDR_START*/,   "Fixmap Area" }, -#endif -	{ -1, NULL }		/* End of list */ +	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" }, +	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" }, +	[END_OF_SPACE_NR]	= { -1,			NULL }  }; +#endif /* !CONFIG_X86_64 */ +  /* Multipliers for offsets within the PTEs */  #define PTE_LEVEL_MULT (PAGE_SIZE)  #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) @@ -140,7 +160,7 @@ static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)  	static const char * const level_name[] =  		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" }; -	if (!pgprot_val(prot)) { +	if (!(pr & _PAGE_PRESENT)) {  		/* Not present */  		pt_dump_cont_printf(m, dmsg, "                              ");  	} else { @@ -525,8 +545,8 @@ static int __init pt_dump_init(void)  	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;  # endif  	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; +	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;  #endif -  	return 0;  }  __initcall(pt_dump_init); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a64a6f2848d..135c9a7898c7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -50,6 +50,7 @@  #include <asm/setup.h>  #include <asm/set_memory.h>  #include <asm/page_types.h> +#include <asm/cpu_entry_area.h>  #include <asm/init.h>  #include "mm_internal.h" @@ -766,6 +767,7 @@ void __init mem_init(void)  	mem_init_print_info(NULL);  	printk(KERN_INFO "virtual 
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 8a64a6f2848d..135c9a7898c7 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -50,6 +50,7 @@
 #include <asm/setup.h>
 #include <asm/set_memory.h>
 #include <asm/page_types.h>
+#include <asm/cpu_entry_area.h>
 #include <asm/init.h>
 
 #include "mm_internal.h"
@@ -766,6 +767,7 @@ void __init mem_init(void)
 	mem_init_print_info(NULL);
 	printk(KERN_INFO "virtual kernel memory layout:\n"
 		"    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"
+		"  cpu_entry : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #ifdef CONFIG_HIGHMEM
 		"    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"
 #endif
@@ -777,6 +779,10 @@ void __init mem_init(void)
 		FIXADDR_START, FIXADDR_TOP,
 		(FIXADDR_TOP - FIXADDR_START) >> 10,
 
+		CPU_ENTRY_AREA_BASE,
+		CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE,
+		CPU_ENTRY_AREA_MAP_SIZE >> 10,
+
 #ifdef CONFIG_HIGHMEM
 		PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
 		(LAST_PKMAP*PAGE_SIZE) >> 10,
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 9ec70d780f1f..47388f0c0e59 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -15,6 +15,7 @@
 #include <asm/tlbflush.h>
 #include <asm/sections.h>
 #include <asm/pgtable.h>
+#include <asm/cpu_entry_area.h>
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
@@ -322,31 +323,33 @@ void __init kasan_init(void)
 		map_range(&pfn_mapped[i]);
 	}
 
-	kasan_populate_zero_shadow(
-		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
-		kasan_mem_to_shadow((void *)__START_KERNEL_map));
-
-	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
-			      (unsigned long)kasan_mem_to_shadow(_end),
-			      early_pfn_to_nid(__pa(_stext)));
-
-	shadow_cpu_entry_begin = (void *)__fix_to_virt(FIX_CPU_ENTRY_AREA_BOTTOM);
+	shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
 	shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
 	shadow_cpu_entry_begin = (void *)round_down((unsigned long)shadow_cpu_entry_begin,
 						PAGE_SIZE);
 
-	shadow_cpu_entry_end = (void *)(__fix_to_virt(FIX_CPU_ENTRY_AREA_TOP) + PAGE_SIZE);
+	shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
+					CPU_ENTRY_AREA_MAP_SIZE);
 	shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
 	shadow_cpu_entry_end = (void *)round_up((unsigned long)shadow_cpu_entry_end,
 					PAGE_SIZE);
 
-	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
-				   shadow_cpu_entry_begin);
+	kasan_populate_zero_shadow(
+		kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+		shadow_cpu_entry_begin);
 
 	kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
 			      (unsigned long)shadow_cpu_entry_end, 0);
 
-	kasan_populate_zero_shadow(shadow_cpu_entry_end, (void *)KASAN_SHADOW_END);
+	kasan_populate_zero_shadow(shadow_cpu_entry_end,
+				kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+	kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+			      (unsigned long)kasan_mem_to_shadow(_end),
+			      early_pfn_to_nid(__pa(_stext)));
+
+	kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+				(void *)KASAN_SHADOW_END);
 
 	load_cr3(init_top_pgt);
 	__flush_tlb_all();
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 6b9bf023a700..c3c5274410a9 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -10,6 +10,7 @@
 #include <linux/pagemap.h>
 #include <linux/spinlock.h>
 
+#include <asm/cpu_entry_area.h>
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
 #include <asm/fixmap.h>
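The kasan_init() rework above only re-orders which ranges get real shadow versus zero shadow now that the cpu_entry_area lives at CPU_ENTRY_AREA_BASE; the underlying address-to-shadow translation stays the same. A rough standalone sketch of that translation, assuming the usual one shadow byte per eight bytes of address space (the offset constant below is a placeholder, not necessarily the kernel's KASAN_SHADOW_OFFSET):

/*
 * shadow = (addr >> 3) + offset: every 8-byte granule of address
 * space is tracked by one shadow byte.
 */
#include <stdio.h>

#define SHADOW_SCALE_SHIFT	3
#define SHADOW_OFFSET		0xdffffc0000000000UL	/* placeholder */

static unsigned long mem_to_shadow(unsigned long addr)
{
	return (addr >> SHADOW_SCALE_SHIFT) + SHADOW_OFFSET;
}

int main(void)
{
	unsigned long a = 0xffffffff81000000UL;

	/* the 8 bytes starting at 'a' map to a single shadow byte */
	printf("%lx %lx\n", mem_to_shadow(a), mem_to_shadow(a + 7));
	return 0;
}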
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3118392cdf75..0a1be3adc97e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -128,7 +128,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	 * isn't free.
 	 */
 #ifdef CONFIG_DEBUG_VM
-	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev, prev_asid))) {
+	if (WARN_ON_ONCE(__read_cr3() != build_cr3(real_prev->pgd, prev_asid))) {
 		/*
 		 * If we were to BUG here, we'd be very likely to kill
 		 * the system so hard that we don't see the call trace.
@@ -195,7 +195,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		if (need_flush) {
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
 			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-			write_cr3(build_cr3(next, new_asid));
+			write_cr3(build_cr3(next->pgd, new_asid));
 
 			/*
 			 * NB: This gets called via leave_mm() in the idle path
@@ -208,7 +208,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 		} else {
 			/* The new ASID is already up to date. */
-			write_cr3(build_cr3_noflush(next, new_asid));
+			write_cr3(build_cr3_noflush(next->pgd, new_asid));
 
 			/* See above wrt _rcuidle. */
 			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
@@ -288,7 +288,7 @@ void initialize_tlbstate_and_flush(void)
 		!(cr4_read_shadow() & X86_CR4_PCIDE));
 
 	/* Force ASID 0 and force a TLB flush. */
-	write_cr3(build_cr3(mm, 0));
+	write_cr3(build_cr3(mm->pgd, 0));
 
 	/* Reinitialize tlbstate. */
 	this_cpu_write(cpu_tlbstate.loaded_mm_asid, 0);
@@ -551,7 +551,7 @@ static void do_kernel_range_flush(void *info)
 
 	/* flush range by one by one 'invlpg' */
 	for (addr = f->start; addr < f->end; addr += PAGE_SIZE)
-		__flush_tlb_single(addr);
+		__flush_tlb_one(addr);
 }
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index f44c0bc95aa2..8538a6723171 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -299,7 +299,7 @@ static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
 		local_flush_tlb();
 		stat->d_alltlb++;
 	} else {
-		__flush_tlb_one(msg->address);
+		__flush_tlb_single(msg->address);
 		stat->d_onetlb++;
 	}
 	stat->d_requestee++;
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 69145ea5532c..4d62c071b166 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -2273,7 +2273,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 	switch (idx) {
 	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
-	case FIX_RO_IDT:
 #ifdef CONFIG_X86_32
 	case FIX_WP_TEST:
 # ifdef CONFIG_HIGHMEM
@@ -2284,7 +2283,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 #endif
 	case FIX_TEXT_POKE0:
 	case FIX_TEXT_POKE1:
-	case FIX_CPU_ENTRY_AREA_TOP ... FIX_CPU_ENTRY_AREA_BOTTOM:
 		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;
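The build_cr3() and build_cr3_noflush() call sites above now pass the PGD pointer instead of the mm. On x86-64 the resulting CR3 value is essentially the physical address of that page-table root with the ASID's PCID folded into the low 12 bits when PCIDs are enabled. A simplified standalone sketch of that layout (the helper name and example address are made up; the kernel's own helpers differ in detail, e.g. in how the ASID maps to a PCID):

/*
 * CR3 layout sketch: page-aligned physical address of the top-level
 * page table, with a 12-bit PCID in the low bits.
 */
#include <stdio.h>
#include <stdint.h>

#define CR3_ADDR_MASK	0x7ffffffffffff000ULL	/* page-aligned root */
#define CR3_PCID_MASK	0xfffULL		/* 12-bit PCID field */

static uint64_t make_cr3(uint64_t pgd_phys, uint16_t pcid)
{
	return (pgd_phys & CR3_ADDR_MASK) | (pcid & CR3_PCID_MASK);
}

int main(void)
{
	/* hypothetical physical address of a PGD page, PCID 1 */
	uint64_t cr3 = make_cr3(0x1b2c3d000ULL, 1);

	printf("cr3 = %#llx\n", (unsigned long long)cr3);
	return 0;
}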