From 6a0b41d1e23dd3318568461593ae5e36d966981e Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 6 Mar 2017 17:17:17 +0300 Subject: x86/mm: Introduce arch_rnd() to compute 32/64 mmap random base The compat (32bit) mmap() sycall issued by a 64-bit task results in a mapping above 4GB. That's outside the compat mode address space and prevents CRIU to restore 32bit processes from a 64bit application. As a first step to address this, split out the address base randomizing calculation from arch_mmap_rnd() into a helper function, which can be used independent of mmap_ia32() based decisions. [ tglx: Massaged changelog ] Suggested-by: Thomas Gleixner Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170306141721.9188-2-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/mmap.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 7940166c799b..f31ed7097d0b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -55,6 +55,14 @@ static unsigned long stack_maxrandom_size(void) #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) #define MAX_GAP (TASK_SIZE/6*5) +#ifdef CONFIG_COMPAT +# define mmap32_rnd_bits mmap_rnd_compat_bits +# define mmap64_rnd_bits mmap_rnd_bits +#else +# define mmap32_rnd_bits mmap_rnd_bits +# define mmap64_rnd_bits mmap_rnd_bits +#endif + static int mmap_is_legacy(void) { if (current->personality & ADDR_COMPAT_LAYOUT) @@ -66,20 +74,14 @@ static int mmap_is_legacy(void) return sysctl_legacy_va_layout; } -unsigned long arch_mmap_rnd(void) +static unsigned long arch_rnd(unsigned int rndbits) { - unsigned long rnd; - - if (mmap_is_ia32()) -#ifdef CONFIG_COMPAT - rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1); -#else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); -#endif - else - rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1); + return (get_random_long() & ((1UL << rndbits) - 1)) << PAGE_SHIFT; +} - return rnd << PAGE_SHIFT; +unsigned long arch_mmap_rnd(void) +{ + return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } static unsigned long mmap_base(unsigned long rnd) -- cgit v1.2.3 From 8f3e474f3cea7b2470218a6ed6da47ff02147dce Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 6 Mar 2017 17:17:18 +0300 Subject: x86/mm: Add task_size parameter to mmap_base() To correctly handle 32-bit and 64-bit mmap() syscalls in 64bit applications its required to have separate address bases to place a mapping. The tasksize can be used as an indicator to select the proper parameters for mmap_base(). This requires the following changes: - Add task_size argument to mmap_base() and make the calculation based on it. - Provide mmap_legacy_base() as a seperate function - Use the new functions in arch_pick_mmap_layout() [ tglx: Massaged changelog ] Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. 
Shutemov" Link: http://lkml.kernel.org/r/20170306141721.9188-3-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/elf.h | 24 ++++++++++--------- arch/x86/include/asm/processor.h | 4 +++- arch/x86/mm/mmap.c | 50 +++++++++++++++++++++++++--------------- 3 files changed, 48 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 9d49c18b5ea9..b908141cf0c4 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -293,8 +293,19 @@ do { \ } \ } while (0) +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static inline int mmap_is_ia32(void) +{ + return IS_ENABLED(CONFIG_X86_32) || + (IS_ENABLED(CONFIG_COMPAT) && + test_thread_flag(TIF_ADDR32)); +} + #ifdef CONFIG_X86_32 +#define __STACK_RND_MASK(is32bit) (0x7ff) #define STACK_RND_MASK (0x7ff) #define ARCH_DLINFO ARCH_DLINFO_IA32 @@ -304,7 +315,8 @@ do { \ #else /* CONFIG_X86_32 */ /* 1GB for 64bit, 8MB for 32bit */ -#define STACK_RND_MASK (test_thread_flag(TIF_ADDR32) ? 0x7ff : 0x3fffff) +#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) +#define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) #define ARCH_DLINFO \ do { \ @@ -348,16 +360,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp); #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages -/* - * True on X86_32 or when emulating IA32 on X86_64 - */ -static inline int mmap_is_ia32(void) -{ - return IS_ENABLED(CONFIG_X86_32) || - (IS_ENABLED(CONFIG_COMPAT) && - test_thread_flag(TIF_ADDR32)); -} - /* Do not change the values. See get_align_mask() */ enum align_flags { ALIGN_VA_32 = BIT(0), diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f385eca5407a..7caa2ac50ea2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -797,6 +797,7 @@ static inline void spin_lock_prefetch(const void *x) /* * User space process size: 3GB (default). */ +#define IA32_PAGE_OFFSET PAGE_OFFSET #define TASK_SIZE PAGE_OFFSET #define TASK_SIZE_MAX TASK_SIZE #define STACK_TOP TASK_SIZE @@ -873,7 +874,8 @@ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, * This decides where the kernel will search for a free chunk of vm * space during mmap's. */ -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) +#define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) +#define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE) #define KSTK_EIP(task) (task_pt_regs(task)->ip) diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index f31ed7097d0b..1e9cb945dca1 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -36,25 +36,23 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned long stack_maxrandom_size(void) +static inline unsigned long tasksize_32bit(void) +{ + return IA32_PAGE_OFFSET; +} + +static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; + max = (-1UL) & __STACK_RND_MASK(task_size == tasksize_32bit()); + max <<= PAGE_SHIFT; } return max; } -/* - * Top of mmap area (just below the process stack). - * - * Leave an at least ~128 MB hole with possible stack randomization. 
- */ -#define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) -#define MAX_GAP (TASK_SIZE/6*5) - #ifdef CONFIG_COMPAT # define mmap32_rnd_bits mmap_rnd_compat_bits # define mmap64_rnd_bits mmap_rnd_bits @@ -63,6 +61,8 @@ static unsigned long stack_maxrandom_size(void) # define mmap64_rnd_bits mmap_rnd_bits #endif +#define SIZE_128M (128 * 1024 * 1024UL) + static int mmap_is_legacy(void) { if (current->personality & ADDR_COMPAT_LAYOUT) @@ -84,16 +84,30 @@ unsigned long arch_mmap_rnd(void) return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } -static unsigned long mmap_base(unsigned long rnd) +static unsigned long mmap_base(unsigned long rnd, unsigned long task_size) { unsigned long gap = rlimit(RLIMIT_STACK); + unsigned long gap_min, gap_max; + + /* + * Top of mmap area (just below the process stack). + * Leave an at least ~128 MB hole with possible stack randomization. + */ + gap_min = SIZE_128M + stack_maxrandom_size(task_size); + gap_max = (task_size / 6) * 5; - if (gap < MIN_GAP) - gap = MIN_GAP; - else if (gap > MAX_GAP) - gap = MAX_GAP; + if (gap < gap_min) + gap = gap_min; + else if (gap > gap_max) + gap = gap_max; - return PAGE_ALIGN(TASK_SIZE - gap - rnd); + return PAGE_ALIGN(task_size - gap - rnd); +} + +static unsigned long mmap_legacy_base(unsigned long rnd, + unsigned long task_size) +{ + return __TASK_UNMAPPED_BASE(task_size) + rnd; } /* @@ -107,13 +121,13 @@ void arch_pick_mmap_layout(struct mm_struct *mm) if (current->flags & PF_RANDOMIZE) random_factor = arch_mmap_rnd(); - mm->mmap_legacy_base = TASK_UNMAPPED_BASE + random_factor; + mm->mmap_legacy_base = mmap_legacy_base(random_factor, TASK_SIZE); if (mmap_is_legacy()) { mm->mmap_base = mm->mmap_legacy_base; mm->get_unmapped_area = arch_get_unmapped_area; } else { - mm->mmap_base = mmap_base(random_factor); + mm->mmap_base = mmap_base(random_factor, TASK_SIZE); mm->get_unmapped_area = arch_get_unmapped_area_topdown; } } -- cgit v1.2.3 From 1b028f784e8c341e762c264f70dc0ca1418c8b7a Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 6 Mar 2017 17:17:19 +0300 Subject: x86/mm: Introduce mmap_compat_base() for 32-bit mmap() mmap() uses a base address, from which it starts to look for a free space for allocation. The base address is stored in mm->mmap_base, which is calculated during exec(). The address depends on task's size, set rlimit for stack, ASLR randomization. The base depends on the task size and the number of random bits which are different for 64-bit and 32bit applications. Due to the fact, that the base address is fixed, its mmap() from a compat (32bit) syscall issued by a 64bit task will return a address which is based on the 64bit base address and does not fit into the 32bit address space (4GB). The returned pointer is truncated to 32bit, which results in an invalid address. To solve store a seperate compat address base plus a compat legacy address base in mm_struct. These bases are calculated at exec() time and can be used later to address the 32bit compat mmap() issued by 64 bit applications. As a consequence of this change 32-bit applications issuing a 64-bit syscall (after doing a long jump) will get a 64-bit mapping now. Before this change 32-bit applications always got a 32bit mapping. [ tglx: Massaged changelog and added a comment ] Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. 
Shutemov" Link: http://lkml.kernel.org/r/20170306141721.9188-4-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/Kconfig | 7 +++++++ arch/x86/Kconfig | 1 + arch/x86/include/asm/elf.h | 3 +++ arch/x86/kernel/sys_x86_64.c | 23 ++++++++++++++++++---- arch/x86/mm/mmap.c | 47 ++++++++++++++++++++++++++++++++------------ include/linux/mm_types.h | 5 +++++ 6 files changed, 69 insertions(+), 17 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index cd211a14a88f..c4d6833aacd9 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -700,6 +700,13 @@ config ARCH_MMAP_RND_COMPAT_BITS This value can be changed after boot using the /proc/sys/vm/mmap_rnd_compat_bits tunable +config HAVE_ARCH_COMPAT_MMAP_BASES + bool + help + This allows 64bit applications to invoke 32-bit mmap() syscall + and vice-versa 32-bit applications to call 64-bit mmap(). + Required for applications doing different bitness syscalls. + config HAVE_COPY_THREAD_TLS bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a294ee..2bab9d093b51 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -106,6 +106,7 @@ config X86 select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT + select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index b908141cf0c4..ac5be5ba8527 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -303,6 +303,9 @@ static inline int mmap_is_ia32(void) test_thread_flag(TIF_ADDR32)); } +extern unsigned long tasksize_32bit(void); +extern unsigned long tasksize_64bit(void); + #ifdef CONFIG_X86_32 #define __STACK_RND_MASK(is32bit) (0x7ff) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 50215a4b9347..c54817baabc7 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -17,6 +17,8 @@ #include #include +#include +#include #include #include @@ -98,6 +100,18 @@ out: return error; } +static unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_compat_syscall()) + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; +#endif + return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; +} + static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { @@ -114,10 +128,11 @@ static void find_start_end(unsigned long flags, unsigned long *begin, if (current->flags & PF_RANDOMIZE) { *begin = randomize_page(*begin, 0x02000000); } - } else { - *begin = current->mm->mmap_legacy_base; - *end = TASK_SIZE; + return; } + + *begin = get_mmap_base(1); + *end = in_compat_syscall() ? 
tasksize_32bit() : tasksize_64bit(); } unsigned long @@ -191,7 +206,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; if (filp) { diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 1e9cb945dca1..529ab79800af 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -36,11 +36,16 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -static inline unsigned long tasksize_32bit(void) +unsigned long tasksize_32bit(void) { return IA32_PAGE_OFFSET; } +unsigned long tasksize_64bit(void) +{ + return TASK_SIZE_MAX; +} + static unsigned long stack_maxrandom_size(unsigned long task_size) { unsigned long max = 0; @@ -81,6 +86,8 @@ static unsigned long arch_rnd(unsigned int rndbits) unsigned long arch_mmap_rnd(void) { + if (!(current->flags & PF_RANDOMIZE)) + return 0; return arch_rnd(mmap_is_ia32() ? mmap32_rnd_bits : mmap64_rnd_bits); } @@ -114,22 +121,36 @@ static unsigned long mmap_legacy_base(unsigned long rnd, * This function, called very early during the creation of a new * process VM image, sets up which VM layout function to use: */ -void arch_pick_mmap_layout(struct mm_struct *mm) +static void arch_pick_mmap_base(unsigned long *base, unsigned long *legacy_base, + unsigned long random_factor, unsigned long task_size) { - unsigned long random_factor = 0UL; - - if (current->flags & PF_RANDOMIZE) - random_factor = arch_mmap_rnd(); - - mm->mmap_legacy_base = mmap_legacy_base(random_factor, TASK_SIZE); + *legacy_base = mmap_legacy_base(random_factor, task_size); + if (mmap_is_legacy()) + *base = *legacy_base; + else + *base = mmap_base(random_factor, task_size); +} - if (mmap_is_legacy()) { - mm->mmap_base = mm->mmap_legacy_base; +void arch_pick_mmap_layout(struct mm_struct *mm) +{ + if (mmap_is_legacy()) mm->get_unmapped_area = arch_get_unmapped_area; - } else { - mm->mmap_base = mmap_base(random_factor, TASK_SIZE); + else mm->get_unmapped_area = arch_get_unmapped_area_topdown; - } + + arch_pick_mmap_base(&mm->mmap_base, &mm->mmap_legacy_base, + arch_rnd(mmap64_rnd_bits), tasksize_64bit()); + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* + * The mmap syscall mapping base decision depends solely on the + * syscall type (64-bit or compat). This applies for 64bit + * applications and 32bit applications. The 64bit syscall uses + * mmap_base, the compat syscall uses mmap_compat_base. 
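[ Editor's illustration: why one shared 64-bit base could not serve compat mmap(), as described in the changelog above. The sample value is an assumption; only the truncation effect matters. ]

	unsigned long base64 = 0x00007f0a12340000UL;	/* typical 64-bit mmap_base */
	unsigned int  compat_view = (unsigned int)base64;
	/* compat_view == 0x12340000: the upper 32 bits are lost, so the
	 * 32-bit caller ends up with a pointer into unrelated or unmapped
	 * memory - hence the separate compat bases kept below 4GB. */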
+ */ + arch_pick_mmap_base(&mm->mmap_compat_base, &mm->mmap_compat_legacy_base, + arch_rnd(mmap32_rnd_bits), tasksize_32bit()); +#endif } const char *arch_vma_name(struct vm_area_struct *vma) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f60f45fe226f..45cdb27791a3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -367,6 +367,11 @@ struct mm_struct { #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* Base adresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; +#endif unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; -- cgit v1.2.3 From 3e6ef9c80946f781fc25e8490c9875b1d2b61158 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Mon, 6 Mar 2017 17:17:20 +0300 Subject: x86/mm: Make mmap(MAP_32BIT) work correctly mmap(MAP_32BIT) is broken due to the dependency on the TIF_ADDR32 thread flag. For 64bit applications MAP_32BIT will force legacy bottom-up allocations and the 1GB address space restriction even if the application issued a compat syscall, which should not be subject of these restrictions. For 32bit applications, which issue 64bit syscalls the newly introduced mmap base separation into 64-bit and compat bases changed the behaviour because now a 64-bit mapping is returned, but due to the TIF_ADDR32 dependency MAP_32BIT is ignored. Before the separation a 32-bit mapping was returned, so the MAP_32BIT handling was irrelevant. Replace the check for TIF_ADDR32 with a check for the compat syscall. That solves both the 64-bit issuing a compat syscall and the 32-bit issuing a 64-bit syscall problems. [ tglx: Massaged changelog ] Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170306141721.9188-5-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/sys_x86_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index c54817baabc7..63e89dfc808a 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -115,7 +115,7 @@ static unsigned long get_mmap_base(int is_legacy) static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) { + if (!in_compat_syscall() && (flags & MAP_32BIT)) { /* This is usually used needed to map code in small model, so it needs to be in the first 31bit. Limit it to that. This means we need to move the @@ -191,7 +191,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; /* for MAP_32BIT mappings we force the legacy mmap base */ - if (!test_thread_flag(TIF_ADDR32) && (flags & MAP_32BIT)) + if (!in_compat_syscall() && (flags & MAP_32BIT)) goto bottomup; /* requesting a specific address */ -- cgit v1.2.3 From fe1e8c3e9634071ac608172e29bf997596d17c7c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 13 Mar 2017 17:33:04 +0300 Subject: x86/mm: Extend headers with basic definitions to support 5-level paging This patch extends x86 headers to enable 5-level paging support. It's still based on . We will get to the point where we can have later. Signed-off-by: Kirill A. 
Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable-2level_types.h | 1 + arch/x86/include/asm/pgtable-3level_types.h | 1 + arch/x86/include/asm/pgtable.h | 26 ++++++++++++++++++++----- arch/x86/include/asm/pgtable_64_types.h | 1 + arch/x86/include/asm/pgtable_types.h | 30 ++++++++++++++++++++++++++++- 5 files changed, 53 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h index 392576433e77..373ab1de909f 100644 --- a/arch/x86/include/asm/pgtable-2level_types.h +++ b/arch/x86/include/asm/pgtable-2level_types.h @@ -7,6 +7,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h index bcc89625ebe5..b8a4341faafa 100644 --- a/arch/x86/include/asm/pgtable-3level_types.h +++ b/arch/x86/include/asm/pgtable-3level_types.h @@ -7,6 +7,7 @@ typedef u64 pteval_t; typedef u64 pmdval_t; typedef u64 pudval_t; +typedef u64 p4dval_t; typedef u64 pgdval_t; typedef u64 pgprotval_t; diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1cfb36b8c024..6f6f351e0a81 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -179,6 +179,17 @@ static inline unsigned long pud_pfn(pud_t pud) return (pud_val(pud) & pud_pfn_mask(pud)) >> PAGE_SHIFT; } +static inline unsigned long p4d_pfn(p4d_t p4d) +{ + return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT; +} + +static inline int p4d_large(p4d_t p4d) +{ + /* No 512 GiB pages yet */ + return 0; +} + #define pte_page(pte) pfn_to_page(pte_pfn(pte)) static inline int pmd_large(pmd_t pte) @@ -770,6 +781,16 @@ static inline int pud_large(pud_t pud) } #endif /* CONFIG_PGTABLE_LEVELS > 2 */ +static inline unsigned long pud_index(unsigned long address) +{ + return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); +} + +static inline unsigned long p4d_index(unsigned long address) +{ + return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); +} + #if CONFIG_PGTABLE_LEVELS > 3 static inline int pgd_present(pgd_t pgd) { @@ -788,11 +809,6 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. 
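[ Editor's illustration: the five-level walk that call sites converge on once these helpers exist; it matches the kmap_get_fixmap_pte() rewrite later in this series. The wrapper name is hypothetical and the *_none()/*_large() checks a real walker needs are omitted. ]

static pte_t *sketch_lookup_pte(unsigned long vaddr)
{
	pgd_t *pgd = pgd_offset_k(vaddr);
	p4d_t *p4d = p4d_offset(pgd, vaddr);	/* the new level */
	pud_t *pud = pud_offset(p4d, vaddr);	/* now takes a p4d_t * */
	pmd_t *pmd = pmd_offset(pud, vaddr);

	return pte_offset_kernel(pmd, vaddr);
}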
*/ -static inline unsigned long pud_index(unsigned long address) -{ - return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); -} - static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) { return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 3a264200c62f..0b2797e5083c 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -13,6 +13,7 @@ typedef unsigned long pteval_t; typedef unsigned long pmdval_t; typedef unsigned long pudval_t; +typedef unsigned long p4dval_t; typedef unsigned long pgdval_t; typedef unsigned long pgprotval_t; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 62484333673d..df08535f774a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -272,9 +272,20 @@ static inline pgdval_t pgd_flags(pgd_t pgd) return native_pgd_val(pgd) & PTE_FLAGS_MASK; } -#if CONFIG_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 4 + +#error FIXME + +#else #include +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return native_pgd_val(p4d); +} +#endif + +#if CONFIG_PGTABLE_LEVELS > 3 typedef struct { pudval_t pud; } pud_t; static inline pud_t native_make_pud(pmdval_t val) @@ -318,6 +329,22 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) } #endif +static inline p4dval_t p4d_pfn_mask(p4d_t p4d) +{ + /* No 512 GiB huge pages yet */ + return PTE_PFN_MASK; +} + +static inline p4dval_t p4d_flags_mask(p4d_t p4d) +{ + return ~p4d_pfn_mask(p4d); +} + +static inline p4dval_t p4d_flags(p4d_t p4d) +{ + return native_p4d_val(p4d) & p4d_flags_mask(p4d); +} + static inline pudval_t pud_pfn_mask(pud_t pud) { if (native_pud_val(pud) & _PAGE_PSE) @@ -461,6 +488,7 @@ enum pg_level { PG_LEVEL_4K, PG_LEVEL_2M, PG_LEVEL_1G, + PG_LEVEL_512G, PG_LEVEL_NUM }; -- cgit v1.2.3 From e0c4f6750e130541cca7390739d25feb522acfff Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 13 Mar 2017 17:33:05 +0300 Subject: x86/mm: Convert trivial cases of page table walk to 5-level paging This patch only covers simple cases. Less trivial cases will be converted with separate patches. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tboot.c | 6 +++++- arch/x86/kernel/vm86_32.c | 6 +++++- arch/x86/mm/fault.c | 39 +++++++++++++++++++++++++++++++++------ arch/x86/mm/init_32.c | 22 ++++++++++++++++------ arch/x86/mm/ioremap.c | 3 ++- arch/x86/mm/pgtable.c | 4 +++- arch/x86/mm/pgtable_32.c | 8 +++++++- arch/x86/platform/efi/efi_64.c | 13 +++++++++---- arch/x86/power/hibernate_32.c | 7 +++++-- 9 files changed, 85 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index b868fa1b812b..5db0f33cbf2c 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -118,12 +118,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pgprot_t prot) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; pgd = pgd_offset(&tboot_mm, vaddr); - pud = pud_alloc(&tboot_mm, pgd, vaddr); + p4d = p4d_alloc(&tboot_mm, pgd, vaddr); + if (!p4d) + return -1; + pud = pud_alloc(&tboot_mm, p4d, vaddr); if (!pud) return -1; pmd = pmd_alloc(&tboot_mm, pud, vaddr); diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 23ee89ce59a9..62597c300d94 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -164,6 +164,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) struct vm_area_struct *vma; spinlock_t *ptl; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -173,7 +174,10 @@ static void mark_screen_rdonly(struct mm_struct *mm) pgd = pgd_offset(mm, 0xA0000); if (pgd_none_or_clear_bad(pgd)) goto out; - pud = pud_offset(pgd, 0xA0000); + p4d = p4d_offset(pgd, 0xA0000); + if (p4d_none_or_clear_bad(p4d)) + goto out; + pud = pud_offset(p4d, 0xA0000); if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 428e31763cb9..605fd5e8e048 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -253,6 +253,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) { unsigned index = pgd_index(address); pgd_t *pgd_k; + p4d_t *p4d, *p4d_k; pud_t *pud, *pud_k; pmd_t *pmd, *pmd_k; @@ -265,10 +266,15 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) /* * set_pgd(pgd, *pgd_k); here would be useless on PAE * and redundant with the set_pmd() on non-PAE. As would - * set_pud. + * set_p4d/set_pud. 
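[ Editor's note: on configurations with fewer than five paging levels the p4d level is folded, which is why setting it here would be just as redundant as the set_pud() case mentioned in the comment. A sketch of the folded accessor, along the lines of the generic no-p4d helpers - an assumption, not code from this series: ]

static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
	/* Folded case: a single p4d entry aliases the pgd entry, so the
	 * p4d_offset() steps added in these hunks cost nothing there. */
	return (p4d_t *)pgd;
}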
*/ - pud = pud_offset(pgd, address); - pud_k = pud_offset(pgd_k, address); + p4d = p4d_offset(pgd, address); + p4d_k = p4d_offset(pgd_k, address); + if (!p4d_present(*p4d_k)) + return NULL; + + pud = pud_offset(p4d, address); + pud_k = pud_offset(p4d_k, address); if (!pud_present(*pud_k)) return NULL; @@ -384,6 +390,8 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(address)]; + p4d_t *p4d; + pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -392,7 +400,9 @@ static void dump_pagetable(unsigned long address) if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd)) goto out; #endif - pmd = pmd_offset(pud_offset(pgd, address), address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); + pmd = pmd_offset(pud, address); printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd)); /* @@ -526,6 +536,7 @@ static void dump_pagetable(unsigned long address) { pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK); pgd_t *pgd = base + pgd_index(address); + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -538,7 +549,15 @@ static void dump_pagetable(unsigned long address) if (!pgd_present(*pgd)) goto out; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (bad_address(p4d)) + goto bad; + + printk("P4D %lx ", p4d_val(*p4d)); + if (!p4d_present(*p4d) || p4d_large(*p4d)) + goto out; + + pud = pud_offset(p4d, address); if (bad_address(pud)) goto bad; @@ -1082,6 +1101,7 @@ static noinline int spurious_fault(unsigned long error_code, unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1104,7 +1124,14 @@ spurious_fault(unsigned long error_code, unsigned long address) if (!pgd_present(*pgd)) return 0; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_large(*p4d)) + return spurious_fault_check(error_code, (pte_t *) p4d); + + pud = pud_offset(p4d, address); if (!pud_present(*pud)) return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b4b53e6793f..5ed3c141bbd5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -67,6 +67,7 @@ bool __read_mostly __vmalloc_start_set = false; */ static pmd_t * __init one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -75,13 +76,15 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); return pmd_table; } #endif - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); return pmd_table; @@ -390,8 +393,11 @@ pte_t *kmap_pte; static inline pte_t *kmap_get_fixmap_pte(unsigned long vaddr) { - return pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), - vaddr), vaddr), vaddr); + pgd_t *pgd = pgd_offset_k(vaddr); + p4d_t *p4d = p4d_offset(pgd, vaddr); + pud_t *pud = pud_offset(p4d, vaddr); + pmd_t *pmd = pmd_offset(pud, vaddr); + return pte_offset_kernel(pmd, vaddr); } static void __init kmap_init(void) @@ -410,6 +416,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) { unsigned long vaddr; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -418,7 +425,8 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) page_table_range_init(vaddr, 
vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; @@ -450,6 +458,7 @@ void __init native_pagetable_init(void) { unsigned long pfn, va; pgd_t *pgd, *base = swapper_pg_dir; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -469,7 +478,8 @@ void __init native_pagetable_init(void) if (!pgd_present(*pgd)) break; - pud = pud_offset(pgd, va); + p4d = p4d_offset(pgd, va); + pud = pud_offset(p4d, va); pmd = pmd_offset(pud, va); if (!pmd_present(*pmd)) break; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7aaa2635862d..a5e1cda85974 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -425,7 +425,8 @@ static inline pmd_t * __init early_ioremap_pmd(unsigned long addr) /* Don't assume we're using swapper_pg_dir at this point */ pgd_t *base = __va(read_cr3()); pgd_t *pgd = &base[pgd_index(addr)]; - pud_t *pud = pud_offset(pgd, addr); + p4d_t *p4d = p4d_offset(pgd, addr); + pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); return pmd; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6cbdff26bb96..38b6daf72deb 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -261,13 +261,15 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) { + p4d_t *p4d; pud_t *pud; int i; if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ return; - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); for (i = 0; i < PREALLOCATED_PMDS; i++, pud++) { pmd_t *pmd = pmds[i]; diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 9adce776852b..3d275a791c76 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -26,6 +26,7 @@ unsigned int __VMALLOC_RESERVE = 128 << 20; void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -35,7 +36,12 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) BUG(); return; } - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + if (p4d_none(*p4d)) { + BUG(); + return; + } + pud = pud_offset(p4d, vaddr); if (pud_none(*pud)) { BUG(); return; diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index a4695da42d77..8544dae3d1b4 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -166,6 +166,7 @@ void efi_sync_low_kernel_mappings(void) { unsigned num_entries; pgd_t *pgd_k, *pgd_efi; + p4d_t *p4d_k, *p4d_efi; pud_t *pud_k, *pud_efi; if (efi_enabled(EFI_OLD_MEMMAP)) @@ -197,16 +198,20 @@ void efi_sync_low_kernel_mappings(void) BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); pgd_efi = efi_pgd + pgd_index(EFI_VA_END); - pud_efi = pud_offset(pgd_efi, 0); + p4d_efi = p4d_offset(pgd_efi, 0); + pud_efi = pud_offset(p4d_efi, 0); pgd_k = pgd_offset_k(EFI_VA_END); - pud_k = pud_offset(pgd_k, 0); + p4d_k = p4d_offset(pgd_k, 0); + pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); - pud_efi = pud_offset(pgd_efi, EFI_VA_START); - pud_k = pud_offset(pgd_k, EFI_VA_START); + p4d_efi = p4d_offset(pgd_efi, EFI_VA_START); + pud_efi = pud_offset(p4d_efi, EFI_VA_START); + p4d_k = p4d_offset(pgd_k, EFI_VA_START); + pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = 
PTRS_PER_PUD - pud_index(EFI_VA_START); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 9f14bd34581d..c35fdb585c68 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -32,6 +32,7 @@ pgd_t *resume_pg_dir; */ static pmd_t *resume_one_md_table_init(pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd_table; @@ -41,11 +42,13 @@ static pmd_t *resume_one_md_table_init(pgd_t *pgd) return NULL; set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); BUG_ON(pmd_table != pmd_offset(pud, 0)); #else - pud = pud_offset(pgd, 0); + p4d = p4d_offset(pgd, 0); + pud = pud_offset(p4d, 0); pmd_table = pmd_offset(pud, 0); #endif -- cgit v1.2.3 From 0318e5abe1c0933b8bf6763a1a0d3caec4f0826d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 13 Mar 2017 17:33:06 +0300 Subject: x86/mm/gup: Add 5-level paging support Extend get_user_pages_fast() to handle an additional page table level. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 1f3b6ef105cd..456dfdfd2249 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -76,9 +76,9 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } /* - * 'pteval' can come from a pte, pmd or pud. We only check + * 'pteval' can come from a pte, pmd, pud or p4d. We only check * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. + * same value on all 4 types. */ static inline int pte_allows_gup(unsigned long pteval, int write) { @@ -295,13 +295,13 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, return 1; } -static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&pgd, addr); + pudp = pud_offset(&p4d, addr); do { pud_t pud = *pudp; @@ -320,6 +320,27 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, return 1; } +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = *p4dp; + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_large(p4d)); + if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + /* * Like get_user_pages_fast() except its IRQ-safe in that it won't fall * back to the regular GUP. 
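[ Editor's illustration: a typical in-kernel call into the function being extended here, assuming the four-argument signature visible in these hunks; the user address uaddr is hypothetical and error handling is abbreviated. ]

	struct page *pages[16];
	int i, got;

	got = get_user_pages_fast(uaddr, 16, 1 /* write */, pages);
	if (got > 0) {
		/* ... operate on the pinned pages ... */
		for (i = 0; i < got; i++)
			put_page(pages[i]);
	}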
@@ -368,7 +389,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) break; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) break; } while (pgdp++, addr = next, addr != end); local_irq_restore(flags); @@ -440,7 +461,7 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, next = pgd_addr_end(addr, end); if (pgd_none(pgd)) goto slow; - if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) goto slow; } while (pgdp++, addr = next, addr != end); local_irq_enable(); -- cgit v1.2.3 From ea3b5e60ce804403ca019039d6331368521348de Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 13 Mar 2017 17:33:07 +0300 Subject: x86/mm/ident_map: Add 5-level paging support Add additional page table level handing. It's mostly mechanical. The only quirk is that with p4d folded, 'pgd' is equal to 'p4d' in kernel_ident_mapping_init(). The pgd entry has to point to the pud page table in this case. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/ident_map.c | 51 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 44 insertions(+), 7 deletions(-) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index 4473cb4f8b90..04210a29dd60 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -45,6 +45,34 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, return 0; } +static int ident_p4d_init(struct x86_mapping_info *info, p4d_t *p4d_page, + unsigned long addr, unsigned long end) +{ + unsigned long next; + + for (; addr < end; addr = next) { + p4d_t *p4d = p4d_page + p4d_index(addr); + pud_t *pud; + + next = (addr & P4D_MASK) + P4D_SIZE; + if (next > end) + next = end; + + if (p4d_present(*p4d)) { + pud = pud_offset(p4d, 0); + ident_pud_init(info, pud, addr, next); + continue; + } + pud = (pud_t *)info->alloc_pgt_page(info->context); + if (!pud) + return -ENOMEM; + ident_pud_init(info, pud, addr, next); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); + } + + return 0; +} + int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, unsigned long pstart, unsigned long pend) { @@ -55,27 +83,36 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page, for (; addr < end; addr = next) { pgd_t *pgd = pgd_page + pgd_index(addr); - pud_t *pud; + p4d_t *p4d; next = (addr & PGDIR_MASK) + PGDIR_SIZE; if (next > end) next = end; if (pgd_present(*pgd)) { - pud = pud_offset(pgd, 0); - result = ident_pud_init(info, pud, addr, next); + p4d = p4d_offset(pgd, 0); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; continue; } - pud = (pud_t *)info->alloc_pgt_page(info->context); - if (!pud) + p4d = (p4d_t *)info->alloc_pgt_page(info->context); + if (!p4d) return -ENOMEM; - result = ident_pud_init(info, pud, addr, next); + result = ident_p4d_init(info, p4d, addr, next); if (result) return result; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + 
set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* + * With p4d folded, pgd is equal to p4d. + * The pgd entry has to point to the pud page table in this case. + */ + pud_t *pud = pud_offset(p4d, 0); + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + } } return 0; -- cgit v1.2.3 From b50858ce3e2a25a7f4638464e857853fbfc81823 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 13 Mar 2017 17:33:08 +0300 Subject: x86/mm/vmalloc: Add 5-level paging support Modify vmalloc_fault() to handle additional page table level. With 4-level paging, copying happens on p4d level, as we have pgd_none() always false if p4d_t is folded. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 605fd5e8e048..8ad91a01cbc8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -435,6 +435,7 @@ void vmalloc_sync_all(void) static noinline int vmalloc_fault(unsigned long address) { pgd_t *pgd, *pgd_ref; + p4d_t *p4d, *p4d_ref; pud_t *pud, *pud_ref; pmd_t *pmd, *pmd_ref; pte_t *pte, *pte_ref; @@ -458,17 +459,37 @@ static noinline int vmalloc_fault(unsigned long address) if (pgd_none(*pgd)) { set_pgd(pgd, *pgd_ref); arch_flush_lazy_mmu_mode(); - } else { + } else if (CONFIG_PGTABLE_LEVELS > 4) { + /* + * With folded p4d, pgd_none() is always false, so the pgd may + * point to an empty page table entry and pgd_page_vaddr() + * will return garbage. + * + * We will do the correct sanity check on the p4d level. + */ BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); } + /* With 4-level paging, copying happens on the p4d level. */ + p4d = p4d_offset(pgd, address); + p4d_ref = p4d_offset(pgd_ref, address); + if (p4d_none(*p4d_ref)) + return -1; + + if (p4d_none(*p4d)) { + set_p4d(p4d, *p4d_ref); + arch_flush_lazy_mmu_mode(); + } else { + BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref)); + } + /* * Below here mismatches are bugs because these lower tables * are shared: */ - pud = pud_offset(pgd, address); - pud_ref = pud_offset(pgd_ref, address); + pud = pud_offset(p4d, address); + pud_ref = pud_offset(p4d_ref, address); if (pud_none(*pud_ref)) return -1; -- cgit v1.2.3 From 06c830a48346643e195801460dfe16d96ba4dff5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 13 Mar 2017 17:33:09 +0300 Subject: x86/power: Add 5-level paging support set_up_temporary_text_mapping() and relocate_restore_code() require adjustments to handle additional page table level. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170313143309.16020-7-kirill.shutemov@linux.intel.com [ Minor readability edits. 
] Signed-off-by: Ingo Molnar --- arch/x86/power/hibernate_64.c | 47 ++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index ded2e8272382..2a9f993bbbf0 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -49,6 +49,7 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) { pmd_t *pmd; pud_t *pud; + p4d_t *p4d; /* * The new mapping only has to cover the page containing the image @@ -63,6 +64,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) * the virtual address space after switching over to the original page * tables used by the image kernel. */ + + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + p4d = (p4d_t *)get_safe_page(GFP_ATOMIC); + if (!p4d) + return -ENOMEM; + } + pud = (pud_t *)get_safe_page(GFP_ATOMIC); if (!pud) return -ENOMEM; @@ -75,8 +83,13 @@ static int set_up_temporary_text_mapping(pgd_t *pgd) __pmd((jump_address_phys & PMD_MASK) | __PAGE_KERNEL_LARGE_EXEC)); set_pud(pud + pud_index(restore_jump_address), __pud(__pa(pmd) | _KERNPG_TABLE)); - set_pgd(pgd + pgd_index(restore_jump_address), - __pgd(__pa(pud) | _KERNPG_TABLE)); + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_p4d(p4d + p4d_index(restore_jump_address), __p4d(__pa(pud) | _KERNPG_TABLE)); + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(p4d) | _KERNPG_TABLE)); + } else { + /* No p4d for 4-level paging: point the pgd to the pud page table */ + set_pgd(pgd + pgd_index(restore_jump_address), __pgd(__pa(pud) | _KERNPG_TABLE)); + } return 0; } @@ -124,7 +137,10 @@ static int set_up_temporary_mappings(void) static int relocate_restore_code(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; + pmd_t *pmd; + pte_t *pte; relocated_restore_code = get_safe_page(GFP_ATOMIC); if (!relocated_restore_code) @@ -134,22 +150,25 @@ static int relocate_restore_code(void) /* Make the page containing the relocated code executable */ pgd = (pgd_t *)__va(read_cr3()) + pgd_index(relocated_restore_code); - pud = pud_offset(pgd, relocated_restore_code); + p4d = p4d_offset(pgd, relocated_restore_code); + if (p4d_large(*p4d)) { + set_p4d(p4d, __p4d(p4d_val(*p4d) & ~_PAGE_NX)); + goto out; + } + pud = pud_offset(p4d, relocated_restore_code); if (pud_large(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); - } else { - pmd_t *pmd = pmd_offset(pud, relocated_restore_code); - - if (pmd_large(*pmd)) { - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); - } else { - pte_t *pte = pte_offset_kernel(pmd, relocated_restore_code); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); - } + goto out; + } + pmd = pmd_offset(pud, relocated_restore_code); + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_NX)); + goto out; } + pte = pte_offset_kernel(pmd, relocated_restore_code); + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_NX)); +out: __flush_tlb_all(); - return 0; } -- cgit v1.2.3 From e13b73dd9c8003993b171173ba803363faf74238 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 14 Mar 2017 14:41:26 +0300 Subject: x86/hugetlb: Adjust to the new native/compat mmap bases Commit 1b028f784e8c introduced two mmap() bases for 32-bit syscalls and for 64-bit syscalls. The mmap() code in x86 was modified to handle the separation, but the patch series missed to update the hugetlb code. As a consequence a 32bit application mapping a file on hugetlbfs uses the 64-bit mmap base for address space allocation, which fails. 
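[ Editor's illustration: rough numbers behind the reported failure; the values are typical defaults and are an assumption, not taken from the report. ]

	/*
	 * 32-bit task on a 64-bit kernel, after the mmap base split:
	 *   task size (3GB split)           ~ 0x00000000c0000000
	 *   mm->mmap_legacy_base (64-bit)   ~ 0x00002aaaaaaab000 + rnd
	 *   mm->mmap_compat_legacy_base     ~ 0x0000000040000000 + rnd
	 * The hugetlb bottom-up path still started its search at
	 * mmap_legacy_base with the 3GB TASK_SIZE as the upper limit, i.e.
	 * the lower bound sat far above the upper bound, so
	 * vm_unmapped_area() could not find anything and returned an error.
	 */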
Adjust the hugetlb mapping code to use the proper bases depending on the syscall invocation mode (64-bit or compat). [ tglx: Massaged changelog and switched from asm/compat.h to linux/compat.h ] Fixes: commit 1b028f784e8c ("x86/mm: Introduce mmap_compat_base() for 32-bit mmap()") Reported-by: kernel test robot Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170314114126.9280-1-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/elf.h | 1 + arch/x86/kernel/sys_x86_64.c | 12 ------------ arch/x86/mm/hugetlbpage.c | 9 ++++++--- arch/x86/mm/mmap.c | 14 ++++++++++++++ 4 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ac5be5ba8527..d4d3ed456cb7 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -305,6 +305,7 @@ static inline int mmap_is_ia32(void) extern unsigned long tasksize_32bit(void); extern unsigned long tasksize_64bit(void); +extern unsigned long get_mmap_base(int is_legacy); #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 63e89dfc808a..207b8f2582c7 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -100,18 +100,6 @@ out: return error; } -static unsigned long get_mmap_base(int is_legacy) -{ - struct mm_struct *mm = current->mm; - -#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES - if (in_compat_syscall()) - return is_legacy ? mm->mmap_compat_legacy_base - : mm->mmap_compat_base; -#endif - return is_legacy ? mm->mmap_legacy_base : mm->mmap_base; -} - static void find_start_end(unsigned long flags, unsigned long *begin, unsigned long *end) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index c5066a260803..302f43fd9c28 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -12,10 +12,12 @@ #include #include #include +#include #include #include #include #include +#include #if 0 /* This is just for testing */ struct page * @@ -82,8 +84,9 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, info.flags = 0; info.length = len; - info.low_limit = current->mm->mmap_legacy_base; - info.high_limit = TASK_SIZE; + info.low_limit = get_mmap_base(1); + info.high_limit = in_compat_syscall() ? + tasksize_32bit() : tasksize_64bit(); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; return vm_unmapped_area(&info); @@ -100,7 +103,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; + info.high_limit = get_mmap_base(0); info.align_mask = PAGE_MASK & ~huge_page_mask(h); info.align_offset = 0; addr = vm_unmapped_area(&info); diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 529ab79800af..19ad095b41df 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -30,6 +30,7 @@ #include #include #include +#include #include struct va_alignment __read_mostly va_align = { @@ -153,6 +154,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm) #endif } +unsigned long get_mmap_base(int is_legacy) +{ + struct mm_struct *mm = current->mm; + +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + if (in_compat_syscall()) { + return is_legacy ? mm->mmap_compat_legacy_base + : mm->mmap_compat_base; + } +#endif + return is_legacy ? 
mm->mmap_legacy_base : mm->mmap_base; +} + const char *arch_vma_name(struct vm_area_struct *vma) { if (vma->vm_flags & VM_MPX) -- cgit v1.2.3 From f06bdd4001c257792c54dce9427399f2896470af Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 14 Mar 2017 10:05:06 -0700 Subject: x86/mm: Adapt MODULES_END based on fixmap section size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch aligns MODULES_END to the beginning of the fixmap section. It optimizes the space available for both sections. The address is pre-computed based on the number of pages required by the fixmap section. It will allow GDT remapping in the fixmap section. The current MODULES_END static address does not provide enough space for the kernel to support a large number of processors. Signed-off-by: Thomas Garnier Cc: Alexander Potapenko Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Chris Wilson Cc: Christian Borntraeger Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Jiri Kosina Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Len Brown Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Luis R . Rodriguez Cc: Matt Fleming Cc: Michal Hocko Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Pavel Machek Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rafael J . Wysocki Cc: Rusty Russell Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tim Chen Cc: Vitaly Kuznetsov Cc: kasan-dev@googlegroups.com Cc: kernel-hardening@lists.openwall.com Cc: kvm@vger.kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-pm@vger.kernel.org Cc: xen-devel@lists.xenproject.org Cc: zijun_hu Link: http://lkml.kernel.org/r/20170314170508.100882-1-thgarnie@google.com [ Small build fix. ] Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 5 ++++- arch/x86/include/asm/pgtable_64_types.h | 3 ++- arch/x86/kernel/module.c | 1 + arch/x86/mm/dump_pagetables.c | 1 + arch/x86/mm/kasan_init_64.c | 1 + mm/vmalloc.c | 4 ++++ 6 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 5724092db811..ee3f9c30957c 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -19,7 +19,7 @@ ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space ... unused hole ... ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space +ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole @@ -39,6 +39,9 @@ memory window (this size is arbitrary, it can be raised later if needed). The mappings are not part of any other kernel PGD and are only available during EFI runtime calls. +The module mapping space size changes based on the CONFIG requirements for the +following fixmap section. + Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. 
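[ Editor's note: how the new MODULES_END falls out of the fixmap layout. __fix_to_virt() is the standard fixmap helper (FIXADDR_TOP minus index << PAGE_SHIFT); treat the expansion below as an assumption, it is not spelled out in this patch. ]

	/* MODULES_END = __fix_to_virt(__end_of_fixed_addresses + 1)
	 *             = FIXADDR_TOP - ((__end_of_fixed_addresses + 1) << PAGE_SHIFT)
	 * i.e. the module area now ends just below the fixmap region with a
	 * page of slack, so adding fixmap entries (such as the per-CPU GDT
	 * slots in the next patch) shrinks the module space instead of
	 * colliding with it. */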
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 0b2797e5083c..516593e66bd6 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -68,7 +68,8 @@ typedef struct { pteval_t pte; } pte_t; #endif /* CONFIG_RANDOMIZE_MEMORY */ #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) -#define MODULES_END _AC(0xffffffffff000000, UL) +/* The module sections ends with the start of the fixmap */ +#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) #define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 477ae806c2fa..fad61caac75e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include #include #include +#include #if 0 #define DEBUGP(fmt, ...) \ diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 58b5bee7ea27..75efeecc85eb 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -20,6 +20,7 @@ #include #include +#include /* * The dumper groups pagetable entries of the same type into one, and for diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8d63d7a104c3..1bde19ef86bd 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -9,6 +9,7 @@ #include #include +#include extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_X_MAX]; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0dd80222b20b..b7d2a23349f4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -36,6 +36,10 @@ #include #include +#ifdef CONFIG_X86 +# include +#endif + #include "internal.h" struct vfree_deferred { -- cgit v1.2.3 From 69218e47994da614e7af600bf06887750ab6657a Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 14 Mar 2017 10:05:07 -0700 Subject: x86: Remap GDT tables in the fixmap section MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each processor holds a GDT in its per-cpu structure. The sgdt instruction gives the base address of the current GDT. This address can be used to bypass KASLR memory randomization. With another bug, an attacker could target other per-cpu structures or deduce the base of the main memory section (PAGE_OFFSET). This patch relocates the GDT table for each processor inside the fixmap section. The space is reserved based on number of supported processors. For consistency, the remapping is done by default on 32 and 64-bit. Each processor switches to its remapped GDT at the end of initialization. For hibernation, the main processor returns with the original GDT and switches back to the remapping at completion. This patch was tested on both architectures. Hibernation and KVM were both tested specially for their usage of the GDT. Thanks to Boris Ostrovsky for testing and recommending changes for Xen support. Signed-off-by: Thomas Garnier Cc: Alexander Potapenko Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Chris Wilson Cc: Christian Borntraeger Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Jiri Kosina Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Len Brown Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Luis R . 
Rodriguez Cc: Matt Fleming Cc: Michal Hocko Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Pavel Machek Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rafael J . Wysocki Cc: Rusty Russell Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tim Chen Cc: Vitaly Kuznetsov Cc: kasan-dev@googlegroups.com Cc: kernel-hardening@lists.openwall.com Cc: kvm@vger.kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-pm@vger.kernel.org Cc: xen-devel@lists.xenproject.org Cc: zijun_hu Link: http://lkml.kernel.org/r/20170314170508.100882-2-thgarnie@google.com Signed-off-by: Ingo Molnar --- arch/x86/entry/vdso/vma.c | 2 +- arch/x86/include/asm/desc.h | 58 ++++++++++++++++++++++++++++++++--- arch/x86/include/asm/fixmap.h | 4 +++ arch/x86/include/asm/processor.h | 1 + arch/x86/include/asm/stackprotector.h | 2 +- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/apm_32.c | 6 ++-- arch/x86/kernel/cpu/common.c | 29 ++++++++++++++++-- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/platform/efi/efi_32.c | 4 +-- arch/x86/power/cpu.c | 7 +++-- arch/x86/xen/enlighten.c | 5 ++- arch/x86/xen/mmu.c | 1 + arch/x86/xen/smp.c | 2 +- drivers/lguest/x86/core.c | 6 ++-- drivers/pnp/pnpbios/bioscalls.c | 10 +++--- 17 files changed, 114 insertions(+), 29 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 226ca70dc6bd..5c5d4d7618e6 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -354,7 +354,7 @@ static void vgetcpu_cpu_init(void *arg) d.p = 1; /* Present */ d.d = 1; /* 32-bit */ - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S); } static int vgetcpu_online(unsigned int cpu) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 1548ca92ad3f..4b5ef0c64291 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,7 @@ extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; extern const struct desc_ptr debug_idt_descr; extern gate_desc debug_idt_table[]; +extern pgprot_t pg_fixmap_gdt_flags; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -45,11 +47,57 @@ struct gdt_page { DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); -static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) +/* Provide the original GDT */ +static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) { return per_cpu(gdt_page, cpu).gdt; } +static inline unsigned long get_cpu_gdt_rw_vaddr(unsigned int cpu) +{ + return (unsigned long)get_cpu_gdt_rw(cpu); +} + +/* Provide the current original GDT */ +static inline struct desc_struct *get_current_gdt_rw(void) +{ + return this_cpu_ptr(&gdt_page)->gdt; +} + +static inline unsigned long get_current_gdt_rw_vaddr(void) +{ + return (unsigned long)get_current_gdt_rw(); +} + +/* Get the fixmap index for a specific processor */ +static inline unsigned int get_cpu_gdt_ro_index(int cpu) +{ + return FIX_GDT_REMAP_BEGIN + cpu; +} + +/* Provide the fixmap address of the remapped GDT */ +static inline struct desc_struct *get_cpu_gdt_ro(int cpu) +{ + unsigned int idx = get_cpu_gdt_ro_index(cpu); + return (struct desc_struct *)__fix_to_virt(idx); +} + +static inline unsigned long get_cpu_gdt_ro_vaddr(int cpu) +{ + return (unsigned long)get_cpu_gdt_ro(cpu); +} + +/* Provide the current read-only GDT */ +static 
inline struct desc_struct *get_current_gdt_ro(void) +{ + return get_cpu_gdt_ro(smp_processor_id()); +} + +static inline unsigned long get_current_gdt_ro_vaddr(void) +{ + return (unsigned long)get_current_gdt_ro(); +} + #ifdef CONFIG_X86_64 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, @@ -174,7 +222,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned t static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) { - struct desc_struct *d = get_cpu_gdt_table(cpu); + struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS, @@ -194,7 +242,7 @@ static inline void native_set_ldt(const void *addr, unsigned int entries) set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT, entries * LDT_ENTRY_SIZE - 1); - write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_LDT, &ldt, DESC_LDT); asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); } @@ -209,7 +257,7 @@ DECLARE_PER_CPU(bool, __tss_limit_invalid); static inline void force_reload_TR(void) { - struct desc_struct *d = get_cpu_gdt_table(smp_processor_id()); + struct desc_struct *d = get_current_gdt_rw(); tss_desc tss; memcpy(&tss, &d[GDT_ENTRY_TSS], sizeof(tss_desc)); @@ -288,7 +336,7 @@ static inline unsigned long native_store_tr(void) static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); + struct desc_struct *gdt = get_cpu_gdt_rw(cpu); unsigned int i; for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 8554f960e21b..b65155cc3760 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -100,6 +100,10 @@ enum fixed_addresses { #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif + /* Fixmap entries to remap the GDTs, one per processor. 
*/ + FIX_GDT_REMAP_BEGIN, + FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + __end_of_permanent_fixed_addresses, /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7caa2ac50ea2..1150e1b21b0d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -716,6 +716,7 @@ extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); +extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 58505f01962f..dcbd9bcce714 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -87,7 +87,7 @@ static inline void setup_stack_canary_segment(int cpu) { #ifdef CONFIG_X86_32 unsigned long canary = (unsigned long)&per_cpu(stack_canary, cpu); - struct desc_struct *gdt_table = get_cpu_gdt_table(cpu); + struct desc_struct *gdt_table = get_cpu_gdt_rw(cpu); struct desc_struct desc; desc = gdt_table[GDT_ENTRY_STACK_CANARY]; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 48587335ede8..ed014814ea35 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -101,7 +101,7 @@ int x86_acpi_suspend_lowlevel(void) #ifdef CONFIG_SMP initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = - (unsigned long)get_cpu_gdt_table(smp_processor_id()); + (unsigned long)get_cpu_gdt_rw(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5a414545e8a3..446b0d3d4932 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -609,7 +609,7 @@ static long __apm_bios_call(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -685,7 +685,7 @@ static long __apm_bios_call_simple(void *_call) cpu = get_cpu(); BUG_ON(cpu != 0); - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); save_desc_40 = gdt[0x40 / 8]; gdt[0x40 / 8] = bad_bios_desc; @@ -2352,7 +2352,7 @@ static int __init apm_init(void) * Note we only set APM segments on CPU zero, since we pin the APM * code to that CPU. */ - gdt = get_cpu_gdt_table(0); + gdt = get_cpu_gdt_rw(0); set_desc_base(&gdt[APM_CS >> 3], (unsigned long)__va((unsigned long)apm_info.bios.cseg << 4)); set_desc_base(&gdt[APM_CS_16 >> 3], diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 58094a1f9e9d..3cf1590ec9ce 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -448,6 +448,26 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } +/* Used by XEN to force the GDT read-only when required */ +pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL; + +/* Setup the fixmap mapping only once per-processor */ +static inline void setup_fixmap_gdt(int cpu) +{ + __set_fixmap(get_cpu_gdt_ro_index(cpu), + __pa(get_cpu_gdt_rw(cpu)), pg_fixmap_gdt_flags); +} + +/* Load a fixmap remapping of the per-cpu GDT */ +void load_fixmap_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_ro(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} + /* * Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. 
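Aside, not part of the patch: the scheme wired up here boils down to two virtual views of the same per-cpu GDT page — the writable per-cpu mapping the kernel keeps using for descriptor updates (get_cpu_gdt_rw()), and a fixmap alias at a fixed address which is what load_fixmap_gdt() actually puts into GDTR (get_cpu_gdt_ro()). The Linux-only user-space sketch below shows the same aliasing idea with made-up names (memfd_create() needs a reasonably recent glibc):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int fd = memfd_create("gdt-demo", 0);	/* backing page, like the per-cpu gdt_page */

	if (fd < 0 || ftruncate(fd, page) < 0)
		return 1;

	/* writable view: stands in for get_cpu_gdt_rw() */
	char *rw = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	/* read-only view at a second address: stands in for the fixmap alias */
	char *ro = mmap(NULL, page, PROT_READ, MAP_SHARED, fd, 0);

	if (rw == MAP_FAILED || ro == MAP_FAILED)
		return 1;

	strcpy(rw, "descriptor written through the RW mapping");
	printf("RO alias at %p sees: %s\n", (void *)ro, ro);	/* same backing page */
	/*
	 * Writing through 'ro' would fault, which is why a later patch in this
	 * series has to teach the LTR path about the read-only GDT on 64-bit.
	 */
	return 0;
}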
@@ -456,11 +476,10 @@ void switch_to_new_gdt(int cpu) { struct desc_ptr gdt_descr; - gdt_descr.address = (long)get_cpu_gdt_table(cpu); + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); /* Reload the per-cpu base */ - load_percpu_segment(cpu); } @@ -1526,6 +1545,9 @@ void cpu_init(void) if (is_uv_system()) uv_cpu_init(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #else @@ -1581,6 +1603,9 @@ void cpu_init(void) dbg_restore_debug_regs(); fpu__init_cpu(); + + setup_fixmap_gdt(cpu); + load_fixmap_gdt(cpu); } #endif diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 9820d6d977c6..11338b0b3ad2 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -160,7 +160,7 @@ static inline void setup_percpu_segment(int cpu) pack_descriptor(&gdt, per_cpu_offset(cpu), 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); gdt.s = 1; - write_gdt_entry(get_cpu_gdt_table(cpu), + write_gdt_entry(get_cpu_gdt_rw(cpu), GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index bd1f1ad35284..f04479a8f74f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -983,7 +983,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) unsigned long timeout; idle->thread.sp = (unsigned long)task_pt_regs(idle); - early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); + early_gdt_descr.address = (unsigned long)get_cpu_gdt_rw(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index cef39b097649..950071171436 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void) load_cr3(initial_page_table); __flush_tlb_all(); - gdt_descr.address = __pa(get_cpu_gdt_table(0)); + gdt_descr.address = __pa(get_cpu_gdt_rw(0)); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); @@ -79,7 +79,7 @@ void __init efi_call_phys_epilog(pgd_t *save_pgd) { struct desc_ptr gdt_descr; - gdt_descr.address = (unsigned long)get_cpu_gdt_table(0); + gdt_descr.address = (unsigned long)get_cpu_gdt_rw(0); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 66ade16c7693..6b05a9219ea2 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -95,7 +95,7 @@ static void __save_processor_state(struct saved_context *ctxt) * 'pmode_gdt' in wakeup_start. 
*/ ctxt->gdt_desc.size = GDT_SIZE - 1; - ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); + ctxt->gdt_desc.address = (unsigned long)get_cpu_gdt_rw(smp_processor_id()); store_tr(ctxt->tr); @@ -162,7 +162,7 @@ static void fix_processor_context(void) int cpu = smp_processor_id(); struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 - struct desc_struct *desc = get_cpu_gdt_table(cpu); + struct desc_struct *desc = get_cpu_gdt_rw(cpu); tss_desc tss; #endif set_tss_desc(cpu, t); /* @@ -183,6 +183,9 @@ static void fix_processor_context(void) load_mm_ldt(current->active_mm); /* This does lldt */ fpu__resume_cpu(); + + /* The processor is back on the direct GDT, load back the fixmap */ + load_fixmap_gdt(cpu); } /** diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ec1d5c46e58f..08faa61de5f7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -710,7 +710,7 @@ static void load_TLS_descriptor(struct thread_struct *t, *shadow = t->tls_array[i]; - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]); mc = __xen_mc_entry(0); @@ -1545,6 +1545,9 @@ asmlinkage __visible void __init xen_start_kernel(void) */ xen_initial_gdt = &per_cpu(gdt_page, 0); + /* GDT can only be remapped RO */ + pg_fixmap_gdt_flags = PAGE_KERNEL_RO; + xen_smp_init(); #ifdef CONFIG_ACPI_NUMA diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 37cb5aad71de..ebbfe00133f7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2326,6 +2326,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #endif case FIX_TEXT_POKE0: case FIX_TEXT_POKE1: + case FIX_GDT_REMAP_BEGIN ... FIX_GDT_REMAP_END: /* All local page mappings */ pte = pfn_pte(phys, prot); break; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7ff2f1bfb7ec..eaa36162ed4a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -392,7 +392,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) if (ctxt == NULL) return -ENOMEM; - gdt = get_cpu_gdt_table(cpu); + gdt = get_cpu_gdt_rw(cpu); #ifdef CONFIG_X86_32 ctxt->user_regs.fs = __KERNEL_PERCPU; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index d71f6323ac00..b4f79b923aea 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -504,7 +504,7 @@ void __init lguest_arch_host_init(void) * byte, not the size, hence the "-1"). */ state->host_gdt_desc.size = GDT_SIZE-1; - state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); + state->host_gdt_desc.address = (long)get_cpu_gdt_rw(i); /* * All CPUs on the Host use the same Interrupt Descriptor @@ -554,8 +554,8 @@ void __init lguest_arch_host_init(void) * The Host needs to be able to use the LGUEST segments on this * CPU, too, so put them in the Host GDT. 
*/ - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; - get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; + get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; + get_cpu_gdt_rw(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } /* diff --git a/drivers/pnp/pnpbios/bioscalls.c b/drivers/pnp/pnpbios/bioscalls.c index 438d4c72c7b3..ff563db025b3 100644 --- a/drivers/pnp/pnpbios/bioscalls.c +++ b/drivers/pnp/pnpbios/bioscalls.c @@ -54,7 +54,7 @@ __asm__(".text \n" #define Q2_SET_SEL(cpu, selname, address, size) \ do { \ - struct desc_struct *gdt = get_cpu_gdt_table((cpu)); \ + struct desc_struct *gdt = get_cpu_gdt_rw((cpu)); \ set_desc_base(&gdt[(selname) >> 3], (u32)(address)); \ set_desc_limit(&gdt[(selname) >> 3], (size) - 1); \ } while(0) @@ -95,8 +95,8 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3, return PNP_FUNCTION_NOT_SUPPORTED; cpu = get_cpu(); - save_desc_40 = get_cpu_gdt_table(cpu)[0x40 / 8]; - get_cpu_gdt_table(cpu)[0x40 / 8] = bad_bios_desc; + save_desc_40 = get_cpu_gdt_rw(cpu)[0x40 / 8]; + get_cpu_gdt_rw(cpu)[0x40 / 8] = bad_bios_desc; /* On some boxes IRQ's during PnP BIOS calls are deadly. */ spin_lock_irqsave(&pnp_bios_lock, flags); @@ -134,7 +134,7 @@ static inline u16 call_pnp_bios(u16 func, u16 arg1, u16 arg2, u16 arg3, :"memory"); spin_unlock_irqrestore(&pnp_bios_lock, flags); - get_cpu_gdt_table(cpu)[0x40 / 8] = save_desc_40; + get_cpu_gdt_rw(cpu)[0x40 / 8] = save_desc_40; put_cpu(); /* If we get here and this is set then the PnP BIOS faulted on us. */ @@ -477,7 +477,7 @@ void pnpbios_calls_init(union pnp_bios_install_struct *header) pnp_bios_callpoint.segment = PNP_CS16; for_each_possible_cpu(i) { - struct desc_struct *gdt = get_cpu_gdt_table(i); + struct desc_struct *gdt = get_cpu_gdt_rw(i); if (!gdt) continue; set_desc_base(&gdt[GDT_ENTRY_PNPBIOS_CS32], -- cgit v1.2.3 From 45fc8757d1d2128e342b4e7ef39adedf7752faac Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 14 Mar 2017 10:05:08 -0700 Subject: x86: Make the GDT remapping read-only on 64-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch makes the GDT remapped pages read-only, to prevent accidental (or intentional) corruption of this key data structure. This change is done only on 64-bit, because 32-bit needs it to be writable for TSS switches. The native_load_tr_desc function was adapted to correctly handle a read-only GDT. The LTR instruction always writes to the GDT TSS entry. This generates a page fault if the GDT is read-only. This change checks if the current GDT is a remap and swap GDTs as needed. This function was tested by booting multiple machines and checking hibernation works properly. KVM SVM and VMX were adapted to use the writeable GDT. On VMX, the per-cpu variable was removed for functions to fetch the original GDT. Instead of reloading the previous GDT, VMX will reload the fixmap GDT as expected. For testing, VMs were started and restored on multiple configurations. Signed-off-by: Thomas Garnier Cc: Alexander Potapenko Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Chris Wilson Cc: Christian Borntraeger Cc: Dmitry Vyukov Cc: Frederic Weisbecker Cc: Jiri Kosina Cc: Joerg Roedel Cc: Jonathan Corbet Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Kees Cook Cc: Len Brown Cc: Linus Torvalds Cc: Lorenzo Stoakes Cc: Luis R . 
Rodriguez Cc: Matt Fleming Cc: Michal Hocko Cc: Paolo Bonzini Cc: Paul Gortmaker Cc: Pavel Machek Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rafael J . Wysocki Cc: Rusty Russell Cc: Stanislaw Gruszka Cc: Thomas Gleixner Cc: Tim Chen Cc: Vitaly Kuznetsov Cc: kasan-dev@googlegroups.com Cc: kernel-hardening@lists.openwall.com Cc: kvm@vger.kernel.org Cc: lguest@lists.ozlabs.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Cc: linux-pm@vger.kernel.org Cc: xen-devel@lists.xenproject.org Cc: zijun_hu Link: http://lkml.kernel.org/r/20170314170508.100882-3-thgarnie@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 106 +++++++++++++++++++++++++-------------- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 28 ++++++++--- arch/x86/kvm/svm.c | 4 +- arch/x86/kvm/vmx.c | 12 ++--- 5 files changed, 96 insertions(+), 55 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4b5ef0c64291..ec05f9c1a62c 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -248,9 +248,77 @@ static inline void native_set_ldt(const void *addr, unsigned int entries) } } +static inline void native_load_gdt(const struct desc_ptr *dtr) +{ + asm volatile("lgdt %0"::"m" (*dtr)); +} + +static inline void native_load_idt(const struct desc_ptr *dtr) +{ + asm volatile("lidt %0"::"m" (*dtr)); +} + +static inline void native_store_gdt(struct desc_ptr *dtr) +{ + asm volatile("sgdt %0":"=m" (*dtr)); +} + +static inline void native_store_idt(struct desc_ptr *dtr) +{ + asm volatile("sidt %0":"=m" (*dtr)); +} + +/* + * The LTR instruction marks the TSS GDT entry as busy. On 64-bit, the GDT is + * a read-only remapping. To prevent a page fault, the GDT is switched to the + * original writeable version when needed. + */ +#ifdef CONFIG_X86_64 static inline void native_load_tr_desc(void) { + struct desc_ptr gdt; + int cpu = raw_smp_processor_id(); + bool restore = 0; + struct desc_struct *fixmap_gdt; + + native_store_gdt(&gdt); + fixmap_gdt = get_cpu_gdt_ro(cpu); + + /* + * If the current GDT is the read-only fixmap, swap to the original + * writeable version. Swap back at the end. 
+ */ + if (gdt.address == (unsigned long)fixmap_gdt) { + load_direct_gdt(cpu); + restore = 1; + } asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); + if (restore) + load_fixmap_gdt(cpu); +} +#else +static inline void native_load_tr_desc(void) +{ + asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); +} +#endif + +static inline unsigned long native_store_tr(void) +{ + unsigned long tr; + + asm volatile("str %0":"=r" (tr)); + + return tr; +} + +static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) +{ + struct desc_struct *gdt = get_cpu_gdt_rw(cpu); + unsigned int i; + + for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) + gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; } DECLARE_PER_CPU(bool, __tss_limit_invalid); @@ -305,44 +373,6 @@ static inline void invalidate_tss_limit(void) this_cpu_write(__tss_limit_invalid, true); } -static inline void native_load_gdt(const struct desc_ptr *dtr) -{ - asm volatile("lgdt %0"::"m" (*dtr)); -} - -static inline void native_load_idt(const struct desc_ptr *dtr) -{ - asm volatile("lidt %0"::"m" (*dtr)); -} - -static inline void native_store_gdt(struct desc_ptr *dtr) -{ - asm volatile("sgdt %0":"=m" (*dtr)); -} - -static inline void native_store_idt(struct desc_ptr *dtr) -{ - asm volatile("sidt %0":"=m" (*dtr)); -} - -static inline unsigned long native_store_tr(void) -{ - unsigned long tr; - - asm volatile("str %0":"=r" (tr)); - - return tr; -} - -static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) -{ - struct desc_struct *gdt = get_cpu_gdt_rw(cpu); - unsigned int i; - - for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) - gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; -} - /* This intentionally ignores lm, since 32-bit apps don't have that field. */ #define LDT_empty(info) \ ((info)->base_addr == 0 && \ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1150e1b21b0d..edf42c4ac8c8 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -716,6 +716,7 @@ extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); +extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3cf1590ec9ce..f8e22dbad86c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -448,8 +448,15 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* Used by XEN to force the GDT read-only when required */ +/* + * On 64-bit the GDT remapping is read-only. + * A global is used for Xen to change the default when required. 
+ */ +#ifdef CONFIG_X86_64 +pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL_RO; +#else pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL; +#endif /* Setup the fixmap mapping only once per-processor */ static inline void setup_fixmap_gdt(int cpu) @@ -458,6 +465,17 @@ static inline void setup_fixmap_gdt(int cpu) __pa(get_cpu_gdt_rw(cpu)), pg_fixmap_gdt_flags); } +/* Load the original GDT from the per-cpu structure */ +void load_direct_gdt(int cpu) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_rw(cpu); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); +} +EXPORT_SYMBOL_GPL(load_direct_gdt); + /* Load a fixmap remapping of the per-cpu GDT */ void load_fixmap_gdt(int cpu) { @@ -467,6 +485,7 @@ void load_fixmap_gdt(int cpu) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); } +EXPORT_SYMBOL_GPL(load_fixmap_gdt); /* * Current gdt points %fs at the "master" per-cpu area: after this, @@ -474,11 +493,8 @@ void load_fixmap_gdt(int cpu) */ void switch_to_new_gdt(int cpu) { - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_rw(cpu); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); + /* Load the original GDT */ + load_direct_gdt(cpu); /* Reload the per-cpu base */ load_percpu_segment(cpu); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1efe2c62b3f..c02b9af2056a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -741,7 +741,6 @@ static int svm_hardware_enable(void) struct svm_cpu_data *sd; uint64_t efer; - struct desc_ptr gdt_descr; struct desc_struct *gdt; int me = raw_smp_processor_id(); @@ -763,8 +762,7 @@ static int svm_hardware_enable(void) sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; sd->next_asid = sd->max_asid + 1; - native_store_gdt(&gdt_descr); - gdt = (struct desc_struct *)gdt_descr.address; + gdt = get_current_gdt_rw(); sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); wrmsrl(MSR_EFER, efer | EFER_SVME); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 98e82ee1e699..596a76d82b11 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -935,7 +935,6 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs); * when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it. 
*/ static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu); -static DEFINE_PER_CPU(struct desc_ptr, host_gdt); /* * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we @@ -2052,14 +2051,13 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset) */ static unsigned long segment_base(u16 selector) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); struct desc_struct *table; unsigned long v; if (!(selector & ~SEGMENT_RPL_MASK)) return 0; - table = (struct desc_struct *)gdt->address; + table = get_current_gdt_ro(); if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) { u16 ldt_selector = kvm_read_ldt(); @@ -2164,7 +2162,7 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - load_gdt(this_cpu_ptr(&host_gdt)); + load_fixmap_gdt(raw_smp_processor_id()); } static void vmx_load_host_state(struct vcpu_vmx *vmx) @@ -2266,7 +2264,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (!already_loaded) { - struct desc_ptr *gdt = this_cpu_ptr(&host_gdt); + unsigned long gdt = get_current_gdt_ro_vaddr(); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -2277,7 +2275,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ vmcs_writel(HOST_TR_BASE, (unsigned long)this_cpu_ptr(&cpu_tss)); - vmcs_writel(HOST_GDTR_BASE, gdt->address); + vmcs_writel(HOST_GDTR_BASE, gdt); /* 22.2.4 */ /* * VM exits change the host TR limit to 0x67 after a VM @@ -3465,8 +3463,6 @@ static int hardware_enable(void) ept_sync_global(); } - native_store_gdt(this_cpu_ptr(&host_gdt)); - return 0; } -- cgit v1.2.3 From f991376e444aee8f5643a45703c1433bf7948940 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Fri, 17 Mar 2017 10:50:34 -0700 Subject: x86/mm: Correct fixmap header usage on adaptable MODULES_END This patch removes fixmap header usage on non-x86 code that was introduced by the adaptable MODULE_END change. Signed-off-by: Thomas Garnier Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317175034.4701-1-thgarnie@google.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 1 + arch/x86/kernel/module.c | 1 - arch/x86/mm/dump_pagetables.c | 1 - arch/x86/mm/kasan_init_64.c | 1 - mm/vmalloc.c | 4 ---- 5 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 73c7ccc38912..67608d4abc2c 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -13,6 +13,7 @@ #include #include #include +#include extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index fad61caac75e..477ae806c2fa 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -35,7 +35,6 @@ #include #include #include -#include #if 0 #define DEBUGP(fmt, ...) 
\ diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 75efeecc85eb..58b5bee7ea27 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -20,7 +20,6 @@ #include #include -#include /* * The dumper groups pagetable entries of the same type into one, and for diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 1bde19ef86bd..8d63d7a104c3 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -9,7 +9,6 @@ #include #include -#include extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_X_MAX]; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index b7d2a23349f4..0dd80222b20b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -36,10 +36,6 @@ #include #include -#ifdef CONFIG_X86 -# include -#endif - #include "internal.h" struct vfree_deferred { -- cgit v1.2.3 From 9a804fecee232e71b47ac37d62fd3d5d66b08b91 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:49 +0300 Subject: mm/gup: Drop the arch_pte_access_permitted() MMU callback The only arch that defines it to something meaningful is x86. But x86 doesn't use the generic GUP_fast() implementation -- the only place where the callback is called. Let's drop it. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/mmu_context.h | 6 ------ arch/s390/include/asm/mmu_context.h | 6 ------ arch/um/include/asm/mmu_context.h | 6 ------ arch/unicore32/include/asm/mmu_context.h | 6 ------ arch/x86/include/asm/mmu_context.h | 4 ---- include/asm-generic/mm_hooks.h | 6 ------ mm/gup.c | 3 --- 7 files changed, 37 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index b9e3f0aca261..ecf9885ab660 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -163,11 +163,5 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* __KERNEL__ */ #endif /* __ASM_POWERPC_MMU_CONTEXT_H */ diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 6e31d87fb669..fa2bf69be182 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -156,10 +156,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* __S390_MMU_CONTEXT_H */ diff --git a/arch/um/include/asm/mmu_context.h b/arch/um/include/asm/mmu_context.h index 94ac2739918c..b668e351fd6c 100644 --- a/arch/um/include/asm/mmu_context.h +++ b/arch/um/include/asm/mmu_context.h @@ -37,12 +37,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return true; } -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} - /* * end 
asm-generic/mm_hooks.h functions */ diff --git a/arch/unicore32/include/asm/mmu_context.h b/arch/unicore32/include/asm/mmu_context.h index 62dfc644c908..59b06b48f27d 100644 --- a/arch/unicore32/include/asm/mmu_context.h +++ b/arch/unicore32/include/asm/mmu_context.h @@ -103,10 +103,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 306c7e12af55..68b329d77b3a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -268,8 +268,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, return __pkru_allows_pkey(vma_pkey(vma), write); } -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write); -} #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index cc5d9a1405df..41e5b6784b97 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* _ASM_GENERIC_MM_HOOKS_H */ diff --git a/mm/gup.c b/mm/gup.c index 04aa405350dc..3f2338ba3402 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1216,9 +1216,6 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, pte_protnone(pte) || (write && !pte_write(pte))) goto pte_unmap; - if (!arch_pte_access_permitted(pte, write)) - goto pte_unmap; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); head = compound_head(page); -- cgit v1.2.3 From e7884f8ead4a301b04687a3238527b06feef8ea0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:50 +0300 Subject: mm/gup: Move permission checks into helpers This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. On x86, we would need to do additional permission checks to determine if access is allowed. Let's abstract it out into separate helpers. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/asm-generic/pgtable.h | 25 +++++++++++++++++++++++++ mm/gup.c | 15 ++++++++++----- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1fad160f35de..7dfa767dc680 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte) } #endif +#ifndef pte_access_permitted +#define pte_access_permitted(pte, write) \ + (pte_present(pte) && (!(write) || pte_write(pte))) +#endif + +#ifndef pmd_access_permitted +#define pmd_access_permitted(pmd, write) \ + (pmd_present(pmd) && (!(write) || pmd_write(pmd))) +#endif + +#ifndef pud_access_permitted +#define pud_access_permitted(pud, write) \ + (pud_present(pud) && (!(write) || pud_write(pud))) +#endif + +#ifndef p4d_access_permitted +#define p4d_access_permitted(p4d, write) \ + (p4d_present(p4d) && (!(write) || p4d_write(p4d))) +#endif + +#ifndef pgd_access_permitted +#define pgd_access_permitted(pgd, write) \ + (pgd_present(pgd) && (!(write) || pgd_write(pgd))) +#endif + #ifndef __HAVE_ARCH_PMD_SAME #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) diff --git a/mm/gup.c b/mm/gup.c index 3f2338ba3402..a62a778ce4ec 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1212,8 +1212,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, * Similar to the PMD case below, NUMA hinting must take slow * path using the pte_protnone check. */ - if (!pte_present(pte) || pte_special(pte) || - pte_protnone(pte) || (write && !pte_write(pte))) + if (pte_protnone(pte)) + goto pte_unmap; + + if (!pte_access_permitted(pte, write)) + goto pte_unmap; + + if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1264,7 +1269,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, struct page *head, *page; int refs; - if (write && !pmd_write(orig)) + if (!pmd_access_permitted(orig, write)) return 0; refs = 0; @@ -1299,7 +1304,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, struct page *head, *page; int refs; - if (write && !pud_write(orig)) + if (!pud_access_permitted(orig, write)) return 0; refs = 0; @@ -1335,7 +1340,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, int refs; struct page *head, *page; - if (write && !pgd_write(orig)) + if (!pgd_access_permitted(orig, write)) return 0; refs = 0; -- cgit v1.2.3 From 0005d20b2ff1e501e186b3b3bc587085ac305fdc Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:51 +0300 Subject: mm/gup: Move page table entry dereference into helper function This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. On x86 PAE, page table entry is larger than sizeof(long) and we would need to provide a helper that can read the entry atomically. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index a62a778ce4ec..e83db38deb17 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1189,6 +1189,17 @@ struct page *get_dump_page(unsigned long addr) */ #ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifndef gup_get_pte +/* + * We assume that the PTE can be read atomically. If this is not the case for + * your architecture, please provide the helper. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} +#endif + #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) @@ -1198,14 +1209,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ptem = ptep = pte_offset_map(&pmd, addr); do { - /* - * In the line below we are assuming that the pte can be read - * atomically. If this is not the case for your architecture, - * please wrap this in a helper function! - * - * for an example see gup_get_pte in arch/x86/mm/gup.c - */ - pte_t pte = READ_ONCE(*ptep); + pte_t pte = gup_get_pte(ptep); struct page *head, *page; /* -- cgit v1.2.3 From e93480537fd7ecaf5ed1a662a979376f6fee50e3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:52 +0300 Subject: mm/gup: Mark all pages PageReferenced in generic get_user_pages_fast() This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. Unlike generic GUP_fast(), the x86 version makes all pages it touches referenced. It seems required for GRU and EPT. See the following commit: 8ee53820edfd ("thp: mmu_notifier_test_young") Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- mm/gup.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/gup.c b/mm/gup.c index e83db38deb17..2b6cd3573457 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1238,6 +1238,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } VM_BUG_ON_PAGE(compound_head(page) != head, page); + + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -1299,6 +1301,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1334,6 +1337,7 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1370,6 +1374,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } -- cgit v1.2.3 From b59f65fa076a8eac2ff3a8ab7f8e1705b9fa86cb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:53 +0300 Subject: mm/gup: Implement the dev_pagemap() logic in the generic get_user_pages_fast() function This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. 
Prepare generic GUP_fast() to handle dev_pagemap(). At the moment, it's only implemented on x86. On non-x86, the new code will be compiled out. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dan Williams Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/mm.h | 4 +++ mm/gup.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 89 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 5f01c88f0800..e197d3ca3e8a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -430,6 +430,10 @@ static inline int pud_devmap(pud_t pud) { return 0; } +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif /* diff --git a/mm/gup.c b/mm/gup.c index 2b6cd3573457..e3d1e80424f4 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1200,12 +1200,23 @@ static inline pte_t gup_get_pte(pte_t *ptep) } #endif +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; + int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; - int ret = 0; ptem = ptep = pte_offset_map(&pmd, addr); do { @@ -1222,7 +1233,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (!pte_access_permitted(pte, write)) goto pte_unmap; - if (pte_special(pte)) + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + goto pte_unmap; + } + } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1239,6 +1256,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON_PAGE(compound_head(page) != head, page); + put_dev_pagemap(pgmap); SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -1269,6 +1287,64 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ +#ifdef __HAVE_ARCH_PTE_DEVMAP +static int __gup_device_huge(unsigned long pfn, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + struct dev_pagemap *pgmap = NULL; + + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + return 
__gup_device_huge(fault_pfn, addr, end, pages, nr); +} +#else +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} +#endif + static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1278,6 +1354,9 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, if (!pmd_access_permitted(orig, write)) return 0; + if (pmd_devmap(orig)) + return __gup_device_huge_pmd(orig, addr, end, pages, nr); + refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1314,6 +1393,9 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, if (!pud_access_permitted(orig, write)) return 0; + if (pud_devmap(orig)) + return __gup_device_huge_pud(orig, addr, end, pages, nr); + refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -1351,6 +1433,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, if (!pgd_access_permitted(orig, write)) return 0; + BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); -- cgit v1.2.3 From 73e10a61817dfc97fe7418bfad1f608e562d7348 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 16 Mar 2017 18:26:54 +0300 Subject: mm/gup: Provide callback to check if __GUP_fast() is allowed for the range This is a preparation patch for the transition of x86 to the generic GUP_fast() implementation. On x86, get_user_pages_fast() does a couple of sanity checks to see if we can call __get_user_pages_fast() for the range. This kind of wrapping protection should be useful for the generic code too. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316152655.37789-7-kirill.shutemov@linux.intel.com [ Small readability edits. 
] Signed-off-by: Ingo Molnar --- mm/gup.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index e3d1e80424f4..527ec2c6cca3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1614,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } +#ifndef gup_fast_permitted +/* + * Check if it's allowed to use __get_user_pages_fast() for the range, or + * we need to fall back to the slow version: + */ +bool gup_fast_permitted(unsigned long start, int nr_pages, int write) +{ + unsigned long len, end; + + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + return end >= start; +} +#endif + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -1633,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - int nr, ret; + int nr = 0, ret = 0; start &= PAGE_MASK; - nr = __get_user_pages_fast(start, nr_pages, write, pages); - ret = nr; + + if (gup_fast_permitted(start, nr_pages, write)) { + nr = __get_user_pages_fast(start, nr_pages, write, pages); + ret = nr; + } if (nr < nr_pages) { /* Try to get the remaining pages with get_user_pages */ -- cgit v1.2.3 From 2947ba054a4dabbd82848728d765346886050029 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Mar 2017 00:39:06 +0300 Subject: x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation This patch provides all required callbacks required by the generic get_user_pages_fast() code and switches x86 over - and removes the platform specific implementation. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K . V Cc: Borislav Petkov Cc: Catalin Marinas Cc: Dann Frazier Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Steve Capper Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170316213906.89528-1-kirill.shutemov@linux.intel.com [ Minor readability edits. 
] Signed-off-by: Ingo Molnar --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 3 + arch/x86/include/asm/mmu_context.h | 12 - arch/x86/include/asm/pgtable-3level.h | 47 ++++ arch/x86/include/asm/pgtable.h | 53 ++++ arch/x86/include/asm/pgtable_64.h | 16 +- arch/x86/mm/Makefile | 2 +- arch/x86/mm/gup.c | 496 ---------------------------------- mm/Kconfig | 2 +- mm/gup.c | 10 +- 12 files changed, 128 insertions(+), 519 deletions(-) delete mode 100644 arch/x86/mm/gup.c diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 0d4e71b42c77..454fadd077ad 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1666,7 +1666,7 @@ config ARCH_SELECT_MEMORY_MODEL config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP def_bool y depends on ARM_LPAE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3741859765cf..af62bf79721a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY config ZONE_DMA def_bool y -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP def_bool y config ARCH_DMA_ADDR_T_64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 97a8bc8a095c..3a716b2dcde9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,7 +135,7 @@ config PPC select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS - select HAVE_GENERIC_RCU_GUP + select HAVE_GENERIC_GUP select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2bab9d093b51..8977d9c77373 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2788,6 +2788,9 @@ config X86_DMA_REMAP bool depends on STA2X11 +config HAVE_GENERIC_GUP + def_bool y + source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 68b329d77b3a..6e933d2d88d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -220,18 +220,6 @@ static inline int vma_pkey(struct vm_area_struct *vma) } #endif -static inline bool __pkru_allows_pkey(u16 pkey, bool write) -{ - u32 pkru = read_pkru(); - - if (!__pkru_allows_read(pkru, pkey)) - return false; - if (write && !__pkru_allows_write(pkru, pkey)) - return false; - - return true; -} - /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 72277b1028a5..29eb5778019c 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -215,4 +215,51 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) +#define gup_get_pte gup_get_pte +/* + * WARNING: only to be used in the get_user_pages_fast() implementation. + * + * With get_user_pages_fast(), we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atomically, + * but that is not possible (without expensive cmpxchg8b) on PAE. 
What + * we do have is the guarantee that a PTE will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees 'l' iff pte_high + * sees 'h'. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have gotten rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. Because get_user_pages_fast() only operates on present ptes + * we're safe. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + pte_t pte; + + do { + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + } while (unlikely(pte.pte_low != ptep->pte_low)); + + return pte; +} + #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6f6f351e0a81..160256bc9f2e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -233,6 +233,11 @@ static inline int pud_devmap(pud_t pud) return 0; } #endif + +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -1136,6 +1141,54 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags) #endif } +static inline bool __pkru_allows_pkey(u16 pkey, bool write) +{ + u32 pkru = read_pkru(); + + if (!__pkru_allows_read(pkru, pkey)) + return false; + if (write && !__pkru_allows_write(pkru, pkey)) + return false; + + return true; +} + +/* + * 'pteval' can come from a PTE, PMD or PUD. We only check + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the + * same value on all 3 types. 
+ */ +static inline bool __pte_access_permitted(unsigned long pteval, bool write) +{ + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + + if (write) + need_pte_bits |= _PAGE_RW; + + if ((pteval & need_pte_bits) != need_pte_bits) + return 0; + + return __pkru_allows_pkey(pte_flags_pkey(pteval), write); +} + +#define pte_access_permitted pte_access_permitted +static inline bool pte_access_permitted(pte_t pte, bool write) +{ + return __pte_access_permitted(pte_val(pte), write); +} + +#define pmd_access_permitted pmd_access_permitted +static inline bool pmd_access_permitted(pmd_t pmd, bool write) +{ + return __pte_access_permitted(pmd_val(pmd), write); +} + +#define pud_access_permitted pud_access_permitted +static inline bool pud_access_permitted(pud_t pud, bool write) +{ + return __pte_access_permitted(pud_val(pud), write); +} + #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 67608d4abc2c..13709cf74ab6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -207,6 +207,20 @@ extern void cleanup_highmap(void); extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); -#endif /* !__ASSEMBLY__ */ +#define gup_fast_permitted gup_fast_permitted +static inline bool gup_fast_permitted(unsigned long start, int nr_pages, + int write) +{ + unsigned long len, end; + + len = (unsigned long)nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + return false; + if (end >> __VIRTUAL_MASK_SHIFT) + return false; + return true; +} +#endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 96d2b847e09e..0fbdcb64f9f8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c deleted file mode 100644 index 456dfdfd2249..000000000000 --- a/arch/x86/mm/gup.c +++ /dev/null @@ -1,496 +0,0 @@ -/* - * Lockless get_user_pages_fast for x86 - * - * Copyright (C) 2008 Nick Piggin - * Copyright (C) 2008 Novell Inc. - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -static inline pte_t gup_get_pte(pte_t *ptep) -{ -#ifndef CONFIG_X86_PAE - return READ_ONCE(*ptep); -#else - /* - * With get_user_pages_fast, we walk down the pagetables without taking - * any locks. For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a pte will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees l iff pte_high - * sees h. 
We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have got rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. And get_user_pages_fast only operates on present ptes, so - * we're safe. - * - * gup_get_pte should not be used or copied outside gup.c without being - * very careful -- it does not atomically load the pte or anything that - * is likely to be useful for you. - */ - pte_t pte; - -retry: - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - if (unlikely(pte.pte_low != ptep->pte_low)) - goto retry; - - return pte; -#endif -} - -static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) -{ - while ((*nr) - nr_start) { - struct page *page = pages[--(*nr)]; - - ClearPageReferenced(page); - put_page(page); - } -} - -/* - * 'pteval' can come from a pte, pmd, pud or p4d. We only check - * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 4 types. - */ -static inline int pte_allows_gup(unsigned long pteval, int write) -{ - unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; - - if (write) - need_pte_bits |= _PAGE_RW; - - if ((pteval & need_pte_bits) != need_pte_bits) - return 0; - - /* Check memory protection keys permissions. */ - if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) - return 0; - - return 1; -} - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct dev_pagemap *pgmap = NULL; - int nr_start = *nr, ret = 0; - pte_t *ptep, *ptem; - - /* - * Keep the original mapped PTE value (ptem) around since we - * might increment ptep off the end of the page when finishing - * our loop iteration. 
- */ - ptem = ptep = pte_offset_map(&pmd, addr); - do { - pte_t pte = gup_get_pte(ptep); - struct page *page; - - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - break; - - if (!pte_allows_gup(pte_val(pte), write)) - break; - - if (pte_devmap(pte)) { - pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - break; - } - } else if (pte_special(pte)) - break; - - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - get_page(page); - put_dev_pagemap(pgmap); - SetPageReferenced(page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - if (addr == end) - ret = 1; - pte_unmap(ptem); - - return ret; -} - -static inline void get_head_page_multiple(struct page *page, int nr) -{ - VM_BUG_ON_PAGE(page != compound_head(page), page); - VM_BUG_ON_PAGE(page_count(page) == 0, page); - page_ref_add(page, nr); - SetPageReferenced(page); -} - -static int __gup_device_huge(unsigned long pfn, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - int nr_start = *nr; - struct dev_pagemap *pgmap = NULL; - - do { - struct page *page = pfn_to_page(pfn); - - pgmap = get_dev_pagemap(pfn, pgmap); - if (unlikely(!pgmap)) { - undo_dev_pagemap(nr, nr_start, pages); - return 0; - } - SetPageReferenced(page); - pages[*nr] = page; - get_page(page); - put_dev_pagemap(pgmap); - (*nr)++; - pfn++; - } while (addr += PAGE_SIZE, addr != end); - return 1; -} - -static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static int __gup_device_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, struct page **pages, int *nr) -{ - unsigned long fault_pfn; - - fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - return __gup_device_huge(fault_pfn, addr, end, pages, nr); -} - -static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pmd_val(pmd), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); - if (pmd_devmap(pmd)) - return __gup_device_huge_pmd(pmd, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp; - - pmdp = pmd_offset(&pud, addr); - do { - pmd_t pmd = *pmdp; - - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. 
- */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pte_range(pmd, addr, next, write, pages, nr)) - return 0; - } - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static noinline int gup_huge_pud(pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - int refs; - - if (!pte_allows_gup(pud_val(pud), write)) - return 0; - - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - if (pud_devmap(pud)) - return __gup_device_huge_pud(pud, addr, end, pages, nr); - - /* hugepages are never "special" */ - VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - get_head_page_multiple(head, refs); - - return 1; -} - -static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp; - - pudp = pud_offset(&p4d, addr); - do { - pud_t pud = *pudp; - - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pud, addr, next, write, pages, nr)) - return 0; - } else { - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; - } - } while (pudp++, addr = next, addr != end); - - return 1; -} - -static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, - int write, struct page **pages, int *nr) -{ - unsigned long next; - p4d_t *p4dp; - - p4dp = p4d_offset(&pgd, addr); - do { - p4d_t p4d = *p4dp; - - next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) - return 0; - BUILD_BUG_ON(p4d_large(p4d)); - if (!gup_pud_range(p4d, addr, next, write, pages, nr)) - return 0; - } while (p4dp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - unsigned long flags; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - (void __user *)start, len))) - return 0; - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. 
- */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next; - pgd_t *pgdp; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - - end = start + len; - if (end < start) - goto slow_irqon; - -#ifdef CONFIG_X86_64 - if (end >> __VIRTUAL_MASK_SHIFT) - goto slow_irqon; -#endif - - /* - * XXX: batch / limit 'nr', to avoid large irq off latency - * needs some instrumenting to determine the common sizes used by - * important workloads (eg. DB2), and whether limiting the batch size - * will decrease performance. - * - * It seems like we're in the clear for the moment. Direct-IO is - * the main guy that batches up lots of get_user_pages, and even - * they are limited to 64-at-a-time which is not so many. - */ - /* - * This doesn't prevent pagetable teardown, but does prevent - * the pagetables and pages from being freed on x86. - * - * So long as we atomically load page table pointers versus teardown - * (which we do on x86, with the above PAE exception), we can follow the - * address down to the the page and take a ref on it. - */ - local_irq_disable(); - pgdp = pgd_offset(mm, addr); - do { - pgd_t pgd = *pgdp; - - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - goto slow; - if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) - goto slow; - } while (pgdp++, addr = next, addr != end); - local_irq_enable(); - - VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); - return nr; - - { - int ret; - -slow: - local_irq_enable(); -slow_irqon: - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - - ret = get_user_pages_unlocked(start, - (end - start) >> PAGE_SHIFT, - pages, write ? 
FOLL_WRITE : 0); - - /* Have to be a bit careful with return values */ - if (nr > 0) { - if (ret < 0) - ret = nr; - else - ret += nr; - } - - return ret; - } -} diff --git a/mm/Kconfig b/mm/Kconfig index 9b8fccb969dc..c89f472b658c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_RCU_GUP +config HAVE_GENERIC_GUP bool config ARCH_DISCARD_MEMBLOCK diff --git a/mm/gup.c b/mm/gup.c index 527ec2c6cca3..2559a3987de7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1155,7 +1155,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic RCU Fast GUP + * Generic Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1176,8 +1176,8 @@ struct page *get_dump_page(unsigned long addr) * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free - * pages containing page tables. + * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to + * free pages containing page tables or TLB flushing requires IPI broadcast. * * *) ptes can be read atomically by the architecture. * @@ -1187,7 +1187,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifdef CONFIG_HAVE_GENERIC_GUP #ifndef gup_get_pte /* @@ -1677,4 +1677,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, return ret; } -#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ +#endif /* CONFIG_HAVE_GENERIC_GUP */ -- cgit v1.2.3 From 5b781c7e317fcf9f74475dc82bfce2e359dfca13 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 18 Mar 2017 22:17:24 -0700 Subject: x86/tls: Forcibly set the accessed bit in TLS segments For mysterious historical reasons, struct user_desc doesn't indicate whether segments are accessed. set_thread_area() has always programmed segments as non-accessed, so the first write will set the accessed bit. This will fault if the GDT is read-only. Fix it by making TLS segments start out accessed. If this ends up breaking something, we could, in principle, leave TLS segments non-accessed and fix them up when we get the page fault. I'd be surprised, though -- AFAIK all the nasty legacy segmented programs (DOSEMU, Wine, things that run on DOSEMU and Wine, etc.) do their nasty segmented things using the LDT and not the GDT. I assume this is mainly because old OSes (Linux and otherwise) didn't historically provide APIs to do nasty things in the GDT. 
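To make the failure mode concrete, here is a minimal user-space sketch -- not part of this patch, with illustrative field values -- of the sequence that used to trip over the read-only GDT: set_thread_area() installs a non-accessed TLS descriptor, and the CPU's first load of that selector writes the accessed bit back into the GDT behind the kernel's back.

#include <asm/ldt.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct user_desc desc;
	unsigned short sel;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number = -1;		/* let the kernel pick a free TLS slot */
	desc.limit = 0xfffff;		/* flat 4GB data segment */
	desc.seg_32bit = 1;
	desc.limit_in_pages = 1;
	desc.useable = 1;

	if (syscall(SYS_set_thread_area, &desc) != 0) {
		perror("set_thread_area");
		return 1;
	}

	/*
	 * Loading the selector is the first "use" of the segment.  With a
	 * non-accessed descriptor the CPU must set the accessed bit, i.e.
	 * write into the GDT -- which faults once the GDT mapping is
	 * read-only.  %es is used here only because nothing in userspace
	 * depends on it.
	 */
	sel = (desc.entry_number << 3) | 3;
	asm volatile ("movw %0, %%es" : : "r" (sel));

	printf("loaded TLS selector 0x%hx\n", sel);
	return 0;
}

Starting the descriptor out accessed costs nothing -- the bit carries no information the kernel uses for TLS entries -- and it removes the implicit descriptor write the CPU would otherwise make to the GDT page.
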
Fixes: 45fc8757d1d2 ("x86: Make the GDT remapping read-only on 64-bit") Signed-off-by: Andy Lutomirski Cc: Linus Torvalds Cc: Borislav Petkov Cc: Thomas Garnier Link: http://lkml.kernel.org/r/62b7748542df0164af7e0a5231283b9b13858c45.1489900519.git.luto@kernel.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tls.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 6c8934406dc9..dcd699baea1b 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -92,10 +92,17 @@ static void set_tls_desc(struct task_struct *p, int idx, cpu = get_cpu(); while (n-- > 0) { - if (LDT_empty(info) || LDT_zero(info)) + if (LDT_empty(info) || LDT_zero(info)) { desc->a = desc->b = 0; - else + } else { fill_ldt(desc, info); + + /* + * Always set the accessed bit so that the CPU + * doesn't try to write to the (read-only) GDT. + */ + desc->type |= 1; + } ++info; ++desc; } -- cgit v1.2.3 From ef37bc361442545a5be3c56c49a08c3153032127 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Tue, 21 Mar 2017 08:17:25 +0100 Subject: x86/headers: Simplify asm/fixmap.h inclusion into asm/pgtable*.h Instead of including fixmap.h twice in pgtable_32.h and pgtable_64.h, include it only once, in the common asm/pgtable.h header. Signed-off-by: Thomas Garnier Cc: Alexander Potapenko Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Chris Wilson Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Hugh Dickins Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Wilcox Cc: Michal Hocko Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Xiao Guangrong Cc: kasan-dev@googlegroups.com Cc: kernel-hardening@lists.openwall.com Cc: linux-mm@kvack.org Cc: richard.weiyang@gmail.com Cc: zijun_hu Link: http://lkml.kernel.org/r/20170321071725.GA15782@gmail.com [ Generated this patch from two other patches and wrote changelog. ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable.h | 1 + arch/x86/include/asm/pgtable_32.h | 1 - arch/x86/include/asm/pgtable_64.h | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 160256bc9f2e..18a6f5460461 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -603,6 +603,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #include #include #include +#include static inline int pte_none(pte_t pte) { diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h index fbc73360aea0..bfab55675c16 100644 --- a/arch/x86/include/asm/pgtable_32.h +++ b/arch/x86/include/asm/pgtable_32.h @@ -14,7 +14,6 @@ */ #ifndef __ASSEMBLY__ #include -#include #include #include diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 13709cf74ab6..1a4bc71534d4 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -13,7 +13,6 @@ #include #include #include -#include extern pud_t level3_kernel_pgt[512]; extern pud_t level3_ident_pgt[512]; -- cgit v1.2.3 From 65973dd3fd31151823f4b8c289eebbb3fb7e6bc0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Mar 2017 14:32:29 -0700 Subject: selftests/x86/ldt_gdt_32: Work around a glibc sigaction() bug i386 glibc is buggy and calls the sigaction syscall incorrectly. This is asymptomatic for normal programs, but it blows up on programs that do evil things with segmentation. The ldt_gdt self-test is an example of such an evil program. 
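To see what glibc actually handed to the kernel, the kernel-side state can be read back with a raw rt_sigaction() call, bypassing glibc entirely. A hedged diagnostic sketch follows; the structure layout mirrors the fake_ksigaction used by the workaround below and is an assumption about the x86 kernel ABI, not glibc's struct sigaction:

#include <err.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef SA_RESTORER
#define SA_RESTORER 0x04000000
#endif

/* Kernel-side sigaction layout as assumed here (matches the selftest). */
struct raw_ksigaction {
	void *handler;
	unsigned long sa_flags;
	void (*sa_restorer)(void);
	unsigned char sigset[8];
};

static void dummy(int sig)
{
	(void)sig;
}

int main(void)
{
	struct raw_ksigaction ksa;

	signal(SIGUSR1, dummy);		/* let glibc install a handler */

	/* Ask the kernel, not glibc, what is really registered. */
	if (syscall(SYS_rt_sigaction, SIGUSR1, NULL, &ksa, sizeof(ksa.sigset)))
		err(1, "rt_sigaction");

	if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer)
		printf("garbage sa_restorer %p without SA_RESTORER\n",
		       (void *)ksa.sa_restorer);
	else
		printf("sa_flags/sa_restorer are consistent\n");
	return 0;
}

That inconsistent flag/pointer combination is exactly what the workaround below clears before the test starts fiddling with SS.
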
This doesn't appear to be a regression -- I think I just got lucky with the uninitialized memory that glibc threw at the kernel when I wrote the test. This hackish fix manually issues sigaction(2) syscalls to undo the damage. Without the fix, ldt_gdt_32 segfaults; with the fix, it passes for me. See: https://sourceware.org/bugzilla/show_bug.cgi?id=21269 Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Garnier Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/aaab0f9f93c9af25396f01232608c163a760a668.1490218061.git.luto@kernel.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/ldt_gdt.c | 46 +++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c index f6121612e769..b9a22f18566a 100644 --- a/tools/testing/selftests/x86/ldt_gdt.c +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -409,6 +409,51 @@ static void *threadproc(void *ctx) } } +#ifdef __i386__ + +#ifndef SA_RESTORE +#define SA_RESTORER 0x04000000 +#endif + +/* + * The UAPI header calls this 'struct sigaction', which conflicts with + * glibc. Sigh. + */ +struct fake_ksigaction { + void *handler; /* the real type is nasty */ + unsigned long sa_flags; + void (*sa_restorer)(void); + unsigned char sigset[8]; +}; + +static void fix_sa_restorer(int sig) +{ + struct fake_ksigaction ksa; + + if (syscall(SYS_rt_sigaction, sig, NULL, &ksa, 8) == 0) { + /* + * glibc has a nasty bug: it sometimes writes garbage to + * sa_restorer. This interacts quite badly with anything + * that fiddles with SS because it can trigger legacy + * stack switching. Patch it up. See: + * + * https://sourceware.org/bugzilla/show_bug.cgi?id=21269 + */ + if (!(ksa.sa_flags & SA_RESTORER) && ksa.sa_restorer) { + ksa.sa_restorer = NULL; + if (syscall(SYS_rt_sigaction, sig, &ksa, NULL, + sizeof(ksa.sigset)) != 0) + err(1, "rt_sigaction"); + } + } +} +#else +static void fix_sa_restorer(int sig) +{ + /* 64-bit glibc works fine. */ +} +#endif + static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), int flags) { @@ -420,6 +465,7 @@ static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), if (sigaction(sig, &sa, 0)) err(1, "sigaction"); + fix_sa_restorer(sig); } static jmp_buf jmpbuf; -- cgit v1.2.3 From aa4ea675528f3fa11e9663e5a32f55a81c34dcac Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Mar 2017 14:32:30 -0700 Subject: x86/gdt: Fix setup_fixmap_gdt() to use the correct PA __pa() cannot be used on percpu pointers because they may be virtually mapped. Use per_cpu_ptr_to_phys() instead. This fixes a boot crash on a some 32-bit configurations. I assume this is related to which allocation strategy is chosen by the percpu core. Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Garnier Cc: Thomas Gleixner Fixes: 69218e47994d x86: ("Remap GDT tables in the fixmap section") Link: http://lkml.kernel.org/r/22e0069c29fba31998f193201e359eebfdac4960.1490218061.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index ec05f9c1a62c..bde11696b893 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -98,6 +98,12 @@ static inline unsigned long get_current_gdt_ro_vaddr(void) return (unsigned long)get_current_gdt_ro(); } +/* Provide the physical address of the GDT page. */ +static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) +{ + return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu)); +} + #ifdef CONFIG_X86_64 static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f8e22dbad86c..f6e20e2dbfa5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -461,8 +461,8 @@ pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL; /* Setup the fixmap mapping only once per-processor */ static inline void setup_fixmap_gdt(int cpu) { - __set_fixmap(get_cpu_gdt_ro_index(cpu), - __pa(get_cpu_gdt_rw(cpu)), pg_fixmap_gdt_flags); + __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), + pg_fixmap_gdt_flags); } /* Load the original GDT from the per-cpu structure */ -- cgit v1.2.3 From 3fa1cabbc3b61224ef33d3ca4a1a96998529bc68 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Mar 2017 14:32:31 -0700 Subject: x86/efi/32: Fix EFI on systems where the per-cpu GDT is virtually mapped __pa() on a per-cpu pointer is invalid. This bug appears to go *waaay* back, and I guess it's just never been triggered. Signed-off-by: Andy Lutomirski Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Garnier Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/5ba1d3ffca85e1a5b3ac99265ebe55df4cf0dbe4.1490218061.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 950071171436..3481268da3d0 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -68,7 +68,7 @@ pgd_t * __init efi_call_phys_prolog(void) load_cr3(initial_page_table); __flush_tlb_all(); - gdt_descr.address = __pa(get_cpu_gdt_rw(0)); + gdt_descr.address = get_cpu_gdt_paddr(0); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); -- cgit v1.2.3 From 23b2a4ddebdd17fad265b4bb77256c2e4ec37dee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Mar 2017 14:32:32 -0700 Subject: x86/boot/32: Defer resyncing initial_page_table until per-cpu is set up The x86 smpboot trampoline expects initial_page_table to have the GDT mapped. If the GDT ends up in a virtually mapped per-cpu page, then it won't be in the page tables at all until perc-pu areas are set up. The result will be a triple fault the first time that the CPU attempts to access the GDT after LGDT loads the perc-pu GDT. 
This appears to be an old bug, but somehow the GDT fixmap rework is triggering it. This seems to have something to do with the memory layout. Signed-off-by: Andy Lutomirski Cc: Ard Biesheuvel Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Garnier Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/a553264a5972c6a86f9b5caac237470a0c74a720.1490218061.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 15 --------------- arch/x86/kernel/setup_percpu.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4bf0c8926a1c..56b1177155db 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1226,21 +1226,6 @@ void __init setup_arch(char **cmdline_p) kasan_init(); -#ifdef CONFIG_X86_32 - /* sync back kernel address range */ - clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); - - /* - * sync back low identity map too. It is used for example - * in the 32-bit EFI stub. - */ - clone_pgd_range(initial_page_table, - swapper_pg_dir + KERNEL_PGD_BOUNDARY, - min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); -#endif - tboot_probe(); map_vsyscall(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 11338b0b3ad2..bb1e8cc0bc84 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -288,4 +288,25 @@ void __init setup_per_cpu_areas(void) /* Setup cpu initialized, callin, callout masks */ setup_cpu_local_masks(); + +#ifdef CONFIG_X86_32 + /* + * Sync back kernel address range. We want to make sure that + * all kernel mappings, including percpu mappings, are available + * in the smpboot asm. We can't reliably pick up percpu + * mappings using vmalloc_fault(), because exception dispatch + * needs percpu data. + */ + clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + KERNEL_PGD_PTRS); + + /* + * sync back low identity map too. It is used for example + * in the 32-bit EFI stub. + */ + clone_pgd_range(initial_page_table, + swapper_pg_dir + KERNEL_PGD_BOUNDARY, + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); +#endif } -- cgit v1.2.3 From 59c58ceb29d0f030eddb36a3a9dbadcc499786a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Mar 2017 14:32:33 -0700 Subject: x86/gdt: Get rid of the get_*_gdt_*_vaddr() helpers There's a single caller that is only there because it's passing a pointer into a function (vmcs_writel()) that takes an unsigned long. Let's just cast it in place rather than having a bunch of trivial helpers. Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Garnier Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/46108fb35e1699252b1b6a85039303ff562c9836.1490218061.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 20 -------------------- arch/x86/kvm/vmx.c | 4 ++-- 2 files changed, 2 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index bde11696b893..17cb46e8a184 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -53,22 +53,12 @@ static inline struct desc_struct *get_cpu_gdt_rw(unsigned int cpu) return per_cpu(gdt_page, cpu).gdt; } -static inline unsigned long get_cpu_gdt_rw_vaddr(unsigned int cpu) -{ - return (unsigned long)get_cpu_gdt_rw(cpu); -} - /* Provide the current original GDT */ static inline struct desc_struct *get_current_gdt_rw(void) { return this_cpu_ptr(&gdt_page)->gdt; } -static inline unsigned long get_current_gdt_rw_vaddr(void) -{ - return (unsigned long)get_current_gdt_rw(); -} - /* Get the fixmap index for a specific processor */ static inline unsigned int get_cpu_gdt_ro_index(int cpu) { @@ -82,22 +72,12 @@ static inline struct desc_struct *get_cpu_gdt_ro(int cpu) return (struct desc_struct *)__fix_to_virt(idx); } -static inline unsigned long get_cpu_gdt_ro_vaddr(int cpu) -{ - return (unsigned long)get_cpu_gdt_ro(cpu); -} - /* Provide the current read-only GDT */ static inline struct desc_struct *get_current_gdt_ro(void) { return get_cpu_gdt_ro(smp_processor_id()); } -static inline unsigned long get_current_gdt_ro_vaddr(void) -{ - return (unsigned long)get_current_gdt_ro(); -} - /* Provide the physical address of the GDT page. */ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 596a76d82b11..3acde663dc58 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2264,7 +2264,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (!already_loaded) { - unsigned long gdt = get_current_gdt_ro_vaddr(); + void *gdt = get_current_gdt_ro(); unsigned long sysenter_esp; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); @@ -2275,7 +2275,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) */ vmcs_writel(HOST_TR_BASE, (unsigned long)this_cpu_ptr(&cpu_tss)); - vmcs_writel(HOST_GDTR_BASE, gdt); /* 22.2.4 */ + vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */ /* * VM exits change the host TR limit to 0x67 after a VM -- cgit v1.2.3 From b23adb7d3f7d1d7cce03db9704de67a99ceeda38 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 22 Mar 2017 14:32:34 -0700 Subject: x86/xen/gdt: Use X86_FEATURE_XENPV instead of globals for the GDT fixup Xen imposes special requirements on the GDT. Rather than using a global variable for the pgprot, just use an explicit special case for Xen -- this makes it clearer what's going on. It also debloats 64-bit kernels very slightly. Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Garnier Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e9ea96abbfd6a8c87753849171bb5987ecfeb523.1490218061.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 1 - arch/x86/kernel/cpu/common.c | 28 +++++++++++++++++----------- arch/x86/xen/enlighten.c | 3 --- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 17cb46e8a184..d0a21b12dd58 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -39,7 +39,6 @@ extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; extern const struct desc_ptr debug_idt_descr; extern gate_desc debug_idt_table[]; -extern pgprot_t pg_fixmap_gdt_flags; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f6e20e2dbfa5..8ee32119144d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -448,21 +448,27 @@ void load_percpu_segment(int cpu) load_stack_canary_segment(); } -/* - * On 64-bit the GDT remapping is read-only. - * A global is used for Xen to change the default when required. - */ +/* Setup the fixmap mapping only once per-processor */ +static inline void setup_fixmap_gdt(int cpu) +{ #ifdef CONFIG_X86_64 -pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL_RO; + /* On 64-bit systems, we use a read-only fixmap GDT. */ + pgprot_t prot = PAGE_KERNEL_RO; #else -pgprot_t pg_fixmap_gdt_flags = PAGE_KERNEL; + /* + * On native 32-bit systems, the GDT cannot be read-only because + * our double fault handler uses a task gate, and entering through + * a task gate needs to change an available TSS to busy. If the GDT + * is read-only, that will triple fault. + * + * On Xen PV, the GDT must be read-only because the hypervisor requires + * it. + */ + pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? + PAGE_KERNEL_RO : PAGE_KERNEL; #endif -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) -{ - __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), - pg_fixmap_gdt_flags); + __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); } /* Load the original GDT from the per-cpu structure */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 08faa61de5f7..4951fcf95143 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1545,9 +1545,6 @@ asmlinkage __visible void __init xen_start_kernel(void) */ xen_initial_gdt = &per_cpu(gdt_page, 0); - /* GDT can only be remapped RO */ - pg_fixmap_gdt_flags = PAGE_KERNEL_RO; - xen_smp_init(); #ifdef CONFIG_ACPI_NUMA -- cgit v1.2.3 From 7f68904182e2f346c11b0acd74048181dc6615bb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Mar 2017 21:55:10 +0300 Subject: x86/kexec: Add 5-level paging support Handle additional page table level in the kexec code. Signed-off-by: Kirill A. Shutemov Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317185515.8636-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/kexec.h | 1 + arch/x86/kernel/machine_kexec_32.c | 4 +++- arch/x86/kernel/machine_kexec_64.c | 14 ++++++++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 282630e4c6ea..70ef205489f0 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -164,6 +164,7 @@ struct kimage_arch { }; #else struct kimage_arch { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 469b23d6acc2..5f43cec296c5 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -103,6 +103,7 @@ static void machine_kexec_page_table_set_one( pgd_t *pgd, pmd_t *pmd, pte_t *pte, unsigned long vaddr, unsigned long paddr) { + p4d_t *p4d; pud_t *pud; pgd += pgd_index(vaddr); @@ -110,7 +111,8 @@ static void machine_kexec_page_table_set_one( if (!(pgd_val(*pgd) & _PAGE_PRESENT)) set_pgd(pgd, __pgd(__pa(pmd) | _PAGE_PRESENT)); #endif - pud = pud_offset(pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); if (!(pmd_val(*pmd) & _PAGE_PRESENT)) set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 857cdbd02867..085c3b300d32 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -36,6 +36,7 @@ static struct kexec_file_ops *kexec_file_loaders[] = { static void free_transition_pgtable(struct kimage *image) { + free_page((unsigned long)image->arch.p4d); free_page((unsigned long)image->arch.pud); free_page((unsigned long)image->arch.pmd); free_page((unsigned long)image->arch.pte); @@ -43,6 +44,7 @@ static void free_transition_pgtable(struct kimage *image) static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -53,13 +55,21 @@ static int init_transition_pgtable(struct kimage *image, pgd_t *pgd) paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE); pgd += pgd_index(vaddr); if (!pgd_present(*pgd)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); + if (!p4d) + goto err; + image->arch.p4d = p4d; + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + p4d = p4d_offset(pgd, vaddr); + if (!p4d_present(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) goto err; image->arch.pud = pud; - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } - pud = pud_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); if (!pud_present(*pud)) { pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) -- cgit v1.2.3 From e981316f560482c001dede7f5b29259bde0b8afb Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Mar 2017 21:55:11 +0300 Subject: x86/efi: Add 5-level paging support Allocate additional page table level and ajdust efi_sync_low_kernel_mappings() to work with additional page table level. Signed-off-by: Kirill A. Shutemov Reviewed-by: Matt Fleming Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317185515.8636-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/platform/efi/efi_64.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 8544dae3d1b4..eb64e5b33e37 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -135,6 +135,7 @@ static pgd_t *efi_pgd; int __init efi_alloc_page_tables(void) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; gfp_t gfp_mask; @@ -147,15 +148,20 @@ int __init efi_alloc_page_tables(void) return -ENOMEM; pgd = efi_pgd + pgd_index(EFI_VA_END); + p4d = p4d_alloc(&init_mm, pgd, EFI_VA_END); + if (!p4d) { + free_page((unsigned long)efi_pgd); + return -ENOMEM; + } - pud = pud_alloc_one(NULL, 0); + pud = pud_alloc(&init_mm, p4d, EFI_VA_END); if (!pud) { + if (CONFIG_PGTABLE_LEVELS > 4) + free_page((unsigned long) pgd_page_vaddr(*pgd)); free_page((unsigned long)efi_pgd); return -ENOMEM; } - pgd_populate(NULL, pgd, pud); - return 0; } @@ -190,6 +196,21 @@ void efi_sync_low_kernel_mappings(void) num_entries = pgd_index(EFI_VA_END) - pgd_index(PAGE_OFFSET); memcpy(pgd_efi, pgd_k, sizeof(pgd_t) * num_entries); + /* + * As with PGDs, we share all P4D entries apart from the one entry + * that covers the EFI runtime mapping space. + */ + BUILD_BUG_ON(p4d_index(EFI_VA_END) != p4d_index(MODULES_END)); + BUILD_BUG_ON((EFI_VA_START & P4D_MASK) != (EFI_VA_END & P4D_MASK)); + + pgd_efi = efi_pgd + pgd_index(EFI_VA_END); + pgd_k = pgd_offset_k(EFI_VA_END); + p4d_efi = p4d_offset(pgd_efi, 0); + p4d_k = p4d_offset(pgd_k, 0); + + num_entries = p4d_index(EFI_VA_END); + memcpy(p4d_efi, p4d_k, sizeof(p4d_t) * num_entries); + /* * We share all the PUD entries apart from those that map the * EFI regions. Copy around them. @@ -197,20 +218,15 @@ void efi_sync_low_kernel_mappings(void) BUILD_BUG_ON((EFI_VA_START & ~PUD_MASK) != 0); BUILD_BUG_ON((EFI_VA_END & ~PUD_MASK) != 0); - pgd_efi = efi_pgd + pgd_index(EFI_VA_END); - p4d_efi = p4d_offset(pgd_efi, 0); + p4d_efi = p4d_offset(pgd_efi, EFI_VA_END); + p4d_k = p4d_offset(pgd_k, EFI_VA_END); pud_efi = pud_offset(p4d_efi, 0); - - pgd_k = pgd_offset_k(EFI_VA_END); - p4d_k = p4d_offset(pgd_k, 0); pud_k = pud_offset(p4d_k, 0); num_entries = pud_index(EFI_VA_END); memcpy(pud_efi, pud_k, sizeof(pud_t) * num_entries); - p4d_efi = p4d_offset(pgd_efi, EFI_VA_START); pud_efi = pud_offset(p4d_efi, EFI_VA_START); - p4d_k = p4d_offset(pgd_k, EFI_VA_START); pud_k = pud_offset(p4d_k, EFI_VA_START); num_entries = PTRS_PER_PUD - pud_index(EFI_VA_START); -- cgit v1.2.3 From 4547833602fdd3b672c9b945818cc658d38bfcf1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Mar 2017 21:55:12 +0300 Subject: x86/mm/pat: Add 5-level paging support Straight-forward extension of existing code to support additional page table level. Signed-off-by: Kirill A. Shutemov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317185515.8636-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 54 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 28d42130243c..b5949017dead 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -346,6 +346,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, unsigned int *level) { + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -354,7 +355,15 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d)) + return NULL; + + *level = PG_LEVEL_512G; + if (p4d_large(*p4d) || !p4d_present(*p4d)) + return (pte_t *)p4d; + + pud = pud_offset(p4d, address); if (pud_none(*pud)) return NULL; @@ -406,13 +415,18 @@ static pte_t *_lookup_address_cpa(struct cpa_data *cpa, unsigned long address, pmd_t *lookup_pmd_address(unsigned long address) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(address); if (pgd_none(*pgd)) return NULL; - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + if (p4d_none(*p4d) || p4d_large(*p4d) || !p4d_present(*p4d)) + return NULL; + + pud = pud_offset(p4d, address); if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) return NULL; @@ -477,11 +491,13 @@ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); - pud = pud_offset(pgd, address); + p4d = p4d_offset(pgd, address); + pud = pud_offset(p4d, address); pmd = pmd_offset(pud, address); set_pte_atomic((pte_t *)pmd, pte); } @@ -836,9 +852,9 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) { - pud_t *pud = pud_offset(pgd, start); + pud_t *pud = pud_offset(p4d, start); /* * Not on a GB page boundary? @@ -1004,8 +1020,8 @@ static long populate_pmd(struct cpa_data *cpa, return num_pages; } -static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static int populate_pud(struct cpa_data *cpa, unsigned long start, p4d_t *p4d, + pgprot_t pgprot) { pud_t *pud; unsigned long end; @@ -1026,7 +1042,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, cur_pages = (pre_end - start) >> PAGE_SHIFT; cur_pages = min_t(int, (int)cpa->numpages, cur_pages); - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); /* * Need a PMD page? 
@@ -1047,7 +1063,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (cpa->numpages == cur_pages) return cur_pages; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); pud_pgprot = pgprot_4k_2_large(pgprot); /* @@ -1067,7 +1083,7 @@ static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, if (start < end) { long tmp; - pud = pud_offset(pgd, start); + pud = pud_offset(p4d, start); if (pud_none(*pud)) if (alloc_pmd_page(pud)) return -1; @@ -1090,33 +1106,43 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) { pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ + p4d_t *p4d; pgd_t *pgd_entry; long ret; pgd_entry = cpa->pgd + pgd_index(addr); + if (pgd_none(*pgd_entry)) { + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + if (!p4d) + return -1; + + set_pgd(pgd_entry, __pgd(__pa(p4d) | _KERNPG_TABLE)); + } + /* * Allocate a PUD page and hand it down for mapping. */ - if (pgd_none(*pgd_entry)) { + p4d = p4d_offset(pgd_entry, addr); + if (p4d_none(*p4d)) { pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); if (!pud) return -1; - set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(pgprot) |= pgprot_val(cpa->mask_set); - ret = populate_pud(cpa, addr, pgd_entry, pgprot); + ret = populate_pud(cpa, addr, p4d, pgprot); if (ret < 0) { /* * Leave the PUD page in place in case some other CPU or thread * already found it, but remove any useless entries we just * added to it. */ - unmap_pud_range(pgd_entry, addr, + unmap_pud_range(p4d, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } -- cgit v1.2.3 From d691a3cf8004784303f0a4c1c036edfaa7d64d0b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Mar 2017 21:55:13 +0300 Subject: x86/kasan: Prepare clear_pgds() to switch to With folded p4d, pgd_clear() is a nop. Change clear_pgds() to use p4d_clear() instead. Signed-off-by: Kirill A. Shutemov Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317185515.8636-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 8d63d7a104c3..7b81f01067f2 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -32,8 +32,19 @@ static int __init map_range(struct range *range) static void __init clear_pgds(unsigned long start, unsigned long end) { - for (; start < end; start += PGDIR_SIZE) - pgd_clear(pgd_offset_k(start)); + pgd_t *pgd; + + for (; start < end; start += PGDIR_SIZE) { + pgd = pgd_offset_k(start); + /* + * With folded p4d, pgd_clear() is nop, use p4d_clear() + * instead. 
+ */ + if (CONFIG_PGTABLE_LEVELS < 5) + p4d_clear(p4d_offset(pgd, start)); + else + pgd_clear(pgd); + } } static void __init kasan_map_early_shadow(pgd_t *pgd) -- cgit v1.2.3 From 907cd439029091bcbd67f03cbe45a4c124347731 Mon Sep 17 00:00:00 2001 From: Xiong Zhang Date: Fri, 17 Mar 2017 21:55:14 +0300 Subject: x86/xen: Change __xen_pgd_walk() and xen_cleanmfnmap() to support p4d Split these helpers into a couple of per-level functions and add support for an additional page table level. Signed-off-by: Xiong Zhang [ Split off into separate patch ] Signed-off-by: Kirill A. Shutemov Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317185515.8636-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 245 ++++++++++++++++++++++++++++++++--------------------- arch/x86/xen/mmu.h | 1 + 2 files changed, 150 insertions(+), 96 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ebbfe00133f7..e6adebbb5f8d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -593,6 +593,64 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) } #endif /* CONFIG_PGTABLE_LEVELS == 4 */ +static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; + for (i = 0; i < nr; i++) { + if (!pmd_none(pmd[i])) + flush |= (*func)(mm, pmd_page(pmd[i]), PT_PTE); + } + return flush; +} + +static int xen_pud_walk(struct mm_struct *mm, pud_t *pud, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; + for (i = 0; i < nr; i++) { + pmd_t *pmd; + + if (pud_none(pud[i])) + continue; + + pmd = pmd_offset(&pud[i], 0); + if (PTRS_PER_PMD > 1) + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); + flush |= xen_pmd_walk(mm, pmd, func, + last && i == nr - 1, limit); + } + return flush; +} + +static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, + int (*func)(struct mm_struct *mm, struct page *, enum pt_level), + bool last, unsigned long limit) +{ + int i, nr, flush = 0; + + nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D; + for (i = 0; i < nr; i++) { + pud_t *pud; + + if (p4d_none(p4d[i])) + continue; + + pud = pud_offset(&p4d[i], 0); + if (PTRS_PER_PUD > 1) + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); + flush |= xen_pud_walk(mm, pud, func, + last && i == nr - 1, limit); + } + return flush; +} + /* * (Yet another) pagetable walker. This one is intended for pinning a * pagetable. 
This means that it walks a pagetable and calls the @@ -613,10 +671,8 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, enum pt_level), unsigned long limit) { - int flush = 0; + int i, nr, flush = 0; unsigned hole_low, hole_high; - unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; - unsigned pgdidx, pudidx, pmdidx; /* The limit is the last byte to be touched */ limit--; @@ -633,65 +689,22 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, hole_low = pgd_index(USER_LIMIT); hole_high = pgd_index(PAGE_OFFSET); - pgdidx_limit = pgd_index(limit); -#if PTRS_PER_PUD > 1 - pudidx_limit = pud_index(limit); -#else - pudidx_limit = 0; -#endif -#if PTRS_PER_PMD > 1 - pmdidx_limit = pmd_index(limit); -#else - pmdidx_limit = 0; -#endif - - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { - pud_t *pud; + nr = pgd_index(limit) + 1; + for (i = 0; i < nr; i++) { + p4d_t *p4d; - if (pgdidx >= hole_low && pgdidx < hole_high) + if (i >= hole_low && i < hole_high) continue; - if (!pgd_val(pgd[pgdidx])) + if (pgd_none(pgd[i])) continue; - pud = pud_offset(&pgd[pgdidx], 0); - - if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pud), PT_PUD); - - for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { - pmd_t *pmd; - - if (pgdidx == pgdidx_limit && - pudidx > pudidx_limit) - goto out; - - if (pud_none(pud[pudidx])) - continue; - - pmd = pmd_offset(&pud[pudidx], 0); - - if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); - - for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { - struct page *pte; - - if (pgdidx == pgdidx_limit && - pudidx == pudidx_limit && - pmdidx > pmdidx_limit) - goto out; - - if (pmd_none(pmd[pmdidx])) - continue; - - pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(mm, pte, PT_PTE); - } - } + p4d = p4d_offset(&pgd[i], 0); + if (PTRS_PER_P4D > 1) + flush |= (*func)(mm, virt_to_page(p4d), PT_P4D); + flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); } -out: /* Do the top level last, so that the callbacks can use it as a cue to do final things like tlb flushes. 
*/ flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); @@ -1150,57 +1163,97 @@ static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) xen_free_ro_pages(pa, PAGE_SIZE); } +static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) +{ + unsigned long pa; + pte_t *pte_tbl; + int i; + + if (pmd_large(*pmd)) { + pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PMD_SIZE); + return; + } + + pte_tbl = pte_offset_kernel(pmd, 0); + for (i = 0; i < PTRS_PER_PTE; i++) { + if (pte_none(pte_tbl[i])) + continue; + pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; + xen_free_ro_pages(pa, PAGE_SIZE); + } + set_pmd(pmd, __pmd(0)); + xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); +} + +static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) +{ + unsigned long pa; + pmd_t *pmd_tbl; + int i; + + if (pud_large(*pud)) { + pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, PUD_SIZE); + return; + } + + pmd_tbl = pmd_offset(pud, 0); + for (i = 0; i < PTRS_PER_PMD; i++) { + if (pmd_none(pmd_tbl[i])) + continue; + xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); + } + set_pud(pud, __pud(0)); + xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); +} + +static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) +{ + unsigned long pa; + pud_t *pud_tbl; + int i; + + if (p4d_large(*p4d)) { + pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; + xen_free_ro_pages(pa, P4D_SIZE); + return; + } + + pud_tbl = pud_offset(p4d, 0); + for (i = 0; i < PTRS_PER_PUD; i++) { + if (pud_none(pud_tbl[i])) + continue; + xen_cleanmfnmap_pud(pud_tbl + i, unpin); + } + set_p4d(p4d, __p4d(0)); + xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); +} + /* * Since it is well isolated we can (and since it is perhaps large we should) * also free the page tables mapping the initial P->M table. */ static void __init xen_cleanmfnmap(unsigned long vaddr) { - unsigned long va = vaddr & PMD_MASK; - unsigned long pa; - pgd_t *pgd = pgd_offset_k(va); - pud_t *pud_page = pud_offset(pgd, 0); - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgd; + p4d_t *p4d; unsigned int i; bool unpin; unpin = (vaddr == 2 * PGDIR_SIZE); - set_pgd(pgd, __pgd(0)); - do { - pud = pud_page + pud_index(va); - if (pud_none(*pud)) { - va += PUD_SIZE; - } else if (pud_large(*pud)) { - pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PUD_SIZE); - va += PUD_SIZE; - } else { - pmd = pmd_offset(pud, va); - if (pmd_large(*pmd)) { - pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; - xen_free_ro_pages(pa, PMD_SIZE); - } else if (!pmd_none(*pmd)) { - pte = pte_offset_kernel(pmd, va); - set_pmd(pmd, __pmd(0)); - for (i = 0; i < PTRS_PER_PTE; ++i) { - if (pte_none(pte[i])) - break; - pa = pte_pfn(pte[i]) << PAGE_SHIFT; - xen_free_ro_pages(pa, PAGE_SIZE); - } - xen_cleanmfnmap_free_pgtbl(pte, unpin); - } - va += PMD_SIZE; - if (pmd_index(va)) - continue; - set_pud(pud, __pud(0)); - xen_cleanmfnmap_free_pgtbl(pmd, unpin); - } - - } while (pud_index(va) || pmd_index(va)); - xen_cleanmfnmap_free_pgtbl(pud_page, unpin); + vaddr &= PMD_MASK; + pgd = pgd_offset_k(vaddr); + p4d = p4d_offset(pgd, 0); + for (i = 0; i < PTRS_PER_P4D; i++) { + if (p4d_none(p4d[i])) + continue; + xen_cleanmfnmap_p4d(p4d + i, unpin); + } + if (IS_ENABLED(CONFIG_X86_5LEVEL)) { + set_pgd(pgd, __pgd(0)); + xen_cleanmfnmap_free_pgtbl(p4d, unpin); + } } static void __init xen_pagetable_p2m_free(void) diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 73809bb951b4..3fe2b3292915 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -5,6 +5,7 @@ enum pt_level { PT_PGD, + PT_P4D, PT_PUD, PT_PMD, 
PT_PTE -- cgit v1.2.3 From f2a6a7050109e0a5c7a84c70aa6010f682b2f1ee Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 17 Mar 2017 21:55:15 +0300 Subject: x86: Convert the rest of the code to support p4d_t This patch converts x86 to use proper folding of a new (fifth) page table level with . That's a bit of a kitchen sink patch, but I don't see how to split it further without hurting bisectability. Signed-off-by: Kirill A. Shutemov Acked-by: Thomas Gleixner Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170317185515.8636-7-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 33 +++++- arch/x86/include/asm/paravirt_types.h | 12 ++- arch/x86/include/asm/pgalloc.h | 35 ++++++- arch/x86/include/asm/pgtable.h | 59 ++++++++++- arch/x86/include/asm/pgtable_64.h | 12 ++- arch/x86/include/asm/pgtable_types.h | 10 +- arch/x86/include/asm/xen/page.h | 8 +- arch/x86/kernel/paravirt.c | 10 +- arch/x86/mm/init_64.c | 183 +++++++++++++++++++++++++++------- arch/x86/xen/mmu.c | 152 ++++++++++++++++------------ include/trace/events/xen.h | 28 +++--- 11 files changed, 401 insertions(+), 141 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 0489884fdc44..158d877ce9e9 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -536,7 +536,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud) PVOP_VCALL2(pv_mmu_ops.set_pud, pudp, val); } -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 static inline pud_t __pud(pudval_t val) { pudval_t ret; @@ -565,6 +565,32 @@ static inline pudval_t pud_val(pud_t pud) return ret; } +static inline void pud_clear(pud_t *pudp) +{ + set_pud(pudp, __pud(0)); +} + +static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + p4dval_t val = native_p4d_val(p4d); + + if (sizeof(p4dval_t) > sizeof(long)) + PVOP_VCALL3(pv_mmu_ops.set_p4d, p4dp, + val, (u64)val >> 32); + else + PVOP_VCALL2(pv_mmu_ops.set_p4d, p4dp, + val); +} + +static inline void p4d_clear(p4d_t *p4dp) +{ + set_p4d(p4dp, __p4d(0)); +} + +#if CONFIG_PGTABLE_LEVELS >= 5 + +#error FIXME + static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { pgdval_t val = native_pgd_val(pgd); @@ -582,10 +608,7 @@ static inline void pgd_clear(pgd_t *pgdp) set_pgd(pgdp, __pgd(0)); } -static inline void pud_clear(pud_t *pudp) -{ - set_pud(pudp, __pud(0)); -} +#endif /* CONFIG_PGTABLE_LEVELS == 5 */ #endif /* CONFIG_PGTABLE_LEVELS == 4 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b060f962d581..93c49cf09b63 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -279,12 +279,18 @@ struct pv_mmu_ops { struct paravirt_callee_save pmd_val; struct paravirt_callee_save make_pmd; -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 struct paravirt_callee_save pud_val; struct paravirt_callee_save make_pud; - void (*set_pgd)(pgd_t *pudp, pgd_t pgdval); -#endif /* CONFIG_PGTABLE_LEVELS == 4 */ + void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval); + +#if CONFIG_PGTABLE_LEVELS >= 5 +#error FIXME +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ + +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ + #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ struct pv_lazy_ops 
lazy_mode; diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index b6d425999f99..2f585054c63c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -121,10 +121,10 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud))); + set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) @@ -150,6 +150,37 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, ___pud_free_tlb(tlb, pud); } +#if CONFIG_PGTABLE_LEVELS > 4 +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) +{ + paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); +} + +static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_KERNEL_ACCOUNT; + + if (mm == &init_mm) + gfp &= ~__GFP_ACCOUNT; + return (p4d_t *)get_zeroed_page(gfp); +} + +static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) +{ + BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); + free_page((unsigned long)p4d); +} + +extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); + +static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long address) +{ + ___p4d_free_tlb(tlb, p4d); +} + +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 18a6f5460461..bf51e6054577 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -53,11 +53,19 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd) -#ifndef __PAGETABLE_PUD_FOLDED +#ifndef __PAGETABLE_P4D_FOLDED #define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd) #define pgd_clear(pgd) native_pgd_clear(pgd) #endif +#ifndef set_p4d +# define set_p4d(p4dp, p4d) native_set_p4d(p4dp, p4d) +#endif + +#ifndef __PAGETABLE_PUD_FOLDED +#define p4d_clear(p4d) native_p4d_clear(p4d) +#endif + #ifndef set_pud # define set_pud(pudp, pud) native_set_pud(pudp, pud) #endif @@ -74,6 +82,11 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) +#ifndef __PAGETABLE_P4D_FOLDED +#define p4d_val(x) native_p4d_val(x) +#define __p4d(x) native_make_p4d(x) +#endif + #ifndef __PAGETABLE_PUD_FOLDED #define pud_val(x) native_pud_val(x) #define __pud(x) native_make_pud(x) @@ -554,6 +567,7 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) #define pte_pgprot(x) __pgprot(pte_flags(x)) #define pmd_pgprot(x) __pgprot(pmd_flags(x)) #define pud_pgprot(x) __pgprot(pud_flags(x)) +#define p4d_pgprot(x) __pgprot(p4d_flags(x)) #define canon_pgprot(p) __pgprot(massage_pgprot(p)) @@ -792,12 +806,47 @@ static inline unsigned long pud_index(unsigned long address) return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1); } +#if CONFIG_PGTABLE_LEVELS > 3 +static inline int p4d_none(p4d_t p4d) +{ + return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; +} + +static inline int p4d_present(p4d_t p4d) +{ + return p4d_flags(p4d) & 
_PAGE_PRESENT; +} + +static inline unsigned long p4d_page_vaddr(p4d_t p4d) +{ + return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d)); +} + +/* + * Currently stuck as a macro due to indirect forward reference to + * linux/mmzone.h's __section_mem_map_addr() definition: + */ +#define p4d_page(p4d) \ + pfn_to_page((p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT) + +/* Find an entry in the third-level page table.. */ +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address); +} + +static inline int p4d_bad(p4d_t p4d) +{ + return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; +} +#endif /* CONFIG_PGTABLE_LEVELS > 3 */ + static inline unsigned long p4d_index(unsigned long address) { return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1); } -#if CONFIG_PGTABLE_LEVELS > 3 +#if CONFIG_PGTABLE_LEVELS > 4 static inline int pgd_present(pgd_t pgd) { return pgd_flags(pgd) & _PAGE_PRESENT; @@ -815,9 +864,9 @@ static inline unsigned long pgd_page_vaddr(pgd_t pgd) #define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) /* to find an entry in a page-table-directory. */ -static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - return (pud_t *)pgd_page_vaddr(*pgd) + pud_index(address); + return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address); } static inline int pgd_bad(pgd_t pgd) @@ -835,7 +884,7 @@ static inline int pgd_none(pgd_t pgd) */ return !native_pgd_val(pgd); } -#endif /* CONFIG_PGTABLE_LEVELS > 3 */ +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1a4bc71534d4..0593a1ae7573 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -41,9 +41,9 @@ extern void paging_init(void); struct mm_struct; +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte); void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte); - static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -121,6 +121,16 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp) #endif } +static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) +{ + *p4dp = p4d; +} + +static inline void native_p4d_clear(p4d_t *p4d) +{ + native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)}); +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { *pgdp = pgd; diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index df08535f774a..4930afe9df0a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -277,11 +277,11 @@ static inline pgdval_t pgd_flags(pgd_t pgd) #error FIXME #else -#include +#include static inline p4dval_t native_p4d_val(p4d_t p4d) { - return native_pgd_val(p4d); + return native_pgd_val(p4d.pgd); } #endif @@ -298,12 +298,11 @@ static inline pudval_t native_pud_val(pud_t pud) return pud.pud; } #else -#define __ARCH_USE_5LEVEL_HACK #include static inline pudval_t native_pud_val(pud_t pud) { - return native_pgd_val(pud.pgd); + return native_pgd_val(pud.p4d.pgd); } #endif @@ -320,12 +319,11 @@ static inline pmdval_t native_pmd_val(pmd_t pmd) return pmd.pmd; } #else -#define __ARCH_USE_5LEVEL_HACK #include static inline pmdval_t native_pmd_val(pmd_t pmd) { - return native_pgd_val(pmd.pud.pgd); + return native_pgd_val(pmd.pud.p4d.pgd); } #endif diff --git 
a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 33cbd3db97b9..bf2ca56fba11 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -279,13 +279,17 @@ static inline pte_t __pte_ma(pteval_t x) #define pmd_val_ma(v) ((v).pmd) #ifdef __PAGETABLE_PUD_FOLDED -#define pud_val_ma(v) ((v).pgd.pgd) +#define pud_val_ma(v) ((v).p4d.pgd.pgd) #else #define pud_val_ma(v) ((v).pud) #endif #define __pmd_ma(x) ((pmd_t) { (x) } ) -#define pgd_val_ma(x) ((x).pgd) +#ifdef __PAGETABLE_P4D_FOLDED +#define p4d_val_ma(x) ((x).pgd.pgd) +#else +#define p4d_val_ma(x) ((x).p4d) +#endif void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4797e87b0fb6..110daf22f5c7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -430,12 +430,16 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PTE_IDENT, .make_pud = PTE_IDENT, - .set_pgd = native_set_pgd, -#endif + .set_p4d = native_set_p4d, + +#if CONFIG_PGTABLE_LEVELS >= 5 +#error FIXME +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ +#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ .pte_val = PTE_IDENT, diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 15173d37f399..7bdda6f1d135 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -97,28 +97,38 @@ void sync_global_pgds(unsigned long start, unsigned long end) unsigned long address; for (address = start; address <= end; address += PGDIR_SIZE) { - const pgd_t *pgd_ref = pgd_offset_k(address); + pgd_t *pgd_ref = pgd_offset_k(address); + const p4d_t *p4d_ref; struct page *page; - if (pgd_none(*pgd_ref)) + /* + * With folded p4d, pgd_none() is always false, we need to + * handle synchonization on p4d level. + */ + BUILD_BUG_ON(pgd_none(*pgd_ref)); + p4d_ref = p4d_offset(pgd_ref, address); + + if (p4d_none(*p4d_ref)) continue; spin_lock(&pgd_lock); list_for_each_entry(page, &pgd_list, lru) { pgd_t *pgd; + p4d_t *p4d; spinlock_t *pgt_lock; pgd = (pgd_t *)page_address(page) + pgd_index(address); + p4d = p4d_offset(pgd, address); /* the pgt_lock only for Xen */ pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) - BUG_ON(pgd_page_vaddr(*pgd) - != pgd_page_vaddr(*pgd_ref)); + if (!p4d_none(*p4d_ref) && !p4d_none(*p4d)) + BUG_ON(p4d_page_vaddr(*p4d) + != p4d_page_vaddr(*p4d_ref)); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); + if (p4d_none(*p4d)) + set_p4d(p4d, *p4d_ref); spin_unlock(pgt_lock); } @@ -149,16 +159,28 @@ static __ref void *spp_getpage(void) return ptr; } -static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr) +static p4d_t *fill_p4d(pgd_t *pgd, unsigned long vaddr) { if (pgd_none(*pgd)) { - pud_t *pud = (pud_t *)spp_getpage(); - pgd_populate(&init_mm, pgd, pud); - if (pud != pud_offset(pgd, 0)) + p4d_t *p4d = (p4d_t *)spp_getpage(); + pgd_populate(&init_mm, pgd, p4d); + if (p4d != p4d_offset(pgd, 0)) printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n", - pud, pud_offset(pgd, 0)); + p4d, p4d_offset(pgd, 0)); + } + return p4d_offset(pgd, vaddr); +} + +static pud_t *fill_pud(p4d_t *p4d, unsigned long vaddr) +{ + if (p4d_none(*p4d)) { + pud_t *pud = (pud_t *)spp_getpage(); + p4d_populate(&init_mm, p4d, pud); + if (pud != pud_offset(p4d, 0)) + printk(KERN_ERR "PAGETABLE BUG #01! 
%p <-> %p\n", + pud, pud_offset(p4d, 0)); } - return pud_offset(pgd, vaddr); + return pud_offset(p4d, vaddr); } static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) @@ -167,7 +189,7 @@ static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr) pmd_t *pmd = (pmd_t *) spp_getpage(); pud_populate(&init_mm, pud, pmd); if (pmd != pmd_offset(pud, 0)) - printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n", + printk(KERN_ERR "PAGETABLE BUG #02! %p <-> %p\n", pmd, pmd_offset(pud, 0)); } return pmd_offset(pud, vaddr); @@ -179,20 +201,15 @@ static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr) pte_t *pte = (pte_t *) spp_getpage(); pmd_populate_kernel(&init_mm, pmd, pte); if (pte != pte_offset_kernel(pmd, 0)) - printk(KERN_ERR "PAGETABLE BUG #02!\n"); + printk(KERN_ERR "PAGETABLE BUG #03!\n"); } return pte_offset_kernel(pmd, vaddr); } -void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +static void __set_pte_vaddr(pud_t *pud, unsigned long vaddr, pte_t new_pte) { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pud = pud_page + pud_index(vaddr); - pmd = fill_pmd(pud, vaddr); - pte = fill_pte(pmd, vaddr); + pmd_t *pmd = fill_pmd(pud, vaddr); + pte_t *pte = fill_pte(pmd, vaddr); set_pte(pte, new_pte); @@ -203,10 +220,25 @@ void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) __flush_tlb_one(vaddr); } +void set_pte_vaddr_p4d(p4d_t *p4d_page, unsigned long vaddr, pte_t new_pte) +{ + p4d_t *p4d = p4d_page + p4d_index(vaddr); + pud_t *pud = fill_pud(p4d, vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + +void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) +{ + pud_t *pud = pud_page + pud_index(vaddr); + + __set_pte_vaddr(pud, vaddr, new_pte); +} + void set_pte_vaddr(unsigned long vaddr, pte_t pteval) { pgd_t *pgd; - pud_t *pud_page; + p4d_t *p4d_page; pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval)); @@ -216,17 +248,20 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) "PGD FIXMAP MISSING, it should be setup in head.S!\n"); return; } - pud_page = (pud_t*)pgd_page_vaddr(*pgd); - set_pte_vaddr_pud(pud_page, vaddr, pteval); + + p4d_page = p4d_offset(pgd, 0); + set_pte_vaddr_p4d(p4d_page, vaddr, pteval); } pmd_t * __init populate_extra_pmd(unsigned long vaddr) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pgd = pgd_offset_k(vaddr); - pud = fill_pud(pgd, vaddr); + p4d = fill_p4d(pgd, vaddr); + pud = fill_pud(p4d, vaddr); return fill_pmd(pud, vaddr); } @@ -245,6 +280,7 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, enum page_cache_mode cache) { pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgprot_t prot; @@ -255,11 +291,17 @@ static void __init __init_extra_mapping(unsigned long phys, unsigned long size, for (; size; phys += PMD_SIZE, size -= PMD_SIZE) { pgd = pgd_offset_k((unsigned long)__va(phys)); if (pgd_none(*pgd)) { + p4d = (p4d_t *) spp_getpage(); + set_pgd(pgd, __pgd(__pa(p4d) | _KERNPG_TABLE | + _PAGE_USER)); + } + p4d = p4d_offset(pgd, (unsigned long)__va(phys)); + if (p4d_none(*p4d)) { pud = (pud_t *) spp_getpage(); - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE | + set_p4d(p4d, __p4d(__pa(pud) | _KERNPG_TABLE | _PAGE_USER)); } - pud = pud_offset(pgd, (unsigned long)__va(phys)); + pud = pud_offset(p4d, (unsigned long)__va(phys)); if (pud_none(*pud)) { pmd = (pmd_t *) spp_getpage(); set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | @@ -563,12 +605,15 @@ kernel_physical_mapping_init(unsigned long paddr_start, for (; vaddr < vaddr_end; vaddr = vaddr_next) { pgd_t *pgd = 
pgd_offset_k(vaddr); + p4d_t *p4d; pud_t *pud; vaddr_next = (vaddr & PGDIR_MASK) + PGDIR_SIZE; - if (pgd_val(*pgd)) { - pud = (pud_t *)pgd_page_vaddr(*pgd); + BUILD_BUG_ON(pgd_none(*pgd)); + p4d = p4d_offset(pgd, vaddr); + if (p4d_val(*p4d)) { + pud = (pud_t *)p4d_page_vaddr(*p4d); paddr_last = phys_pud_init(pud, __pa(vaddr), __pa(vaddr_end), page_size_mask); @@ -580,7 +625,7 @@ kernel_physical_mapping_init(unsigned long paddr_start, page_size_mask); spin_lock(&init_mm.page_table_lock); - pgd_populate(&init_mm, pgd, pud); + p4d_populate(&init_mm, p4d, pud); spin_unlock(&init_mm.page_table_lock); pgd_changed = true; } @@ -726,6 +771,24 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) spin_unlock(&init_mm.page_table_lock); } +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + /* free a pud talbe */ + free_pagetable(p4d_page(*p4d), 0); + spin_lock(&init_mm.page_table_lock); + p4d_clear(p4d); + spin_unlock(&init_mm.page_table_lock); +} + static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, bool direct) @@ -908,6 +971,32 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, update_page_count(PG_LEVEL_1G, -pages); } +static void __meminit +remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, + bool direct) +{ + unsigned long next, pages = 0; + pud_t *pud_base; + p4d_t *p4d; + + p4d = p4d_start + p4d_index(addr); + for (; addr < end; addr = next, p4d++) { + next = p4d_addr_end(addr, end); + + if (!p4d_present(*p4d)) + continue; + + BUILD_BUG_ON(p4d_large(*p4d)); + + pud_base = (pud_t *)p4d_page_vaddr(*p4d); + remove_pud_table(pud_base, addr, next, direct); + free_pud_table(pud_base, p4d); + } + + if (direct) + update_page_count(PG_LEVEL_512G, -pages); +} + /* start and end are both virtual address. 
*/ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct) @@ -915,7 +1004,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) unsigned long next; unsigned long addr; pgd_t *pgd; - pud_t *pud; + p4d_t *p4d; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); @@ -924,8 +1013,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) if (!pgd_present(*pgd)) continue; - pud = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud, addr, next, direct); + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + remove_p4d_table(p4d, addr, next, direct); } flush_tlb_all(); @@ -1090,6 +1179,7 @@ int kern_addr_valid(unsigned long addr) { unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; pte_t *pte; @@ -1101,7 +1191,11 @@ int kern_addr_valid(unsigned long addr) if (pgd_none(*pgd)) return 0; - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return 0; + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) return 0; @@ -1158,6 +1252,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, unsigned long addr; unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; @@ -1168,7 +1263,11 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (!pgd) return -ENOMEM; - pud = vmemmap_pud_populate(pgd, addr, node); + p4d = vmemmap_p4d_populate(pgd, addr, node); + if (!p4d) + return -ENOMEM; + + pud = vmemmap_pud_populate(p4d, addr, node); if (!pud) return -ENOMEM; @@ -1236,6 +1335,7 @@ void register_page_bootmem_memmap(unsigned long section_nr, unsigned long end = (unsigned long)(start_page + size); unsigned long next; pgd_t *pgd; + p4d_t *p4d; pud_t *pud; pmd_t *pmd; unsigned int nr_pages; @@ -1251,7 +1351,14 @@ void register_page_bootmem_memmap(unsigned long section_nr, } get_page_bootmem(section_nr, pgd_page(*pgd), MIX_SECTION_INFO); - pud = pud_offset(pgd, addr); + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + continue; + } + get_page_bootmem(section_nr, p4d_page(*p4d), MIX_SECTION_INFO); + + pud = pud_offset(p4d, addr); if (pud_none(*pud)) { next = (addr + PAGE_SIZE) & PAGE_MASK; continue; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e6adebbb5f8d..4d4b7bc48f5d 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -535,40 +535,41 @@ static pgd_t *xen_get_user_pgd(pgd_t *pgd) return user_ptr; } -static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) { struct mmu_update u; u.ptr = virt_to_machine(ptr).maddr; - u.val = pgd_val_ma(val); + u.val = p4d_val_ma(val); xen_extend_mmu_update(&u); } /* - * Raw hypercall-based set_pgd, intended for in early boot before + * Raw hypercall-based set_p4d, intended for in early boot before * there's a page structure. This implies: * 1. The only existing pagetable is the kernel's * 2. It is always pinned * 3. 
It has no user pagetable attached to it */ -static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) { preempt_disable(); xen_mc_batch(); - __xen_set_pgd_hyper(ptr, val); + __xen_set_p4d_hyper(ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } -static void xen_set_pgd(pgd_t *ptr, pgd_t val) +static void xen_set_p4d(p4d_t *ptr, p4d_t val) { - pgd_t *user_ptr = xen_get_user_pgd(ptr); + pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); + pgd_t pgd_val; - trace_xen_mmu_set_pgd(ptr, user_ptr, val); + trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); /* If page is not pinned, we can just update the entry directly */ @@ -576,7 +577,8 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) *ptr = val; if (user_ptr) { WARN_ON(xen_page_pinned(user_ptr)); - *user_ptr = val; + pgd_val.pgd = p4d_val_ma(val); + *user_ptr = pgd_val; } return; } @@ -585,9 +587,9 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val) user updates together. */ xen_mc_batch(); - __xen_set_pgd_hyper(ptr, val); + __xen_set_p4d_hyper(ptr, val); if (user_ptr) - __xen_set_pgd_hyper(user_ptr, val); + __xen_set_p4d_hyper((p4d_t *)user_ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -1591,7 +1593,6 @@ static int xen_pgd_alloc(struct mm_struct *mm) BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); } #endif - return ret; } @@ -1783,7 +1784,7 @@ static void xen_release_pmd(unsigned long pfn) xen_release_ptpage(pfn, PT_PMD); } -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); @@ -2124,21 +2125,27 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) */ void __init xen_relocate_p2m(void) { - phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys; + phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys; unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end; - int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud; + int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d; pte_t *pt; pmd_t *pmd; pud_t *pud; + p4d_t *p4d = NULL; pgd_t *pgd; unsigned long *new_p2m; + int save_pud; size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT; n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT; n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT; - n_pud = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; - n_frames = n_pte + n_pt + n_pmd + n_pud; + n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT; + if (PTRS_PER_P4D > 1) + n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT; + else + n_p4d = 0; + n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d; new_area = xen_find_free_area(PFN_PHYS(n_frames)); if (!new_area) { @@ -2154,55 +2161,76 @@ void __init xen_relocate_p2m(void) * To avoid any possible virtual address collision, just use * 2 * PUD_SIZE for the new area. 
*/ - pud_phys = new_area; + p4d_phys = new_area; + pud_phys = p4d_phys + PFN_PHYS(n_p4d); pmd_phys = pud_phys + PFN_PHYS(n_pud); pt_phys = pmd_phys + PFN_PHYS(n_pmd); p2m_pfn = PFN_DOWN(pt_phys) + n_pt; pgd = __va(read_cr3()); new_p2m = (unsigned long *)(2 * PGDIR_SIZE); - for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { - pud = early_memremap(pud_phys, PAGE_SIZE); - clear_page(pud); - for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); - idx_pmd++) { - pmd = early_memremap(pmd_phys, PAGE_SIZE); - clear_page(pmd); - for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); - idx_pt++) { - pt = early_memremap(pt_phys, PAGE_SIZE); - clear_page(pt); - for (idx_pte = 0; - idx_pte < min(n_pte, PTRS_PER_PTE); - idx_pte++) { - set_pte(pt + idx_pte, - pfn_pte(p2m_pfn, PAGE_KERNEL)); - p2m_pfn++; + idx_p4d = 0; + save_pud = n_pud; + do { + if (n_p4d > 0) { + p4d = early_memremap(p4d_phys, PAGE_SIZE); + clear_page(p4d); + n_pud = min(save_pud, PTRS_PER_P4D); + } + for (idx_pud = 0; idx_pud < n_pud; idx_pud++) { + pud = early_memremap(pud_phys, PAGE_SIZE); + clear_page(pud); + for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD); + idx_pmd++) { + pmd = early_memremap(pmd_phys, PAGE_SIZE); + clear_page(pmd); + for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD); + idx_pt++) { + pt = early_memremap(pt_phys, PAGE_SIZE); + clear_page(pt); + for (idx_pte = 0; + idx_pte < min(n_pte, PTRS_PER_PTE); + idx_pte++) { + set_pte(pt + idx_pte, + pfn_pte(p2m_pfn, PAGE_KERNEL)); + p2m_pfn++; + } + n_pte -= PTRS_PER_PTE; + early_memunmap(pt, PAGE_SIZE); + make_lowmem_page_readonly(__va(pt_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, + PFN_DOWN(pt_phys)); + set_pmd(pmd + idx_pt, + __pmd(_PAGE_TABLE | pt_phys)); + pt_phys += PAGE_SIZE; } - n_pte -= PTRS_PER_PTE; - early_memunmap(pt, PAGE_SIZE); - make_lowmem_page_readonly(__va(pt_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, - PFN_DOWN(pt_phys)); - set_pmd(pmd + idx_pt, - __pmd(_PAGE_TABLE | pt_phys)); - pt_phys += PAGE_SIZE; + n_pt -= PTRS_PER_PMD; + early_memunmap(pmd, PAGE_SIZE); + make_lowmem_page_readonly(__va(pmd_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, + PFN_DOWN(pmd_phys)); + set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); + pmd_phys += PAGE_SIZE; } - n_pt -= PTRS_PER_PMD; - early_memunmap(pmd, PAGE_SIZE); - make_lowmem_page_readonly(__va(pmd_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE, - PFN_DOWN(pmd_phys)); - set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys)); - pmd_phys += PAGE_SIZE; + n_pmd -= PTRS_PER_PUD; + early_memunmap(pud, PAGE_SIZE); + make_lowmem_page_readonly(__va(pud_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); + if (n_p4d > 0) + set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys)); + else + set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); + pud_phys += PAGE_SIZE; } - n_pmd -= PTRS_PER_PUD; - early_memunmap(pud, PAGE_SIZE); - make_lowmem_page_readonly(__va(pud_phys)); - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys)); - set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys)); - pud_phys += PAGE_SIZE; - } + if (n_p4d > 0) { + save_pud -= PTRS_PER_P4D; + early_memunmap(p4d, PAGE_SIZE); + make_lowmem_page_readonly(__va(p4d_phys)); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys)); + set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys)); + p4d_phys += PAGE_SIZE; + } + } while (++idx_p4d < n_p4d); /* Now copy the old p2m info to the new area. 
*/ memcpy(new_p2m, xen_p2m_addr, size); @@ -2432,8 +2460,8 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; -#if CONFIG_PGTABLE_LEVELS == 4 - pv_mmu_ops.set_pgd = xen_set_pgd; +#if CONFIG_PGTABLE_LEVELS >= 4 + pv_mmu_ops.set_p4d = xen_set_p4d; #endif /* This will work as long as patching hasn't happened yet @@ -2442,7 +2470,7 @@ static void __init xen_post_allocator_init(void) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 pv_mmu_ops.alloc_pud = xen_alloc_pud; pv_mmu_ops.release_pud = xen_release_pud; #endif @@ -2508,10 +2536,10 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .make_pmd = PV_CALLEE_SAVE(xen_make_pmd), .pmd_val = PV_CALLEE_SAVE(xen_pmd_val), -#if CONFIG_PGTABLE_LEVELS == 4 +#if CONFIG_PGTABLE_LEVELS >= 4 .pud_val = PV_CALLEE_SAVE(xen_pud_val), .make_pud = PV_CALLEE_SAVE(xen_make_pud), - .set_pgd = xen_set_pgd_hyper, + .set_p4d = xen_set_p4d_hyper, .alloc_pud = xen_alloc_pmd_init, .release_pud = xen_release_pmd_init, diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index bce990f5a35d..31acce9019a6 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud, (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval) ); -TRACE_EVENT(xen_mmu_set_pgd, - TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval), - TP_ARGS(pgdp, user_pgdp, pgdval), +TRACE_EVENT(xen_mmu_set_p4d, + TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval), + TP_ARGS(p4dp, user_p4dp, p4dval), TP_STRUCT__entry( - __field(pgd_t *, pgdp) - __field(pgd_t *, user_pgdp) - __field(pgdval_t, pgdval) - ), - TP_fast_assign(__entry->pgdp = pgdp; - __entry->user_pgdp = user_pgdp; - __entry->pgdval = pgdval.pgd), - TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)", - __entry->pgdp, __entry->user_pgdp, - (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)), - (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval) + __field(p4d_t *, p4dp) + __field(p4d_t *, user_p4dp) + __field(p4dval_t, p4dval) + ), + TP_fast_assign(__entry->p4dp = p4dp; + __entry->user_p4dp = user_p4dp; + __entry->p4dval = p4d_val(p4dval)), + TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)", + __entry->p4dp, __entry->user_p4dp, + (int)sizeof(p4dval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->p4dval)), + (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval) ); TRACE_EVENT(xen_mmu_pud_clear, -- cgit v1.2.3 From 591a3d7c09fa08baff48ad86c2347dbd28a52753 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 24 Mar 2017 14:13:05 +0300 Subject: mm: Fix false-positive VM_BUG_ON() in page_cache_{get,add}_speculative() 0day testing by Fengguang Wu triggered this crash while running Trinity: kernel BUG at include/linux/pagemap.h:151! ... CPU: 0 PID: 458 Comm: trinity-c0 Not tainted 4.11.0-rc2-00251-g2947ba0 #1 ... Call Trace: __get_user_pages_fast() get_user_pages_fast() get_futex_key() futex_requeue() do_futex() SyS_futex() do_syscall_64() entry_SYSCALL64_slow_path() It' VM_BUG_ON() due to false-negative in_atomic(). We call page_cache_get_speculative() with disabled local interrupts. It should be atomic enough. So let's check for disabled interrupts in the VM_BUG_ON() condition too, to resolve this. 
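The effect of the relaxed assertion is easy to model in a few lines of plain userspace C (nothing below is kernel code; the helper names are made up for illustration). The fast GUP path disables local interrupts but does not touch the preempt count, so in_atomic() reports false even though the context clearly cannot sleep:

#include <stdbool.h>
#include <stdio.h>

/* VM_BUG_ON(!in_atomic()) -- the check before this patch */
static bool old_assert_fires(bool in_atomic, bool irqs_disabled)
{
	(void)irqs_disabled;
	return !in_atomic;
}

/* VM_BUG_ON(!in_atomic() && !irqs_disabled()) -- the check after this patch */
static bool new_assert_fires(bool in_atomic, bool irqs_disabled)
{
	return !in_atomic && !irqs_disabled;
}

int main(void)
{
	/* fast GUP case: preempt count untouched, local interrupts off */
	bool in_atomic = false, irqs_disabled = true;

	printf("old assertion fires: %d\n", old_assert_fires(in_atomic, irqs_disabled));
	printf("new assertion fires: %d\n", new_assert_fires(in_atomic, irqs_disabled));
	return 0;
}

With in_atomic == false and irqs_disabled == true the old predicate triggers and the new one stays quiet, which is exactly the difference between the crash above and a clean run.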
( This got triggered by the conversion of the x86 GUP code to the generic GUP code. ) Reported-by: Fengguang Wu Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov Cc: LKP Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170324114709.pcytvyb3d6ajux33@black.fi.intel.com Signed-off-by: Ingo Molnar --- include/linux/pagemap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 84943e8057ef..316a19f6b635 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page) #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing @@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); page_ref_add(page, count); -- cgit v1.2.3 From fdd3d8ce0ea62c32b039af45cc5538b728e366d9 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 28 Mar 2017 13:48:06 +0300 Subject: x86/dump_pagetables: Add support for 5-level paging Simple extension to support one more page table level. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170328104806.41711-1-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 59 +++++++++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 58b5bee7ea27..9f305be71a72 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -110,7 +110,8 @@ static struct addr_marker address_markers[] = { #define PTE_LEVEL_MULT (PAGE_SIZE) #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT) #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) 
\ ({ \ @@ -286,14 +287,13 @@ static void note_page(struct seq_file *m, struct pg_state *st, } } -static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, - unsigned long P) +static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, unsigned long P) { int i; pte_t *start; pgprotval_t prot; - start = (pte_t *) pmd_page_vaddr(addr); + start = (pte_t *)pmd_page_vaddr(addr); for (i = 0; i < PTRS_PER_PTE; i++) { prot = pte_flags(*start); st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT); @@ -304,14 +304,13 @@ static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr, #if PTRS_PER_PMD > 1 -static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, - unsigned long P) +static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, unsigned long P) { int i; pmd_t *start; pgprotval_t prot; - start = (pmd_t *) pud_page_vaddr(addr); + start = (pmd_t *)pud_page_vaddr(addr); for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { @@ -347,15 +346,14 @@ static bool pud_already_checked(pud_t *prev_pud, pud_t *pud, bool checkwx) return checkwx && prev_pud && (pud_val(*prev_pud) == pud_val(*pud)); } -static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, - unsigned long P) +static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr, unsigned long P) { int i; pud_t *start; pgprotval_t prot; pud_t *prev_pud = NULL; - start = (pud_t *) pgd_page_vaddr(addr); + start = (pud_t *)p4d_page_vaddr(addr); for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); @@ -377,9 +375,42 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, } #else -#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p) -#define pgd_large(a) pud_large(__pud(pgd_val(a))) -#define pgd_none(a) pud_none(__pud(pgd_val(a))) +#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(p4d_val(a)),p) +#define p4d_large(a) pud_large(__pud(p4d_val(a))) +#define p4d_none(a) pud_none(__pud(p4d_val(a))) +#endif + +#if PTRS_PER_P4D > 1 + +static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr, unsigned long P) +{ + int i; + p4d_t *start; + pgprotval_t prot; + + start = (p4d_t *)pgd_page_vaddr(addr); + + for (i = 0; i < PTRS_PER_P4D; i++) { + st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT); + if (!p4d_none(*start)) { + if (p4d_large(*start) || !p4d_present(*start)) { + prot = p4d_flags(*start); + note_page(m, st, __pgprot(prot), 2); + } else { + walk_pud_level(m, st, *start, + P + i * P4D_LEVEL_MULT); + } + } else + note_page(m, st, __pgprot(0), 2); + + start++; + } +} + +#else +#define walk_p4d_level(m,s,a,p) walk_pud_level(m,s,__p4d(pgd_val(a)),p) +#define pgd_large(a) p4d_large(__p4d(pgd_val(a))) +#define pgd_none(a) p4d_none(__p4d(pgd_val(a))) #endif static inline bool is_hypervisor_range(int idx) @@ -424,7 +455,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); } else { - walk_pud_level(m, &st, *start, + walk_p4d_level(m, &st, *start, i * PGD_LEVEL_MULT); } } else -- cgit v1.2.3 From 4af171105144a6475704c1e6024132883d50499e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 29 Mar 2017 16:47:35 -0700 Subject: x86/boot/32: Rewrite test_wp_bit() This code seems to be very old and has gotten only minor updates. 
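The probe-and-recover idea behind probe_kernel_write() has a rough userspace analogue: map a page, make it read-only with mprotect(), attempt the write, and catch the resulting SIGSEGV with sigsetjmp()/siglongjmp(). The sketch below is only an illustration under those assumptions (function names invented, error handling trimmed), not how the kernel does it:

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

static sigjmp_buf probe_env;

static void segv_handler(int sig)
{
	(void)sig;
	siglongjmp(probe_env, 1);
}

/* Returns 0 if the write went through, -1 if it faulted. */
static int probe_write(volatile char *dst, char val)
{
	struct sigaction sa, old;
	int ret = -1;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = segv_handler;
	sigemptyset(&sa.sa_mask);
	sigaction(SIGSEGV, &sa, &old);

	if (sigsetjmp(probe_env, 1) == 0) {
		*dst = val;		/* faults if the page is read-only */
		ret = 0;
	}

	sigaction(SIGSEGV, &old, NULL);
	return ret;
}

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	char *page = mmap(NULL, pagesz, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (page == MAP_FAILED)
		return 1;

	memset(page, 0, pagesz);
	mprotect(page, pagesz, PROT_READ);	/* now read-only */

	printf("write to read-only page: %s\n",
	       probe_write(page, 1) ? "faulted" : "went through");

	munmap(page, pagesz);
	return 0;
}

The old do_test_wp_bit() below achieves the same recovery with an inline-asm exception table entry rather than a signal handler, at the price of considerable boilerplate.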
It's overcomplicated and has a bunch of comments that are, at best, of purely historical interest. Nowadays we have a shiny function probe_kernel_write() that does more or less exactly what we need. Use it. I switched the page that we test from swapper_pg_dir to empty_zero_page because writing zero to empty_zero_page is more obviously safe than writing to the paging structures. (It's extremely unlikely that any of this would cause problems in practice because the write will fail on any supported CPU.) Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Garnier Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0b9e64ab0236de30e7572213cea77bf95ae2e990.1490831211.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 41 +++++++---------------------------------- 1 file changed, 7 insertions(+), 34 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7116a727fd5a..097089a5e4d5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -56,8 +56,6 @@ unsigned long highstart_pfn, highend_pfn; -static noinline int do_test_wp_bit(void); - bool __read_mostly __vmalloc_start_set = false; /* @@ -726,22 +724,21 @@ void __init paging_init(void) */ static void __init test_wp_bit(void) { - int wp_works_ok; + char z = 0; printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode..."); - /* Any page-aligned address will do, the test is non-destructive */ - __set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO); - wp_works_ok = do_test_wp_bit(); - clear_fixmap(FIX_WP_TEST); + __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); - if (!wp_works_ok) { + if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1) == 0) { printk(KERN_CONT "No.\n"); panic("Linux doesn't support CPUs with broken WP."); - } else { - printk(KERN_CONT "Ok.\n"); } + + clear_fixmap(FIX_WP_TEST); + + printk(KERN_CONT "Ok.\n"); } void __init mem_init(void) @@ -851,30 +848,6 @@ int arch_remove_memory(u64 start, u64 size) #endif #endif -/* - * This function cannot be __init, since exceptions don't work in that - * section. Put this after the callers, so that it cannot be inlined. - */ -static noinline int do_test_wp_bit(void) -{ - char tmp_reg; - int flag; - - __asm__ __volatile__( - " movb %0, %1 \n" - "1: movb %1, %0 \n" - " xorl %2, %2 \n" - "2: \n" - _ASM_EXTABLE(1b,2b) - :"=m" (*(char *)fix_to_virt(FIX_WP_TEST)), - "=q" (tmp_reg), - "=r" (flag) - :"2" (1) - :"memory"); - - return flag; -} - int kernel_set_to_readonly __read_mostly; void set_kernel_text_rw(void) -- cgit v1.2.3 From 952a6c2c094f4eda295f20c42e6e2d73735950fa Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 30 Mar 2017 09:44:05 +0200 Subject: x86/boot/32: Flip the logic in test_wp_bit() ... to have a natural "likely()" in the code flow and thus have the success case with a branch 99.999% of the times non-taken and function return code following it instead of jumping to it each time. This puts the panic() call at the end of the function - it is going to be practically unreachable anyway. The C code is a bit more readable too. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: thgarnie@google.com Link: http://lkml.kernel.org/r/20170330080101.ywsf5rg6ilzu4itk@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 097089a5e4d5..601b8e04e5c6 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -726,19 +726,18 @@ static void __init test_wp_bit(void) { char z = 0; - printk(KERN_INFO - "Checking if this processor honours the WP bit even in supervisor mode..."); + printk(KERN_INFO "Checking if this processor honours the WP bit even in supervisor mode..."); __set_fixmap(FIX_WP_TEST, __pa_symbol(empty_zero_page), PAGE_KERNEL_RO); - if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1) == 0) { - printk(KERN_CONT "No.\n"); - panic("Linux doesn't support CPUs with broken WP."); + if (probe_kernel_write((char *)fix_to_virt(FIX_WP_TEST), &z, 1)) { + clear_fixmap(FIX_WP_TEST); + printk(KERN_CONT "Ok.\n"); + return; } - clear_fixmap(FIX_WP_TEST); - - printk(KERN_CONT "Ok.\n"); + printk(KERN_CONT "No.\n"); + panic("Linux doesn't support CPUs with broken WP."); } void __init mem_init(void) -- cgit v1.2.3 From ada26481dfe698ac64b4aaf19a726e66eb8508c6 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Fri, 31 Mar 2017 14:11:37 +0300 Subject: x86/mm: Make in_compat_syscall() work during exec The x86 mmap() code selects the mmap base for an allocation depending on the bitness of the syscall. For 64bit sycalls it select mm->mmap_base and for 32bit mm->mmap_compat_base. On execve the registers of the task invoking exec() are copied to the child pt_regs. So child->pt_regs->orig_ax contains the execve syscall number of the parent. exec() calls mmap() which in turn uses in_compat_syscall() to check whether the mapping is for a 32bit or a 64bit task. The decision is made on the following criteria: ia32 child->thread.status & TS_COMPAT x32 child->pt_regs.orig_ax & __X32_SYSCALL_BIT ia64 !ia32 && !x32 child->thread.status is corretly set up in set_personality_*(), but the syscall number in child->pt_regs.orig_ax is left unmodified. Therefore the parent/child combinations work or fail in the following way: Parent Child Child->thread_status child->pt_regs.orig_ax in_compat() Works ia64 ia64 TS_COMPAT == 0 __X32_SYSCALL_BIT == 0 false Y ia64 ia32 TS_COMPAT == 1 __X32_SYSCALL_BIT == 0 true Y ia64 x32 TS_COMPAT == 0 __X32_SYSCALL_BIT == 0 false N ia32 ia64 TS_COMPAT == 0 __X32_SYSCALL_BIT == 0 false Y ia32 ia32 TS_COMPAT == 1 __X32_SYSCALL_BIT == 0 true Y ia32 x32 TS_COMPAT == 0 __X32_SYSCALL_BIT == 0 false N x32 ia64 TS_COMPAT == 0 __X32_SYSCALL_BIT == 1 true N x32 ia32 TS_COMPAT == 1 __X32_SYSCALL_BIT == 1 true Y x32 x32 TS_COMPAT == 0 __X32_SYSCALL_BIT == 1 true Y Make set_personality_*() store the syscall number incl. __X32_SYSCALL_BIT which corresponds to the newly started ELF executable in the childs pt_regs, i.e. pretend that the exec was invoked from a task with the same executable format. So both thread.status and pt_regs.orig_ax correspond to the new ELF format and in_compat_syscall() returns the correct result. [ tglx: Rewrote changelog ] Fixes: commit 1b028f784e8c ("x86/mm: Introduce mmap_compat_base() for 32-bit mmap()") Reported-by: Adam Borowski Suggested-by: H. 
Peter Anvin Suggested-by: Thomas Gleixner Signed-off-by: Dmitry Safonov Cc: 0x7f454c46@gmail.com Cc: linux-mm@kvack.org Cc: Andrei Vagin Cc: Andy Lutomirski Cc: Cyrill Gorcunov Cc: Borislav Petkov Cc: "Kirill A. Shutemov" Link: http://lkml.kernel.org/r/20170331111137.28170-1-dsafonov@virtuozzo.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 67 ++++++++++++++++++++++++++++++-------------- 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ea1a6180bf39..825a1e47cf3e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -53,6 +53,11 @@ #include #include #include +#include +#ifdef CONFIG_IA32_EMULATION +/* Not included via unistd.h */ +#include +#endif __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); @@ -494,6 +499,8 @@ void set_personality_64bit(void) clear_thread_flag(TIF_IA32); clear_thread_flag(TIF_ADDR32); clear_thread_flag(TIF_X32); + /* Pretend that this comes from a 64bit execve */ + task_pt_regs(current)->orig_ax = __NR_execve; /* Ensure the corresponding mm is not marked. */ if (current->mm) @@ -506,32 +513,50 @@ void set_personality_64bit(void) current->personality &= ~READ_IMPLIES_EXEC; } -void set_personality_ia32(bool x32) +static void __set_personality_x32(void) { - /* inherit personality from parent */ +#ifdef CONFIG_X86_X32 + clear_thread_flag(TIF_IA32); + set_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_X32; + current->personality &= ~READ_IMPLIES_EXEC; + /* + * in_compat_syscall() uses the presence of the x32 syscall bit + * flag to determine compat status. The x86 mmap() code relies on + * the syscall bitness so set x32 syscall bit right here to make + * in_compat_syscall() work during exec(). + * + * Pretend to come from a x32 execve. + */ + task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT; + current->thread.status &= ~TS_COMPAT; +#endif +} +static void __set_personality_ia32(void) +{ +#ifdef CONFIG_IA32_EMULATION + set_thread_flag(TIF_IA32); + clear_thread_flag(TIF_X32); + if (current->mm) + current->mm->context.ia32_compat = TIF_IA32; + current->personality |= force_personality32; + /* Prepare the first "return" to user space */ + task_pt_regs(current)->orig_ax = __NR_ia32_execve; + current->thread.status |= TS_COMPAT; +#endif +} + +void set_personality_ia32(bool x32) +{ /* Make sure to be in 32bit mode */ set_thread_flag(TIF_ADDR32); - /* Mark the associated mm as containing 32-bit tasks. 
*/ - if (x32) { - clear_thread_flag(TIF_IA32); - set_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_X32; - current->personality &= ~READ_IMPLIES_EXEC; - /* in_compat_syscall() uses the presence of the x32 - syscall bit flag to determine compat status */ - current->thread.status &= ~TS_COMPAT; - } else { - set_thread_flag(TIF_IA32); - clear_thread_flag(TIF_X32); - if (current->mm) - current->mm->context.ia32_compat = TIF_IA32; - current->personality |= force_personality32; - /* Prepare the first "return" to user space */ - current->thread.status |= TS_COMPAT; - } + if (x32) + __set_personality_x32(); + else + __set_personality_ia32(); } EXPORT_SYMBOL_GPL(set_personality_ia32); -- cgit v1.2.3 From 43dac8f6a74c9811454f4efbe52b48f7a802c277 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 14 Mar 2017 11:08:00 +0800 Subject: x86/mm/numa: Improve alloc_node_data() error path message alloc_node_data() tries to allocate from the local node first and, if that attempt fails, falls back to any node. Improve the error message to issue the initial node for ease during debugging. Fix a typo in the comments, while at it. Signed-off-by: Wei Yang Link: http://lkml.kernel.org/r/20170314030801.13656-1-richard.weiyang@gmail.com [ Masssage commit message. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 12dcad7297a5..93671d8b3b0d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -201,7 +201,7 @@ static void __init alloc_node_data(int nid) nd_pa = __memblock_alloc_base(nd_size, SMP_CACHE_BYTES, MEMBLOCK_ALLOC_ACCESSIBLE); if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", + pr_err("Cannot find %zu bytes in any node (initial node: %d)\n", nd_size, nid); return; } @@ -225,7 +225,7 @@ static void __init alloc_node_data(int nid) * numa_cleanup_meminfo - Cleanup a numa_meminfo * @mi: numa_meminfo to clean up * - * Sanitize @mi by merging and removing unncessary memblks. Also check for + * Sanitize @mi by merging and removing unnecessary memblks. Also check for * conflicts and clear unused memblks. * * RETURNS: -- cgit v1.2.3 From 474aeffd88b87746a75583f356183d5c6caa4213 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 14 Mar 2017 11:08:01 +0800 Subject: x86/mm/numa: Remove numa_nodemask_from_meminfo() numa_nodemask_from_meminfo() generates a nodemask of nodes which have memory according to a meminfo descriptor. The two callsites of that function both set bits in copies of the numa_nodes_parsed nodemask. In both cases, the information in supplied numa_meminfo is a subset of numa_nodes_parsed. So setting those bits again is not really necessary. Here are the three call paths which show that the supplied numa_meminfo argument describes memory regions in nodes which are already in numa_nodes_parsed: x86_numa_init() numa_init() Case 1: acpi_numa_init() acpi_parse_memory_affinity() numa_add_memblk() node_set(numa_nodes_parsed) acpi_parse_slit() acpi_numa_slit_init() numa_set_distance() numa_alloc_distance() numa_nodemask_from_meminfo() Case 2: amd_numa_init() numa_add_memblk() node_set(numa_nodes_parsed) Case 3 dummy_numa_init() node_set(numa_nodes_parsed) numa_add_memblk() numa_register_memblks() numa_nodemask_from_meminfo() Thus, in all three cases, the respective bit in numa_nodes_parsed is set, which means it is not necessary to set it again in a copy of numa_nodes_parsed. So remove that function. 
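The argument is plain set algebra: OR-ing a subset back into the set it came from cannot change anything. A throwaway userspace model with a 64-bit bitmap standing in for a nodemask (the node numbers below are made up) makes that concrete:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t numa_nodes_parsed = 0x0b;	/* nodes 0, 1 and 3 already parsed */
	uint64_t meminfo_nodes     = 0x09;	/* meminfo covers nodes 0 and 3 -- a subset */

	uint64_t with_helper    = numa_nodes_parsed | meminfo_nodes;
	uint64_t without_helper = numa_nodes_parsed;

	printf("with helper:    %#llx\n", (unsigned long long)with_helper);
	printf("without helper: %#llx\n", (unsigned long long)without_helper);
	return 0;
}

Both masks come out as 0xb, so dropping the helper does not change the result of either caller.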
Signed-off-by: Wei Yang Cc: x86-ml Link: http://lkml.kernel.org/r/20170314030801.13656-2-richard.weiyang@gmail.com [ Heavily massage commit message. ] Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner --- arch/x86/mm/numa.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 93671d8b3b0d..175f54ac6772 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -314,20 +314,6 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) return 0; } -/* - * Set nodes, which have memory in @mi, in *@nodemask. - */ -static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, - const struct numa_meminfo *mi) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(mi->blk); i++) - if (mi->blk[i].start != mi->blk[i].end && - mi->blk[i].nid != NUMA_NO_NODE) - node_set(mi->blk[i].nid, *nodemask); -} - /** * numa_reset_distance - Reset NUMA distance table * @@ -347,16 +333,12 @@ void __init numa_reset_distance(void) static int __init numa_alloc_distance(void) { - nodemask_t nodes_parsed; size_t size; int i, j, cnt = 0; u64 phys; /* size the new table and allocate it */ - nodes_parsed = numa_nodes_parsed; - numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); - - for_each_node_mask(i, nodes_parsed) + for_each_node_mask(i, numa_nodes_parsed) cnt = i; cnt++; size = cnt * cnt * sizeof(numa_distance[0]); @@ -535,7 +517,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) /* Account for nodes with cpus and no memory */ node_possible_map = numa_nodes_parsed; - numa_nodemask_from_meminfo(&node_possible_map, mi); if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; -- cgit v1.2.3 From 3677d4c6a2010e4f5a0ca8b617b595fe4cc7ba6b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Mar 2017 11:07:25 +0300 Subject: x86/boot: Detect 5-level paging support In this initial implementation we force-require 5-level paging support from the hardware, when compiled with CONFIG_X86_5LEVEL=y. (The kernel will panic during boot on CPUs that don't support 5-level paging.) We will implement boot-time switch between 4- and 5-level paging later. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-2-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/boot/cpucheck.c | 9 +++++++++ arch/x86/boot/cpuflags.c | 12 ++++++++++-- arch/x86/include/asm/disabled-features.h | 8 +++++++- arch/x86/include/asm/required-features.h | 8 +++++++- 4 files changed, 33 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 4ad7d70e8739..8f0c4c9fc904 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -44,6 +44,15 @@ static const u32 req_flags[NCAPINTS] = 0, /* REQUIRED_MASK5 not implemented in this file */ REQUIRED_MASK6, 0, /* REQUIRED_MASK7 not implemented in this file */ + 0, /* REQUIRED_MASK8 not implemented in this file */ + 0, /* REQUIRED_MASK9 not implemented in this file */ + 0, /* REQUIRED_MASK10 not implemented in this file */ + 0, /* REQUIRED_MASK11 not implemented in this file */ + 0, /* REQUIRED_MASK12 not implemented in this file */ + 0, /* REQUIRED_MASK13 not implemented in this file */ + 0, /* REQUIRED_MASK14 not implemented in this file */ + 0, /* REQUIRED_MASK15 not implemented in this file */ + REQUIRED_MASK16, }; #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 6687ab953257..9e77c23c2422 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -70,16 +70,19 @@ int has_eflag(unsigned long mask) # define EBX_REG "=b" #endif -static inline void cpuid(u32 id, u32 *a, u32 *b, u32 *c, u32 *d) +static inline void cpuid_count(u32 id, u32 count, + u32 *a, u32 *b, u32 *c, u32 *d) { asm volatile(".ifnc %%ebx,%3 ; movl %%ebx,%3 ; .endif \n\t" "cpuid \n\t" ".ifnc %%ebx,%3 ; xchgl %%ebx,%3 ; .endif \n\t" : "=a" (*a), "=c" (*c), "=d" (*d), EBX_REG (*b) - : "a" (id) + : "a" (id), "c" (count) ); } +#define cpuid(id, a, b, c, d) cpuid_count(id, 0, a, b, c, d) + void get_cpuflags(void) { u32 max_intel_level, max_amd_level; @@ -108,6 +111,11 @@ void get_cpuflags(void) cpu.model += ((tfms >> 16) & 0xf) << 4; } + if (max_intel_level >= 0x00000007) { + cpuid_count(0x00000007, 0, &ignored, &ignored, + &cpu.flags[16], &ignored); + } + cpuid(0x80000000, &max_amd_level, &ignored, &ignored, &ignored); diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 85599ad4d024..5dff775af7cd 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -36,6 +36,12 @@ # define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ +#ifdef CONFIG_X86_5LEVEL +# define DISABLE_LA57 0 +#else +# define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -55,7 +61,7 @@ #define DISABLED_MASK13 0 #define DISABLED_MASK14 0 #define DISABLED_MASK15 0 -#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE) +#define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57) #define DISABLED_MASK17 0 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index fac9a5c0abe9..d91ba04dd007 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -53,6 +53,12 @@ # define NEED_MOVBE 0 #endif +#ifdef CONFIG_X86_5LEVEL +# define NEED_LA57 
(1<<(X86_FEATURE_LA57 & 31)) +#else +# define NEED_LA57 0 +#endif + #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT /* Paravirtualized systems may not have PSE or PGE available */ @@ -98,7 +104,7 @@ #define REQUIRED_MASK13 0 #define REQUIRED_MASK14 0 #define REQUIRED_MASK15 0 -#define REQUIRED_MASK16 0 +#define REQUIRED_MASK16 (NEED_LA57) #define REQUIRED_MASK17 0 #define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18) -- cgit v1.2.3 From 361b4b58ec4cf123e12a773909c6454dbd5e6dbc Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Mar 2017 11:07:26 +0300 Subject: x86/asm: Remove __VIRTUAL_MASK_SHIFT==47 assert We don't need the assert anymore, as: 17be0aec74fb ("x86/asm/entry/64: Implement better check for canonical addresses") made canonical address checks generic wrt. address width. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-3-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 044d18ebc43c..f07b4efb34d5 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -265,12 +265,9 @@ return_from_SYSCALL_64: * * If width of "canonical tail" ever becomes variable, this will need * to be updated to remain correct on both old and new CPUs. + * + * Change top 16 bits to be the sign-extension of 47th bit */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- SYSRET checks need update" - .endif - - /* Change top 16 bits to be the sign-extension of 47th bit */ shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx -- cgit v1.2.3 From 4c7c44837be77e2689c577abef155c4b5d873c82 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Mar 2017 11:07:27 +0300 Subject: x86/mm: Define virtual memory map for 5-level paging The first part of memory map (up to %esp fixup) simply scales existing map for 4-level paging by factor of 9 -- number of bits addressed by the additional page table level. The rest of the map is unchanged. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-4-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/mm.txt | 33 ++++++++++++++++++++++++++++++--- arch/x86/Kconfig | 1 + arch/x86/include/asm/kasan.h | 9 ++++++--- arch/x86/include/asm/page_64_types.h | 10 ++++++++++ arch/x86/include/asm/pgtable_64_types.h | 6 ++++++ arch/x86/include/asm/sparsemem.h | 9 +++++++-- 6 files changed, 60 insertions(+), 8 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index ee3f9c30957c..b0798e281aa6 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -4,7 +4,7 @@ Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm -hole caused by [48:63] sign extension +hole caused by [47:63] sign extension ffff800000000000 - ffff87ffffffffff (=43 bits) guard hole, reserved for hypervisor ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole @@ -23,12 +23,39 @@ ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole +Virtual memory map with 5 level page tables: + +0000000000000000 - 00ffffffffffffff (=56 bits) user space, different per mm +hole caused by [56:63] sign extension +ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor +ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory +ff90000000000000 - ff91ffffffffffff (=49 bits) hole +ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space +ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole +ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) +... unused hole ... +ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB) +... unused hole ... +ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks +... unused hole ... +ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space +... unused hole ... +ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 +ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space +ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls +ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole + +Architecture defines a 64-bit virtual address. Implementations can support +less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 +through to the most-significant implemented bit are set to either all ones +or all zero. This causes hole between user space and kernel addresses. + The direct mapping covers all memory in the system up to the highest memory address (this means in some cases it can also include PCI memory holes). -vmalloc space is lazily synchronized into the different PML4 pages of -the processes using the page fault handler, with init_level4_pgt as +vmalloc space is lazily synchronized into the different PML4/PML5 pages of +the processes using the page fault handler, with init_top_pgt as reference. 
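The widths in the two maps follow directly from the paging arithmetic: every table level decodes 9 bits of the virtual address (512 entries of 8 bytes in a 4 KiB table) on top of the 12-bit page offset, so four levels give 48-bit and five levels give 57-bit virtual addresses, split evenly between the user and kernel halves by sign extension. A small standalone C program (not kernel code) reproduces the numbers:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT	12	/* 4 KiB pages */
#define LEVEL_BITS	9	/* 512 eight-byte entries per 4 KiB table */

static void show(const char *name, int levels)
{
	int va_bits = PAGE_SHIFT + levels * LEVEL_BITS;	/* 48 or 57 */
	uint64_t user_span = 1ULL << (va_bits - 1);	/* low canonical half */

	printf("%s paging: %d-bit virtual addresses, user space = 2^%d bytes (%llu TiB)\n",
	       name, va_bits, va_bits - 1,
	       (unsigned long long)(user_span >> 40));

	for (int level = 1; level <= levels; level++)
		printf("  one level-%d entry maps 2^%d bytes\n",
		       level, PAGE_SHIFT + (level - 1) * LEVEL_BITS);
}

int main(void)
{
	show("4-level", 4);	/* PTE, PMD, PUD, PGD      */
	show("5-level", 5);	/* PTE, PMD, PUD, P4D, PGD */
	return 0;
}

That yields a 128 TiB user half with 4-level paging and a 64 PiB (65536 TiB) user half with 5-level paging, matching the 47-bit and 56-bit user-space ranges listed above.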
Current X86-64 implementations support up to 46 bits of address space (64 TB), diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8977d9c77373..a641b900fc1f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -291,6 +291,7 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KASAN_SHADOW_OFFSET hex depends on KASAN + default 0xdff8000000000000 if X86_5LEVEL default 0xdffffc0000000000 config HAVE_INTEL_TXT diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 1410b567ecde..f527b02a0ee3 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -11,9 +11,12 @@ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT */ #define KASAN_SHADOW_START (KASAN_SHADOW_OFFSET + \ - (0xffff800000000000ULL >> 3)) -/* 47 bits for kernel address -> (47 - 3) bits for shadow */ -#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (47 - 3))) + ((-1UL << __VIRTUAL_MASK_SHIFT) >> 3)) +/* + * 47 bits for kernel address -> (47 - 3) bits for shadow + * 56 bits for kernel address -> (56 - 3) bits for shadow + */ +#define KASAN_SHADOW_END (KASAN_SHADOW_START + (1ULL << (__VIRTUAL_MASK_SHIFT - 3))) #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 9215e0527647..3f5f08b010d0 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -36,7 +36,12 @@ * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's * what Xen requires. */ +#ifdef CONFIG_X86_5LEVEL +#define __PAGE_OFFSET_BASE _AC(0xff10000000000000, UL) +#else #define __PAGE_OFFSET_BASE _AC(0xffff880000000000, UL) +#endif + #ifdef CONFIG_RANDOMIZE_MEMORY #define __PAGE_OFFSET page_offset_base #else @@ -46,8 +51,13 @@ #define __START_KERNEL_map _AC(0xffffffff80000000, UL) /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ +#ifdef CONFIG_X86_5LEVEL +#define __PHYSICAL_MASK_SHIFT 52 +#define __VIRTUAL_MASK_SHIFT 56 +#else #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 47 +#endif /* * Kernel image size is limited to 1GiB due to the fixmap living in the diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 516593e66bd6..4edc97917382 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -56,9 +56,15 @@ typedef struct { pteval_t pte; } pte_t; /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. 
*/ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +#ifdef CONFIG_X86_5LEVEL +#define VMALLOC_SIZE_TB _AC(16384, UL) +#define __VMALLOC_BASE _AC(0xff92000000000000, UL) +#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +#else #define VMALLOC_SIZE_TB _AC(32, UL) #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) #define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +#endif #ifdef CONFIG_RANDOMIZE_MEMORY #define VMALLOC_START vmalloc_base #define VMEMMAP_START vmemmap_base diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 4517d6b93188..1f5bee2c202f 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -26,8 +26,13 @@ # endif #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ -# define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 46 +# ifdef CONFIG_X86_5LEVEL +# define MAX_PHYSADDR_BITS 52 +# define MAX_PHYSMEM_BITS 52 +# else +# define MAX_PHYSADDR_BITS 44 +# define MAX_PHYSMEM_BITS 46 +# endif #endif #endif /* CONFIG_SPARSEMEM */ -- cgit v1.2.3 From 335437fbf7d3a630d775f06c9ee37d60919561d8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Mar 2017 11:07:28 +0300 Subject: x86/paravirt: Add 5-level support to the paravirt code Add operations to allocate/release p4ds. Xen requires more work. We will need to come back to it. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-5-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/paravirt.h | 37 ++++++++++++++++++++++++----------- arch/x86/include/asm/paravirt_types.h | 7 ++++++- arch/x86/include/asm/pgalloc.h | 2 ++ arch/x86/kernel/paravirt.c | 9 +++++++-- 4 files changed, 41 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 158d877ce9e9..55fa56fe4e45 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -357,6 +357,16 @@ static inline void paravirt_release_pud(unsigned long pfn) PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); } +static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) +{ + PVOP_VCALL2(pv_mmu_ops.alloc_p4d, mm, pfn); +} + +static inline void paravirt_release_p4d(unsigned long pfn) +{ + PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn); +} + static inline void pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -582,25 +592,25 @@ static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) val); } -static inline void p4d_clear(p4d_t *p4dp) +#if CONFIG_PGTABLE_LEVELS >= 5 + +static inline p4d_t __p4d(p4dval_t val) { - set_p4d(p4dp, __p4d(0)); -} + p4dval_t ret = PVOP_CALLEE1(p4dval_t, pv_mmu_ops.make_p4d, val); -#if CONFIG_PGTABLE_LEVELS >= 5 + return (p4d_t) { ret }; +} -#error FIXME +static inline p4dval_t p4d_val(p4d_t p4d) +{ + return PVOP_CALLEE1(p4dval_t, pv_mmu_ops.p4d_val, p4d.p4d); +} static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { pgdval_t val = native_pgd_val(pgd); - if (sizeof(pgdval_t) > sizeof(long)) - PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp, - val, (u64)val >> 32); - else - PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, - val); + PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp, val); } static inline void pgd_clear(pgd_t *pgdp) @@ -610,6 
+620,11 @@ static inline void pgd_clear(pgd_t *pgdp) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ +static inline void p4d_clear(p4d_t *p4dp) +{ + set_p4d(p4dp, __p4d(0)); +} + #endif /* CONFIG_PGTABLE_LEVELS == 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 93c49cf09b63..7465d6fe336f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -238,9 +238,11 @@ struct pv_mmu_ops { void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); + void (*alloc_p4d)(struct mm_struct *mm, unsigned long pfn); void (*release_pte)(unsigned long pfn); void (*release_pmd)(unsigned long pfn); void (*release_pud)(unsigned long pfn); + void (*release_p4d)(unsigned long pfn); /* Pagetable manipulation functions */ void (*set_pte)(pte_t *ptep, pte_t pteval); @@ -286,7 +288,10 @@ struct pv_mmu_ops { void (*set_p4d)(p4d_t *p4dp, p4d_t p4dval); #if CONFIG_PGTABLE_LEVELS >= 5 -#error FIXME + struct paravirt_callee_save p4d_val; + struct paravirt_callee_save make_p4d; + + void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval); #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ #endif /* CONFIG_PGTABLE_LEVELS >= 4 */ diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index 2f585054c63c..b2d0cd8288aa 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -17,9 +17,11 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) {} static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} +static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_release_pte(unsigned long pfn) {} static inline void paravirt_release_pmd(unsigned long pfn) {} static inline void paravirt_release_pud(unsigned long pfn) {} +static inline void paravirt_release_p4d(unsigned long pfn) {} #endif /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 110daf22f5c7..3586996fc50d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -405,9 +405,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .alloc_pte = paravirt_nop, .alloc_pmd = paravirt_nop, .alloc_pud = paravirt_nop, + .alloc_p4d = paravirt_nop, .release_pte = paravirt_nop, .release_pmd = paravirt_nop, .release_pud = paravirt_nop, + .release_p4d = paravirt_nop, .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, @@ -437,8 +439,11 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .set_p4d = native_set_p4d, #if CONFIG_PGTABLE_LEVELS >= 5 -#error FIXME -#endif /* CONFIG_PGTABLE_LEVELS >= 4 */ + .p4d_val = PTE_IDENT, + .make_p4d = PTE_IDENT, + + .set_pgd = native_set_pgd, +#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ #endif /* CONFIG_PGTABLE_LEVELS >= 4 */ #endif /* CONFIG_PGTABLE_LEVELS >= 3 */ -- cgit v1.2.3 From b8504058a06bd19286c8b59539eebfda69d1ecb5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Mar 2017 11:07:29 +0300 Subject: x86/mm: Add basic defines/helpers for CONFIG_X86_5LEVEL=y Extends pagetable headers to support the new paging mode. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-6-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64.h | 11 +++++++++++ arch/x86/include/asm/pgtable_64_types.h | 20 ++++++++++++++++++++ arch/x86/include/asm/pgtable_types.h | 10 +++++++++- arch/x86/mm/pgtable.c | 32 +++++++++++++++++++++++++++++++- 4 files changed, 71 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 0593a1ae7573..12ea31274eb6 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -35,6 +35,13 @@ extern void paging_init(void); #define pud_ERROR(e) \ pr_err("%s:%d: bad pud %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pud_val(e)) + +#if CONFIG_PGTABLE_LEVELS >= 5 +#define p4d_ERROR(e) \ + pr_err("%s:%d: bad p4d %p(%016lx)\n", \ + __FILE__, __LINE__, &(e), p4d_val(e)) +#endif + #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %p(%016lx)\n", \ __FILE__, __LINE__, &(e), pgd_val(e)) @@ -128,7 +135,11 @@ static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) static inline void native_p4d_clear(p4d_t *p4d) { +#ifdef CONFIG_X86_5LEVEL + native_set_p4d(p4d, native_make_p4d(0)); +#else native_set_p4d(p4d, (p4d_t) { .pgd = native_make_pgd(0)}); +#endif } static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 4edc97917382..adc3e7b107ee 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -23,12 +23,32 @@ typedef struct { pteval_t pte; } pte_t; #define SHARED_KERNEL_PMD 0 +#ifdef CONFIG_X86_5LEVEL + +/* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +#define PGDIR_SHIFT 48 +#define PTRS_PER_PGD 512 + +/* + * 4th level page in 5-level paging case + */ +#define P4D_SHIFT 39 +#define PTRS_PER_P4D 512 +#define P4D_SIZE (_AC(1, UL) << P4D_SHIFT) +#define P4D_MASK (~(P4D_SIZE - 1)) + +#else /* CONFIG_X86_5LEVEL */ + /* * PGDIR_SHIFT determines what a top-level page table entry can map */ #define PGDIR_SHIFT 39 #define PTRS_PER_PGD 512 +#endif /* CONFIG_X86_5LEVEL */ + /* * 3rd level page */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 4930afe9df0a..bf9638e1ee42 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -273,9 +273,17 @@ static inline pgdval_t pgd_flags(pgd_t pgd) } #if CONFIG_PGTABLE_LEVELS > 4 +typedef struct { p4dval_t p4d; } p4d_t; -#error FIXME +static inline p4d_t native_make_p4d(pudval_t val) +{ + return (p4d_t) { val }; +} +static inline p4dval_t native_p4d_val(p4d_t p4d) +{ + return p4d.p4d; +} #else #include diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 38b6daf72deb..508a708eb9a6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -81,6 +81,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); } + +#if CONFIG_PGTABLE_LEVELS > 4 +void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) +{ + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); + tlb_remove_page(tlb, virt_to_page(p4d)); +} +#endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ @@ -120,7 +128,7 @@ static void 
pgd_ctor(struct mm_struct *mm, pgd_t *pgd) references from swapper_pg_dir. */ if (CONFIG_PGTABLE_LEVELS == 2 || (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) || - CONFIG_PGTABLE_LEVELS == 4) { + CONFIG_PGTABLE_LEVELS >= 4) { clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, swapper_pg_dir + KERNEL_PGD_BOUNDARY, KERNEL_PGD_PTRS); @@ -582,6 +590,28 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys, } #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP +#ifdef CONFIG_X86_5LEVEL +/** + * p4d_set_huge - setup kernel P4D mapping + * + * No 512GB pages yet -- always return 0 + */ +int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot) +{ + return 0; +} + +/** + * p4d_clear_huge - clear kernel P4D mapping when it is set + * + * No 512GB pages yet -- always return 0 + */ +int p4d_clear_huge(p4d_t *p4d) +{ + return 0; +} +#endif + /** * pud_set_huge - setup kernel PUD mapping * -- cgit v1.2.3 From 5480bb61cfba5de23df59793527c3a37c0d2e247 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Mar 2017 11:07:30 +0300 Subject: x86/kasan: Extend KASAN to support 5-level paging This patch bring support for a non-folded additional page table level. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-7-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/kasan_init_64.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 3d1059db6bf6..47efdcfe7113 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -55,8 +55,18 @@ static void __init kasan_map_early_shadow(pgd_t *pgd) unsigned long end = KASAN_SHADOW_END; for (i = pgd_index(start); start < end; i++) { - pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) - | _KERNPG_TABLE); + switch (CONFIG_PGTABLE_LEVELS) { + case 4: + pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) | + _KERNPG_TABLE); + break; + case 5: + pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) | + _KERNPG_TABLE); + break; + default: + BUILD_BUG(); + } start += PGDIR_SIZE; } } @@ -84,6 +94,7 @@ void __init kasan_early_init(void) pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; + p4dval_t p4d_val = __pa_nodebug(kasan_zero_pud) | _KERNPG_TABLE; for (i = 0; i < PTRS_PER_PTE; i++) kasan_zero_pte[i] = __pte(pte_val); @@ -94,6 +105,9 @@ void __init kasan_early_init(void) for (i = 0; i < PTRS_PER_PUD; i++) kasan_zero_pud[i] = __pud(pud_val); + for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++) + kasan_zero_p4d[i] = __p4d(p4d_val); + kasan_map_early_shadow(early_level4_pgt); kasan_map_early_shadow(init_level4_pgt); } -- cgit v1.2.3 From 1d33b219563fb9f7384c8c609c767cb6bfd44b8e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 30 Mar 2017 11:07:31 +0300 Subject: x86/espfix: Add support for 5-level paging We don't need extra virtual address space for ESPFIX, so it stays within one PUD page table for both 4- and 5-level paging. 
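Stepping back to the KASAN hunks just above: KASAN maps every 8 bytes of address space to one shadow byte, so the shadow of an address is (addr >> 3) + KASAN_SHADOW_OFFSET, which is why the offset has to move when the kernel address range does. A stand-alone sketch of the arithmetic -- mem_to_shadow() here is a hypothetical helper parameterized for illustration, not the kernel's kasan_mem_to_shadow():

#include <stdint.h>
#include <stdio.h>

#define KASAN_SHADOW_SCALE_SHIFT 3

/* Shadow byte covering a given address, for a given shadow offset. */
static uint64_t mem_to_shadow(uint64_t addr, uint64_t shadow_offset)
{
	return (addr >> KASAN_SHADOW_SCALE_SHIFT) + shadow_offset;
}

int main(void)
{
	/*
	 * 5-level: the kernel half starts at 0xff00000000000000 and, with
	 * the new Kconfig offset, its shadow lands at 0xffd8000000000000,
	 * matching the "kasan shadow memory" line in the 5-level map above.
	 */
	printf("%#llx\n", (unsigned long long)
	       mem_to_shadow(0xff00000000000000ULL, 0xdff8000000000000ULL));
	return 0;
}

Plugging in the 4-level offset reproduces the existing 4-level shadow base the same way.
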
Redefining ESPFIX_BASE_ADDR using P4D_SHIFT instead of PGDIR_SHIFT would make it stay in the same place regarding of paging mode. Signed-off-by: Kirill A. Shutemov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20170330080731.65421-8-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_64_types.h | 2 +- arch/x86/kernel/espfix_64.c | 12 +++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index adc3e7b107ee..06470da156ba 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -98,7 +98,7 @@ typedef struct { pteval_t pte; } pte_t; #define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) #define MODULES_LEN (MODULES_END - MODULES_VADDR) #define ESPFIX_PGD_ENTRY _AC(-2, UL) -#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << PGDIR_SHIFT) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) #define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) #define EFI_VA_END (-68 * (_AC(1, UL) << 30)) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 04f89caef9c4..8e598a1ad986 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -50,11 +50,11 @@ #define ESPFIX_STACKS_PER_PAGE (PAGE_SIZE/ESPFIX_STACK_SIZE) /* There is address space for how many espfix pages? */ -#define ESPFIX_PAGE_SPACE (1UL << (PGDIR_SHIFT-PAGE_SHIFT-16)) +#define ESPFIX_PAGE_SPACE (1UL << (P4D_SHIFT-PAGE_SHIFT-16)) #define ESPFIX_MAX_CPUS (ESPFIX_STACKS_PER_PAGE * ESPFIX_PAGE_SPACE) #if CONFIG_NR_CPUS > ESPFIX_MAX_CPUS -# error "Need more than one PGD for the ESPFIX hack" +# error "Need more virtual address space for the ESPFIX hack" #endif #define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) @@ -121,11 +121,13 @@ static void init_espfix_random(void) void __init init_espfix_bsp(void) { - pgd_t *pgd_p; + pgd_t *pgd; + p4d_t *p4d; /* Install the espfix pud into the kernel page directory */ - pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; - pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); + pgd = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; + p4d = p4d_alloc(&init_mm, pgd, ESPFIX_BASE_ADDR); + p4d_populate(&init_mm, p4d, espfix_pud_page); /* Randomize the locations */ init_espfix_random(); -- cgit v1.2.3 From b678c91aefa7ce05a5d195e0a5c7a357b62d3283 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 8 Apr 2017 00:00:53 +0200 Subject: Revert "x86/mm/numa: Remove numa_nodemask_from_meminfo()" This reverts commit 474aeffd88b87746a75583f356183d5c6caa4213 due to testing failures. Reported-by: "Kirill A. Shutemov" Signed-off-by: Thomas Gleixner Cc: Wei Yang Cc: Borislav Petkov Link: https://lkml.kernel.org/r/20170406124459.dwn5zhpr2xqg3lqm@node.shutemov.name --- arch/x86/mm/numa.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 175f54ac6772..93671d8b3b0d 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -314,6 +314,20 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) return 0; } +/* + * Set nodes, which have memory in @mi, in *@nodemask. 
+ */ +static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask, + const struct numa_meminfo *mi) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mi->blk); i++) + if (mi->blk[i].start != mi->blk[i].end && + mi->blk[i].nid != NUMA_NO_NODE) + node_set(mi->blk[i].nid, *nodemask); +} + /** * numa_reset_distance - Reset NUMA distance table * @@ -333,12 +347,16 @@ void __init numa_reset_distance(void) static int __init numa_alloc_distance(void) { + nodemask_t nodes_parsed; size_t size; int i, j, cnt = 0; u64 phys; /* size the new table and allocate it */ - for_each_node_mask(i, numa_nodes_parsed) + nodes_parsed = numa_nodes_parsed; + numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo); + + for_each_node_mask(i, nodes_parsed) cnt = i; cnt++; size = cnt * cnt * sizeof(numa_distance[0]); @@ -517,6 +535,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) /* Account for nodes with cpus and no memory */ node_possible_map = numa_nodes_parsed; + numa_nodemask_from_meminfo(&node_possible_map, mi); if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; -- cgit v1.2.3 From 5ed386ec09a5d75bcf073967e55e895c2607a5c3 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 6 Apr 2017 16:19:22 +0200 Subject: x86/mpx: Correctly report do_mpx_bt_fault() failures to user-space When this function fails it just sends a SIGSEGV signal to user-space using force_sig(). This signal is missing essential information about the cause, e.g. the trap_nr or an error code. Fix this by propagating the error to the only caller of mpx_handle_bd_fault(), do_bounds(), which sends the correct SIGSEGV signal to the process. Signed-off-by: Joerg Roedel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: fe3d197f84319 ('x86, mpx: On-demand kernel allocation of bounds tables') Link: http://lkml.kernel.org/r/1491488362-27198-1-git-send-email-joro@8bytes.org Signed-off-by: Ingo Molnar --- arch/x86/mm/mpx.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index cd44ae727df7..1c34b767c84c 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -526,15 +526,7 @@ int mpx_handle_bd_fault(void) if (!kernel_managing_mpx_tables(current->mm)) return -EINVAL; - if (do_mpx_bt_fault()) { - force_sig(SIGSEGV, current); - /* - * The force_sig() is essentially "handling" this - * exception, so we do not pass up the error - * from do_mpx_bt_fault(). - */ - } - return 0; + return do_mpx_bt_fault(); } /* -- cgit v1.2.3 From 5f2173e056b2a5c215b862f136192744c447844e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 6 Apr 2017 16:23:18 +0200 Subject: x86/mpx, selftests: Only check bounds-vs-shadow when we keep shadow The check between the hardware state and our shadow of it is checked in the signal handler for all bounds exceptions, even for the ones where we don't keep the shadow up2date. This is a problem because when no shadow is kept the handler fails at this point and hides the real reason of the exception. Move the check into the code-path evaluating normal bounds exceptions to prevent this. Signed-off-by: Joerg Roedel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-kselftest@vger.kernel.org Link: http://lkml.kernel.org/r/1491488598-27346-1-git-send-email-joro@8bytes.org Signed-off-by: Ingo Molnar --- tools/testing/selftests/x86/mpx-mini-test.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c index 616ee9673339..a8df159a8924 100644 --- a/tools/testing/selftests/x86/mpx-mini-test.c +++ b/tools/testing/selftests/x86/mpx-mini-test.c @@ -404,8 +404,6 @@ void handler(int signum, siginfo_t *si, void *vucontext) dprintf2("info->si_lower: %p\n", __si_bounds_lower(si)); dprintf2("info->si_upper: %p\n", __si_bounds_upper(si)); - check_siginfo_vs_shadow(si); - for (i = 0; i < 8; i++) dprintf3("[%d]: %p\n", i, si_addr_ptr[i]); switch (br_reason) { @@ -416,6 +414,9 @@ void handler(int signum, siginfo_t *si, void *vucontext) exit(5); case 1: /* #BR MPX bounds exception */ /* these are normal and we expect to see them */ + + check_siginfo_vs_shadow(si); + dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n", status, (void *)ip, si->si_addr); num_bnd_chk++; -- cgit v1.2.3 From 84bbabc3a452e8085cfbd745ff0bff2b89074417 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Wed, 12 Apr 2017 16:36:34 +0200 Subject: x86/mm: Fix dump pagetables for 4 levels of page tables Commit fdd3d8ce0ea62 ("x86/dump_pagetables: Add support for 5-level paging") introduced an error for dumping with only 4 levels by setting PGD_LEVEL_MULT to a wrong value. This is leading to e.g. addresses printed as "(null)" for ranges: x86/mm: Found insecure W+X mapping at address (null)/(null) Make PGD_LEVEL_MULT a multiple of PTRS_PER_P4D instead of PTRS_PER_PUD Fixes: fdd3d8ce0ea62 ("x86/dump_pagetables: Add support for 5-level paging") Signed-off-by: Juergen Gross Reviewed-by: Kirill A. Shutemov Link: http://lkml.kernel.org/r/20170412143634.6846-1-jgross@suse.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/dump_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 9f305be71a72..bce6990b1d81 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -111,7 +111,7 @@ static struct addr_marker address_markers[] = { #define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT) #define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT) #define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT) -#define PGD_LEVEL_MULT (PTRS_PER_PUD * P4D_LEVEL_MULT) +#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT) #define pt_dump_seq_printf(m, to_dmesg, fmt, args...) \ ({ \ -- cgit v1.2.3 From ace2fb5a8b65d6aba530068ea9331f18e10ef565 Mon Sep 17 00:00:00 2001 From: Colin King Date: Thu, 13 Apr 2017 16:59:12 +0100 Subject: x86/boot/e820: Remove a redundant self assignment Remove a redundant self assignment of table->nr_entries, it does nothing and is an artifact of code simplification re-work. 
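Returning briefly to the dump_pagetables fix above: each *_LEVEL_MULT is the number of bytes one entry covers at that level, and with a folded p4d PTRS_PER_P4D is 1, which is exactly why the PGD step must scale by it. A rough sketch with assumed 4K pages and 512 entries per table (illustrative macros, not the kernel's):

/* Bytes of virtual address space covered by one entry at each level. */
#define PTE_COVERS	(1UL << 12)		/* 4 KB   */
#define PMD_COVERS	(512 * PTE_COVERS)	/* 2 MB   */
#define PUD_COVERS	(512 * PMD_COVERS)	/* 1 GB   */
#define P4D_COVERS	(512 * PUD_COVERS)	/* 512 GB */
/*
 * 4-level paging folds the p4d (PTRS_PER_P4D == 1), so a PGD entry also
 * covers 512 GB; multiplying by PTRS_PER_PUD instead made the dumper step
 * in 256 TB strides, which is how the bogus "(null)" ranges appeared.
 */
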
Detected by CoverityScan, CID#1428450 ("Self assignment") Fixes: 441ac2f33dd7 ("x86/boot/e820: Simplify e820__update_table()") Signed-off-by: Colin Ian King Cc: kernel-janitors@vger.kernel.org Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/20170413155912.12078-1-colin.king@canonical.com Signed-off-by: Thomas Gleixner --- arch/x86/kernel/e820.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6e9b26fa6d05..d78a586ba8dc 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -270,7 +270,6 @@ int __init e820__update_table(struct e820_table *table) if (table->nr_entries < 2) return -1; - table->nr_entries = table->nr_entries; BUG_ON(table->nr_entries > max_nr_entries); /* Bail out if we find any unreasonable addresses in the map: */ -- cgit v1.2.3 From 6dd29b3df975582ef429b5b93c899e6575785940 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 23 Apr 2017 11:37:17 +0200 Subject: Revert "x86/mm/gup: Switch GUP to the generic get_user_page_fast() implementation" This reverts commit 2947ba054a4dabbd82848728d765346886050029. Dan Williams reported dax-pmem kernel warnings with the following signature: WARNING: CPU: 8 PID: 245 at lib/percpu-refcount.c:155 percpu_ref_switch_to_atomic_rcu+0x1f5/0x200 percpu ref (dax_pmem_percpu_release [dax_pmem]) <= 0 (0) after switching to atomic ... and bisected it to this commit, which suggests possible memory corruption caused by the x86 fast-GUP conversion. He also pointed out: " This is similar to the backtrace when we were not properly handling pud faults and was fixed with this commit: 220ced1676c4 "mm: fix get_user_pages() vs device-dax pud mappings" I've found some missing _devmap checks in the generic get_user_pages_fast() path, but this does not fix the regression [...] " So given that there are known bugs, and a pretty robust looking bisection points to this commit suggesting that are unknown bugs in the conversion as well, revert it for the time being - we'll re-try in v4.13. Reported-by: Dan Williams Cc: Andrew Morton Cc: Borislav Petkov Cc: Catalin Marinas Cc: Kirill A. 
Shutemov Cc: Linus Torvalds Cc: Michal Hocko Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: aneesh.kumar@linux.vnet.ibm.com Cc: dann.frazier@canonical.com Cc: dave.hansen@intel.com Cc: steve.capper@linaro.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/arm/Kconfig | 2 +- arch/arm64/Kconfig | 2 +- arch/powerpc/Kconfig | 2 +- arch/x86/Kconfig | 3 - arch/x86/include/asm/mmu_context.h | 12 + arch/x86/include/asm/pgtable-3level.h | 47 ---- arch/x86/include/asm/pgtable.h | 53 ---- arch/x86/include/asm/pgtable_64.h | 16 +- arch/x86/mm/Makefile | 2 +- arch/x86/mm/gup.c | 496 ++++++++++++++++++++++++++++++++++ mm/Kconfig | 2 +- mm/gup.c | 10 +- 12 files changed, 519 insertions(+), 128 deletions(-) create mode 100644 arch/x86/mm/gup.c diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 454fadd077ad..0d4e71b42c77 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1666,7 +1666,7 @@ config ARCH_SELECT_MEMORY_MODEL config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM -config HAVE_GENERIC_GUP +config HAVE_GENERIC_RCU_GUP def_bool y depends on ARM_LPAE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index af62bf79721a..3741859765cf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -205,7 +205,7 @@ config GENERIC_CALIBRATE_DELAY config ZONE_DMA def_bool y -config HAVE_GENERIC_GUP +config HAVE_GENERIC_RCU_GUP def_bool y config ARCH_DMA_ADDR_T_64BIT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 3a716b2dcde9..97a8bc8a095c 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,7 +135,7 @@ config PPC select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS - select HAVE_GENERIC_GUP + select HAVE_GENERIC_RCU_GUP select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) select HAVE_IDE select HAVE_IOREMAP_PROT diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a641b900fc1f..2bde14451e54 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2789,9 +2789,6 @@ config X86_DMA_REMAP bool depends on STA2X11 -config HAVE_GENERIC_GUP - def_bool y - source "net/Kconfig" source "drivers/Kconfig" diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 6e933d2d88d9..68b329d77b3a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -220,6 +220,18 @@ static inline int vma_pkey(struct vm_area_struct *vma) } #endif +static inline bool __pkru_allows_pkey(u16 pkey, bool write) +{ + u32 pkru = read_pkru(); + + if (!__pkru_allows_read(pkru, pkey)) + return false; + if (write && !__pkru_allows_write(pkru, pkey)) + return false; + + return true; +} + /* * We only want to enforce protection keys on the current process * because we effectively have no access to PKRU for other diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index c8821bab938f..50d35e3185f5 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -212,51 +212,4 @@ static inline pud_t native_pudp_get_and_clear(pud_t *pudp) #define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high }) #define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } }) -#define gup_get_pte gup_get_pte -/* - * WARNING: only to be used in the get_user_pages_fast() implementation. - * - * With get_user_pages_fast(), we walk down the pagetables without taking - * any locks. 
For this we would like to load the pointers atomically, - * but that is not possible (without expensive cmpxchg8b) on PAE. What - * we do have is the guarantee that a PTE will only either go from not - * present to present, or present to not present or both -- it will not - * switch to a completely different present page without a TLB flush in - * between; something that we are blocking by holding interrupts off. - * - * Setting ptes from not present to present goes: - * - * ptep->pte_high = h; - * smp_wmb(); - * ptep->pte_low = l; - * - * And present to not present goes: - * - * ptep->pte_low = 0; - * smp_wmb(); - * ptep->pte_high = 0; - * - * We must ensure here that the load of pte_low sees 'l' iff pte_high - * sees 'h'. We load pte_high *after* loading pte_low, which ensures we - * don't see an older value of pte_high. *Then* we recheck pte_low, - * which ensures that we haven't picked up a changed pte high. We might - * have gotten rubbish values from pte_low and pte_high, but we are - * guaranteed that pte_low will not have the present bit set *unless* - * it is 'l'. Because get_user_pages_fast() only operates on present ptes - * we're safe. - */ -static inline pte_t gup_get_pte(pte_t *ptep) -{ - pte_t pte; - - do { - pte.pte_low = ptep->pte_low; - smp_rmb(); - pte.pte_high = ptep->pte_high; - smp_rmb(); - } while (unlikely(pte.pte_low != ptep->pte_low)); - - return pte; -} - #endif /* _ASM_X86_PGTABLE_3LEVEL_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 942482ac36a8..f5af95a0c6b8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -244,11 +244,6 @@ static inline int pud_devmap(pud_t pud) return 0; } #endif - -static inline int pgd_devmap(pgd_t pgd) -{ - return 0; -} #endif #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -1190,54 +1185,6 @@ static inline u16 pte_flags_pkey(unsigned long pte_flags) #endif } -static inline bool __pkru_allows_pkey(u16 pkey, bool write) -{ - u32 pkru = read_pkru(); - - if (!__pkru_allows_read(pkru, pkey)) - return false; - if (write && !__pkru_allows_write(pkru, pkey)) - return false; - - return true; -} - -/* - * 'pteval' can come from a PTE, PMD or PUD. We only check - * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the - * same value on all 3 types. 
- */ -static inline bool __pte_access_permitted(unsigned long pteval, bool write) -{ - unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; - - if (write) - need_pte_bits |= _PAGE_RW; - - if ((pteval & need_pte_bits) != need_pte_bits) - return 0; - - return __pkru_allows_pkey(pte_flags_pkey(pteval), write); -} - -#define pte_access_permitted pte_access_permitted -static inline bool pte_access_permitted(pte_t pte, bool write) -{ - return __pte_access_permitted(pte_val(pte), write); -} - -#define pmd_access_permitted pmd_access_permitted -static inline bool pmd_access_permitted(pmd_t pmd, bool write) -{ - return __pte_access_permitted(pmd_val(pmd), write); -} - -#define pud_access_permitted pud_access_permitted -static inline bool pud_access_permitted(pud_t pud, bool write) -{ - return __pte_access_permitted(pud_val(pud), write); -} - #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 12ea31274eb6..9991224f6238 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -227,20 +227,6 @@ extern void cleanup_highmap(void); extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); -#define gup_fast_permitted gup_fast_permitted -static inline bool gup_fast_permitted(unsigned long start, int nr_pages, - int write) -{ - unsigned long len, end; - - len = (unsigned long)nr_pages << PAGE_SHIFT; - end = start + len; - if (end < start) - return false; - if (end >> __VIRTUAL_MASK_SHIFT) - return false; - return true; -} - #endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 0fbdcb64f9f8..96d2b847e09e 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o setup_nx.o tlb.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c new file mode 100644 index 000000000000..456dfdfd2249 --- /dev/null +++ b/arch/x86/mm/gup.c @@ -0,0 +1,496 @@ +/* + * Lockless get_user_pages_fast for x86 + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +static inline pte_t gup_get_pte(pte_t *ptep) +{ +#ifndef CONFIG_X86_PAE + return READ_ONCE(*ptep); +#else + /* + * With get_user_pages_fast, we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atomically, + * but that is not possible (without expensive cmpxchg8b) on PAE. What + * we do have is the guarantee that a pte will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees l iff pte_high + * sees h. 
We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have got rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. And get_user_pages_fast only operates on present ptes, so + * we're safe. + * + * gup_get_pte should not be used or copied outside gup.c without being + * very careful -- it does not atomically load the pte or anything that + * is likely to be useful for you. + */ + pte_t pte; + +retry: + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + if (unlikely(pte.pte_low != ptep->pte_low)) + goto retry; + + return pte; +#endif +} + +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + +/* + * 'pteval' can come from a pte, pmd, pud or p4d. We only check + * _PAGE_PRESENT, _PAGE_USER, and _PAGE_RW in here which are the + * same value on all 4 types. + */ +static inline int pte_allows_gup(unsigned long pteval, int write) +{ + unsigned long need_pte_bits = _PAGE_PRESENT|_PAGE_USER; + + if (write) + need_pte_bits |= _PAGE_RW; + + if ((pteval & need_pte_bits) != need_pte_bits) + return 0; + + /* Check memory protection keys permissions. */ + if (!__pkru_allows_pkey(pte_flags_pkey(pteval), write)) + return 0; + + return 1; +} + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct dev_pagemap *pgmap = NULL; + int nr_start = *nr, ret = 0; + pte_t *ptep, *ptem; + + /* + * Keep the original mapped PTE value (ptem) around since we + * might increment ptep off the end of the page when finishing + * our loop iteration. 
+ */ + ptem = ptep = pte_offset_map(&pmd, addr); + do { + pte_t pte = gup_get_pte(ptep); + struct page *page; + + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_protnone(pte)) + break; + + if (!pte_allows_gup(pte_val(pte), write)) + break; + + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + break; + } + } else if (pte_special(pte)) + break; + + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + get_page(page); + put_dev_pagemap(pgmap); + SetPageReferenced(page); + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + if (addr == end) + ret = 1; + pte_unmap(ptem); + + return ret; +} + +static inline void get_head_page_multiple(struct page *page, int nr) +{ + VM_BUG_ON_PAGE(page != compound_head(page), page); + VM_BUG_ON_PAGE(page_count(page) == 0, page); + page_ref_add(page, nr); + SetPageReferenced(page); +} + +static int __gup_device_huge(unsigned long pfn, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + struct dev_pagemap *pgmap = NULL; + + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page; + int refs; + + if (!pte_allows_gup(pmd_val(pmd), write)) + return 0; + + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); + if (pmd_devmap(pmd)) + return __gup_device_huge_pmd(pmd, addr, end, pages, nr); + + /* hugepages are never "special" */ + VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); + + refs = 0; + head = pmd_page(pmd); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. 
+ */ + if (pmd_protnone(pmd)) + return 0; + if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static noinline int gup_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + struct page *head, *page; + int refs; + + if (!pte_allows_gup(pud_val(pud), write)) + return 0; + + VM_BUG_ON(!pfn_valid(pud_pfn(pud))); + if (pud_devmap(pud)) + return __gup_device_huge_pud(pud, addr, end, pages, nr); + + /* hugepages are never "special" */ + VM_BUG_ON(pud_flags(pud) & _PAGE_SPECIAL); + + refs = 0; + head = pud_page(pud); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON_PAGE(compound_head(page) != head, page); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&p4d, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (unlikely(pud_large(pud))) { + if (!gup_huge_pud(pud, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } + } while (pudp++, addr = next, addr != end); + + return 1; +} + +static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + p4d_t *p4dp; + + p4dp = p4d_offset(&pgd, addr); + do { + p4d_t p4d = *p4dp; + + next = p4d_addr_end(addr, end); + if (p4d_none(p4d)) + return 0; + BUILD_BUG_ON(p4d_large(p4d)); + if (!gup_pud_range(p4d, addr, next, write, pages, nr)) + return 0; + } while (p4dp++, addr = next, addr != end); + + return 1; +} + +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + */ +int __get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + unsigned long flags; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + (void __user *)start, len))) + return 0; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. 
+ */ + local_irq_save(flags); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + break; + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) + break; + } while (pgdp++, addr = next, addr != end); + local_irq_restore(flags); + + return nr; +} + +/** + * get_user_pages_fast() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * + * Attempt to pin user pages in memory without taking mm->mmap_sem. + * If not successful, it will fall back to taking the lock and + * calling get_user_pages(). + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. + */ +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long addr, len, end; + unsigned long next; + pgd_t *pgdp; + int nr = 0; + + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + + end = start + len; + if (end < start) + goto slow_irqon; + +#ifdef CONFIG_X86_64 + if (end >> __VIRTUAL_MASK_SHIFT) + goto slow_irqon; +#endif + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_disable(); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_p4d_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + ret = get_user_pages_unlocked(start, + (end - start) >> PAGE_SHIFT, + pages, write ? 
FOLL_WRITE : 0); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} diff --git a/mm/Kconfig b/mm/Kconfig index c89f472b658c..9b8fccb969dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -137,7 +137,7 @@ config HAVE_MEMBLOCK_NODE_MAP config HAVE_MEMBLOCK_PHYS_MAP bool -config HAVE_GENERIC_GUP +config HAVE_GENERIC_RCU_GUP bool config ARCH_DISCARD_MEMBLOCK diff --git a/mm/gup.c b/mm/gup.c index 2559a3987de7..527ec2c6cca3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1155,7 +1155,7 @@ struct page *get_dump_page(unsigned long addr) #endif /* CONFIG_ELF_CORE */ /* - * Generic Fast GUP + * Generic RCU Fast GUP * * get_user_pages_fast attempts to pin user pages by walking the page * tables directly and avoids taking locks. Thus the walker needs to be @@ -1176,8 +1176,8 @@ struct page *get_dump_page(unsigned long addr) * Before activating this code, please be aware that the following assumptions * are currently made: * - * *) Either HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table() is used to - * free pages containing page tables or TLB flushing requires IPI broadcast. + * *) HAVE_RCU_TABLE_FREE is enabled, and tlb_remove_table is used to free + * pages containing page tables. * * *) ptes can be read atomically by the architecture. * @@ -1187,7 +1187,7 @@ struct page *get_dump_page(unsigned long addr) * * This code is based heavily on the PowerPC implementation by Nick Piggin. */ -#ifdef CONFIG_HAVE_GENERIC_GUP +#ifdef CONFIG_HAVE_GENERIC_RCU_GUP #ifndef gup_get_pte /* @@ -1677,4 +1677,4 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, return ret; } -#endif /* CONFIG_HAVE_GENERIC_GUP */ +#endif /* CONFIG_HAVE_GENERIC_RCU_GUP */ -- cgit v1.2.3 From e6ab9c4d437764c7fb728d428dc9e717cdb183d0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 25 Apr 2017 12:25:57 +0300 Subject: x86/mm/64: Fix crash in remove_pagetable() remove_pagetable() does page walk using p*d_page_vaddr() plus cast. It's not canonical approach -- we usually use p*d_offset() for that. It works fine as long as all page table levels are present. We broke the invariant by introducing folded p4d page table level. As result, remove_pagetable() interprets PMD as PUD and it leads to crash: BUG: unable to handle kernel paging request at ffff880300000000 IP: memchr_inv+0x60/0x110 PGD 317d067 P4D 317d067 PUD 3180067 PMD 33f102067 PTE 8000000300000060 Let's fix this by using p*d_offset() instead of p*d_page_vaddr() for page walk. Reported-by: Dan Williams Tested-by: Dan Williams Signed-off-by: Kirill A. 
Shutemov Cc: Andy Lutomirski Cc: Dave Hansen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Fixes: f2a6a7050109 ("x86: Convert the rest of the code to support p4d_t") Link: http://lkml.kernel.org/r/20170425092557.21852-1-kirill.shutemov@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a242139df8fe..745e5e183169 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -962,7 +962,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, continue; } - pmd_base = (pmd_t *)pud_page_vaddr(*pud); + pmd_base = pmd_offset(pud, 0); remove_pmd_table(pmd_base, addr, next, direct); free_pmd_table(pmd_base, pud); } @@ -988,7 +988,7 @@ remove_p4d_table(p4d_t *p4d_start, unsigned long addr, unsigned long end, BUILD_BUG_ON(p4d_large(*p4d)); - pud_base = (pud_t *)p4d_page_vaddr(*p4d); + pud_base = pud_offset(p4d, 0); remove_pud_table(pud_base, addr, next, direct); free_pud_table(pud_base, p4d); } @@ -1013,7 +1013,7 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) if (!pgd_present(*pgd)) continue; - p4d = (p4d_t *)pgd_page_vaddr(*pgd); + p4d = p4d_offset(pgd, 0); remove_p4d_table(p4d, addr, next, direct); } -- cgit v1.2.3 From 9ccee2373f0658f234727700e619df097ba57023 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:19 -0700 Subject: x86/vm86/32: Switch to flush_tlb_mm_range() in mark_screen_rdonly() mark_screen_rdonly() is the last remaining caller of flush_tlb(). flush_tlb_mm_range() is potentially faster and isn't obsolete. Compile-tested only because I don't know whether software that uses this mechanism even exists. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/791a644076fc3577ba7f7b7cafd643cc089baa7d.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/vm86_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 62597c300d94..7924a5356c8a 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -197,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb(); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); } -- cgit v1.2.3 From 29961b59a51f8c6838a26a45e871a7ed6771809b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:20 -0700 Subject: x86/mm: Remove flush_tlb() and flush_tlb_current_task() I was trying to figure out what how flush_tlb_current_task() would possibly work correctly if current->mm != current->active_mm, but I realized I could spare myself the effort: it has no callers except the unused flush_tlb() macro. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e52d64c11690f85e9f1d69d7b48cc2269cd2e94b.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/tlbflush.h | 9 --------- arch/x86/mm/tlb.c | 17 ----------------- 2 files changed, 26 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 75d002bdb3f3..6ed9ea469b48 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -215,7 +215,6 @@ static inline void __flush_tlb_one(unsigned long addr) /* * TLB flushing: * - * - flush_tlb() flushes the current mm struct TLBs * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page @@ -247,11 +246,6 @@ static inline void flush_tlb_all(void) __flush_tlb_all(); } -static inline void flush_tlb(void) -{ - __flush_tlb_up(); -} - static inline void local_flush_tlb(void) { __flush_tlb_up(); @@ -313,14 +307,11 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define flush_tlb() flush_tlb_current_task() - void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index a7655f6caf7d..92ec37f517ab 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -289,23 +289,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, smp_call_function_many(cpumask, flush_tlb_func, &info, 1); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - - preempt_disable(); - - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - - /* This is an implicit full barrier that synchronizes with switch_mm. */ - local_flush_tlb(); - - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); - preempt_enable(); -} - /* * See Documentation/x86/tlb.txt for details. We choose 33 * because it is large enough to cover the vast majority (at -- cgit v1.2.3 From ce27374fabf553153c3f53efcaa9bfab9216bd8c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:21 -0700 Subject: x86/mm: Make flush_tlb_mm_range() more predictable I'm about to rewrite the function almost completely, but first I want to get a functional change out of the way. Currently, if flush_tlb_mm_range() does not flush the local TLB at all, it will never do individual page flushes on remote CPUs. This seems to be an accident, and preserving it will be awkward. Let's change it first so that any regressions in the rewrite will be easier to bisect and so that the rewrite can attempt to change no visible behavior at all. The fix is simple: we can simply avoid short-circuiting the calculation of base_pages_to_flush. 
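The sizing rule being hoisted is small enough to show in isolation; the sketch below mirrors the kernel logic but uses stand-in macro values and a hypothetical helper name, so treat it as illustrative C rather than the kernel function itself:

#define PAGE_SHIFT	12
#define TLB_FLUSH_ALL	(~0UL)		/* "flush everything" sentinel */
#define VM_HUGETLB	(1UL << 22)	/* stand-in for the real flag  */

/* How many pages to flush individually; TLB_FLUSH_ALL means full flush. */
static unsigned long pages_to_flush(unsigned long start, unsigned long end,
				    unsigned long vmflag, unsigned long ceiling)
{
	unsigned long pages = TLB_FLUSH_ALL;

	if (end != TLB_FLUSH_ALL && !(vmflag & VM_HUGETLB))
		pages = (end - start) >> PAGE_SHIFT;
	if (pages > ceiling)
		pages = TLB_FLUSH_ALL;
	return pages;
}

Computing this before any of the early returns means the remote-flush path sees the same per-page/full decision as the local one, which is the point of the change.
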
As a side effect, this also eliminates a potential corner case: if tlb_single_page_flush_ceiling == TLB_FLUSH_ALL, flush_tlb_mm_range() could have ended up flushing the entire address space one page at a time. Signed-off-by: Andy Lutomirski Acked-by: Dave Hansen Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4b29b771d9975aad7154c314534fec235618175a.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 92ec37f517ab..9db9260a5e9f 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -309,6 +309,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); + + if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) + base_pages_to_flush = (end - start) >> PAGE_SHIFT; + if (base_pages_to_flush > tlb_single_page_flush_ceiling) + base_pages_to_flush = TLB_FLUSH_ALL; + if (current->active_mm != mm) { /* Synchronize with switch_mm. */ smp_mb(); @@ -325,15 +331,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, goto out; } - if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) - base_pages_to_flush = (end - start) >> PAGE_SHIFT; - /* * Both branches below are implicit full barriers (MOV to CR or * INVLPG) that synchronize with switch_mm. */ - if (base_pages_to_flush > tlb_single_page_flush_ceiling) { - base_pages_to_flush = TLB_FLUSH_ALL; + if (base_pages_to_flush == TLB_FLUSH_ALL) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { -- cgit v1.2.3 From dbd68d8e84c606673ebbcf15862f8c155fa92326 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:22 -0700 Subject: x86/mm: Fix flush_tlb_page() on Xen flush_tlb_page() passes a bogus range to flush_tlb_others() and expects the latter to fix it up. native_flush_tlb_others() has the fixup but Xen's version doesn't. Move the fixup into flush_tlb_page(), so that flush_tlb_others() is always handed a real range. AFAICS the only real effect is that, without this fix, Xen would flush everything instead of just the one page on remote vCPUs when flush_tlb_page() was called. Signed-off-by: Andy Lutomirski Reviewed-by: Boris Ostrovsky Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Fixes: e7b52ffd45a6 ("x86/flush_tlb: try flush_tlb_single one by one in flush_tlb_range") Link: http://lkml.kernel.org/r/10ed0e4dfea64daef10b87fb85df1746999b4dba.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9db9260a5e9f..6e7bedf69af7 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -263,8 +263,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, { struct flush_tlb_info info; - if (end == 0) - end = start + PAGE_SIZE; info.flush_mm = mm; info.flush_start = start; info.flush_end = end; @@ -378,7 +376,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); + flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); preempt_enable(); } -- cgit v1.2.3 From 71389703839ebe9cb426c72d5f0bd549592e583c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 28 Apr 2017 10:23:37 -0700 Subject: mm, zone_device: Replace {get, put}_zone_device_page() with a single reference to fix pmem crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The x86 conversion to the generic GUP code included a small change which causes crashes and data corruption in the pmem code - not good. The root cause is that the /dev/pmem driver code implicitly relies on the x86 get_user_pages() implementation doing a get_page() on the page refcount, because get_page() does a get_zone_device_page() which properly refcounts pmem's separate page struct arrays that are not present in the regular page struct structures. (The pmem driver does this because it can cover huge memory areas.) But the x86 conversion to the generic GUP code changed the get_page() to page_cache_get_speculative() which is faster but doesn't do the get_zone_device_page() call the pmem code relies on. One way to solve the regression would be to change the generic GUP code to use get_page(), but that would slow things down a bit and punish other generic-GUP-using architectures for an x86-ism they did not care about. (Arguably the pmem driver was probably not working reliably for them: but nvdimm is an Intel feature, so non-x86 exposure is probably still limited.) So restructure the pmem code's interface with the MM instead: get rid of the get/put_zone_device_page() distinction, integrate put_zone_device_page() into __put_page() and restructure the pmem completion-wait and teardown machinery: Kirill points out that the calls to {get,put}_dev_pagemap() can be removed from the mm fast path if we take a single get_dev_pagemap() reference to signify that the page is alive and use the final put of the page to drop that reference. This does require some care to make sure that any waits for the percpu_ref to drop to zero occur *after* devm_memremap_pages_release(), since it now maintains its own elevated reference. This speeds things up while also making the pmem refcounting more robust going forward. Suggested-by: Kirill Shutemov Tested-by: Kirill Shutemov Signed-off-by: Dan Williams Reviewed-by: Logan Gunthorpe Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Jérôme Glisse Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/149339998297.24933.1129582806028305912.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: Ingo Molnar --- drivers/dax/pmem.c | 2 +- drivers/nvdimm/pmem.c | 13 +++++++++++-- include/linux/mm.h | 14 -------------- kernel/memremap.c | 22 +++++++++------------- mm/swap.c | 10 ++++++++++ 5 files changed, 31 insertions(+), 30 deletions(-) diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 033f49b31fdc..cb0d742fa23f 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -43,6 +43,7 @@ static void dax_pmem_percpu_exit(void *data) struct dax_pmem *dax_pmem = to_dax_pmem(ref); dev_dbg(dax_pmem->dev, "%s\n", __func__); + wait_for_completion(&dax_pmem->cmp); percpu_ref_exit(ref); } @@ -53,7 +54,6 @@ static void dax_pmem_percpu_kill(void *data) dev_dbg(dax_pmem->dev, "%s\n", __func__); percpu_ref_kill(ref); - wait_for_completion(&dax_pmem->cmp); } static int dax_pmem_probe(struct device *dev) diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 5b536be5a12e..fb7bbc79ac26 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -231,6 +232,11 @@ static void pmem_release_queue(void *q) blk_cleanup_queue(q); } +static void pmem_freeze_queue(void *q) +{ + blk_mq_freeze_queue_start(q); +} + static void pmem_release_disk(void *disk) { del_gendisk(disk); @@ -284,6 +290,9 @@ static int pmem_attach_disk(struct device *dev, if (!q) return -ENOMEM; + if (devm_add_action_or_reset(dev, pmem_release_queue, q)) + return -ENOMEM; + pmem->pfn_flags = PFN_DEV; if (is_nd_pfn(dev)) { addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter, @@ -303,10 +312,10 @@ static int pmem_attach_disk(struct device *dev, pmem->size, ARCH_MEMREMAP_PMEM); /* - * At release time the queue must be dead before + * At release time the queue must be frozen before * devm_memremap_pages is unwound */ - if (devm_add_action_or_reset(dev, pmem_release_queue, q)) + if (devm_add_action_or_reset(dev, pmem_freeze_queue, q)) return -ENOMEM; if (IS_ERR(addr)) diff --git a/include/linux/mm.h b/include/linux/mm.h index a835edd2db34..695da2a19b4c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -762,19 +762,11 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void get_zone_device_page(struct page *page); -void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else -static inline void get_zone_device_page(struct page *page) -{ -} -static inline void put_zone_device_page(struct page *page) -{ -} static inline bool is_zone_device_page(const struct page *page) { return false; @@ -790,9 +782,6 @@ static inline void get_page(struct page *page) */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); - - if (unlikely(is_zone_device_page(page))) - get_zone_device_page(page); } static inline void put_page(struct page *page) @@ -801,9 +790,6 @@ static inline void put_page(struct page *page) if (put_page_testzero(page)) __put_page(page); - - if (unlikely(is_zone_device_page(page))) - put_zone_device_page(page); } #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) diff --git a/kernel/memremap.c b/kernel/memremap.c index 07e85e5229da..23a6483c3666 100644 --- a/kernel/memremap.c +++ 
b/kernel/memremap.c @@ -182,18 +182,6 @@ struct page_map { struct vmem_altmap altmap; }; -void get_zone_device_page(struct page *page) -{ - percpu_ref_get(page->pgmap->ref); -} -EXPORT_SYMBOL(get_zone_device_page); - -void put_zone_device_page(struct page *page) -{ - put_dev_pagemap(page->pgmap); -} -EXPORT_SYMBOL(put_zone_device_page); - static void pgmap_radix_release(struct resource *res) { resource_size_t key, align_start, align_size, align_end; @@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data) struct resource *res = &page_map->res; resource_size_t align_start, align_size; struct dev_pagemap *pgmap = &page_map->pgmap; + unsigned long pfn; + + for_each_device_pfn(pfn, page_map) + put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { dev_WARN(dev, "%s: page mapping is still live!\n", __func__); @@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) * * Notes: * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). + * (or devm release event). The expected order of events is that @ref has + * been through percpu_ref_kill() before devm_memremap_pages_release(). The + * wait for the completion of all references being dropped and + * percpu_ref_exit() must occur after devm_memremap_pages_release(). * * 2/ @res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; + percpu_ref_get(ref); } devres_add(dev, page_map); return __va(res->start); diff --git a/mm/swap.c b/mm/swap.c index c4910f14f957..a4e6113276b5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { + if (is_zone_device_page(page)) { + put_dev_pagemap(page->pgmap); + + /* + * The page belongs to the device that created pgmap. Do + * not return it to page allocator. + */ + return; + } + if (unlikely(PageCompound(page))) __put_compound_page(page); else -- cgit v1.2.3
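To make the new reference scheme easier to follow, here is a condensed before/after sketch of how a ZONE_DEVICE (pmem) page keeps its dev_pagemap alive. The bodies are paraphrased from the hunks above and from the pre-patch code they remove, so read it as an illustration rather than verbatim kernel source:

	/*
	 * Before: every transient reference had to travel through
	 * get_page()/put_page(), because only get_page() knew about the
	 * zone-device hook:
	 */
	static inline void get_page(struct page *page)
	{
		page_ref_inc(page);
		if (unlikely(is_zone_device_page(page)))
			get_zone_device_page(page);	/* percpu_ref_get(page->pgmap->ref) */
	}
	/*
	 * The generic GUP fast path takes its reference with
	 * page_cache_get_speculative(), which only bumps page->_refcount,
	 * so the pgmap reference was silently skipped and pmem could tear
	 * down its page struct arrays under a still-pinned page.
	 */

	/*
	 * After: the pgmap reference is taken exactly once, when the page is
	 * handed out by devm_memremap_pages():
	 */
	page->pgmap = pgmap;
	percpu_ref_get(ref);

	/* ...and dropped only on the final put, in __put_page(): */
	if (is_zone_device_page(page)) {
		put_dev_pagemap(page->pgmap);
		return;		/* ZONE_DEVICE pages never go back to the page allocator */
	}

With that in place it no longer matters which helper takes the transient refcount; devm_memremap_pages_release() drops the per-page references via put_page() on every device pfn, which is why any wait for the percpu_ref to drain (and the percpu_ref_exit() call) has to happen after it, as the reordered dax_pmem teardown above shows.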