From 8609d1b5daa36350e020e737946c40887af1743a Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 19 Nov 2015 17:07:55 -0800 Subject: x86/mm: Turn CONFIG_X86_PTDUMP into a module Being able to examine page tables is handy, so make this a module that can be loaded as needed. Signed-off-by: Kees Cook Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephen Smalley Cc: Thomas Gleixner Cc: Toshi Kani Cc: Vladimir Murzin Cc: Will Deacon Link: http://lkml.kernel.org/r/20151120010755.GA9060@www.outflux.net Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 1 + arch/x86/mm/debug_pagetables.c | 46 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/dump_pagetables.c | 34 ++----------------------------- 3 files changed, 49 insertions(+), 32 deletions(-) create mode 100644 arch/x86/mm/debug_pagetables.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 65c47fda26fc..f9d38a48e3c8 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_X86_PTDUMP_CORE) += dump_pagetables.o +obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c new file mode 100644 index 000000000000..b35ee86a9316 --- /dev/null +++ b/arch/x86/mm/debug_pagetables.c @@ -0,0 +1,46 @@ +#include +#include +#include +#include + +static int ptdump_show(struct seq_file *m, void *v) +{ + ptdump_walk_pgd_level(m, NULL); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, ptdump_show, NULL); +} + +static const struct file_operations ptdump_fops = { + .owner = THIS_MODULE, + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *pe; + +static int __init pt_dump_debug_init(void) +{ + pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + &ptdump_fops); + if (!pe) + return -ENOMEM; + + return 0; +} + +static void __exit pt_dump_debug_exit(void) +{ + debugfs_remove_recursive(pe); +} + +module_init(pt_dump_debug_init); +module_exit(pt_dump_debug_exit); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arjan van de Ven "); +MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables"); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a035c2aa7801..90a1dc054873 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -426,38 +426,15 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) { ptdump_walk_pgd_level_core(m, pgd, false); } +EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level); void ptdump_walk_pgd_level_checkwx(void) { ptdump_walk_pgd_level_core(NULL, NULL, true); } -#ifdef CONFIG_X86_PTDUMP -static int ptdump_show(struct seq_file *m, void *v) +static int __init pt_dump_init(void) { - ptdump_walk_pgd_level(m, NULL); - return 0; -} - -static int ptdump_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, ptdump_show, NULL); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -static int pt_dump_init(void) -{ -#ifdef CONFIG_X86_PTDUMP - struct dentry *pe; -#endif - #ifdef CONFIG_X86_32 /* Not a compile-time constant on x86-32 */ address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; @@ -468,13 +445,6 @@ static int pt_dump_init(void) address_markers[FIXADDR_START_NR].start_address = FIXADDR_START; #endif -#ifdef CONFIG_X86_PTDUMP - pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, - &ptdump_fops); - if (!pe) - return -ENOMEM; -#endif - return 0; } -- cgit v1.2.3 From d6ccc3ec95251d8d3276f2900b59cbc468dd74f4 Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 17 Nov 2015 15:51:19 +0100 Subject: x86/paravirt: Remove paravirt ops pmd_update[_defer] and pte_update_defer pte_update_defer can be removed as it is always set to the same function as pte_update. So any usage of pte_update_defer() can be replaced by pte_update(). pmd_update and pmd_update_defer are always set to paravirt_nop, so they can just be nuked. Signed-off-by: Juergen Gross Acked-by: Rusty Russell Cc: jeremy@goop.org Cc: chrisw@sous-sol.org Cc: akataria@vmware.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xen.org Cc: konrad.wilk@oracle.com Cc: david.vrabel@citrix.com Cc: boris.ostrovsky@oracle.com Link: http://lkml.kernel.org/r/1447771879-1806-1-git-send-email-jgross@suse.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/paravirt.h | 17 ----------------- arch/x86/include/asm/paravirt_types.h | 6 ------ arch/x86/include/asm/pgtable.h | 15 ++------------- arch/x86/kernel/paravirt.c | 3 --- arch/x86/lguest/boot.c | 1 - arch/x86/mm/pgtable.c | 7 +------ arch/x86/xen/mmu.c | 1 - 7 files changed, 3 insertions(+), 47 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 4d7f080e4706..cbbf41c0a328 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -366,23 +366,6 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr, { PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); } -static inline void pmd_update(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp); -} - -static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); -} - -static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp); -} static inline pte_t __pte(pteval_t val) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7afeafb8a4ee..0451503e1716 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -266,12 +266,6 @@ struct pv_mmu_ops { pmd_t *pmdp, pmd_t pmdval); void (*pte_update)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); - void (*pte_update_defer)(struct mm_struct *mm, - unsigned long addr, pte_t *ptep); - void (*pmd_update)(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp); - void (*pmd_update_defer)(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c0b41f111a9a..e99cbe814ea8 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -69,9 +69,6 @@ extern struct mm_struct *pgd_page_get_mm(struct page *page); #define pmd_clear(pmd) native_pmd_clear(pmd) #define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) -#define pmd_update(mm, addr, ptep) do { } while (0) -#define pmd_update_defer(mm, addr, ptep) do { } while (0) #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -721,14 +718,9 @@ static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, * updates should either be sets, clears, or set_pte_atomic for P->P * transitions, which means this hook should only be called for user PTEs. * This hook implies a P->P protection or access change has taken place, which - * requires a subsequent TLB flush. The notification can optionally be delayed - * until the TLB flush event by using the pte_update_defer form of the - * interface, but care must be taken to assure that the flush happens while - * still holding the same page table lock so that the shadow and primary pages - * do not become out of sync on SMP. + * requires a subsequent TLB flush. */ #define pte_update(mm, addr, ptep) do { } while (0) -#define pte_update_defer(mm, addr, ptep) do { } while (0) #endif /* @@ -820,9 +812,7 @@ static inline int pmd_write(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = native_pmdp_get_and_clear(pmdp); - pmd_update(mm, addr, pmdp); - return pmd; + return native_pmdp_get_and_clear(pmdp); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -830,7 +820,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); - pmd_update(mm, addr, pmdp); } /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f27962cca0c7..3265ea0fceeb 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -426,9 +426,6 @@ struct pv_mmu_ops pv_mmu_ops = { .set_pmd = native_set_pmd, .set_pmd_at = native_set_pmd_at, .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, - .pmd_update = paravirt_nop, - .pmd_update_defer = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index a0d09f6c6533..a1900d4682c0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1472,7 +1472,6 @@ __init void lguest_init(void) pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; pv_mmu_ops.lazy_mode.flush = paravirt_flush_lazy_mmu; pv_mmu_ops.pte_update = lguest_pte_update; - pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC /* APIC read/write intercepts */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index fb0a9dd1d6e4..ee9c2e3a7199 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -414,7 +414,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; - pte_update_defer(vma->vm_mm, address, ptep); + pte_update(vma->vm_mm, address, ptep); } return changed; @@ -431,7 +431,6 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *pmdp = entry; - pmd_update_defer(vma->vm_mm, address, pmdp); /* * We had a write-protection fault here and changed the pmd * to to more permissive. No need to flush the TLB for that, @@ -469,9 +468,6 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, (unsigned long *)pmdp); - if (ret) - pmd_update(vma->vm_mm, addr, pmdp); - return ret; } #endif @@ -518,7 +514,6 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, set = !test_and_set_bit(_PAGE_BIT_SPLITTING, (unsigned long *)pmdp); if (set) { - pmd_update(vma->vm_mm, address, pmdp); /* need tlb flush only to serialize against gup-fast */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 9c479fe40459..41ee3e25fcce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -2436,7 +2436,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = { .flush_tlb_others = xen_flush_tlb_others, .pte_update = paravirt_nop, - .pte_update_defer = paravirt_nop, .pgd_alloc = xen_pgd_alloc, .pgd_free = xen_pgd_free, -- cgit v1.2.3 From 071ac0c4e8e90d5de05f0779b03ae69ce84820d5 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 Nov 2015 13:12:59 +0100 Subject: x86/mm/ptdump: Make (debugfs)/kernel_page_tables read-only File should be created with S_IRUSR and not with S_IWUSR too because writing to it doesn't make any sense. I mean, we don't have a ->write method anyway but let's have the permissions correct too. Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1448885579-32506-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/mm/debug_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/debug_pagetables.c b/arch/x86/mm/debug_pagetables.c index b35ee86a9316..bfcffdf6c577 100644 --- a/arch/x86/mm/debug_pagetables.c +++ b/arch/x86/mm/debug_pagetables.c @@ -26,7 +26,7 @@ static struct dentry *pe; static int __init pt_dump_debug_init(void) { - pe = debugfs_create_file("kernel_page_tables", 0600, NULL, NULL, + pe = debugfs_create_file("kernel_page_tables", S_IRUSR, NULL, NULL, &ptdump_fops); if (!pe) return -ENOMEM; -- cgit v1.2.3 From 8e8efe0379bd93e8219ca0fc6fa80b5dd85b09cb Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 30 Nov 2015 16:31:13 -0800 Subject: x86/mpx: Fix instruction decoder condition MPX decodes instructions in order to tell which bounds register was violated. Part of this decoding involves looking at the "REX prefix" which is a special instrucion prefix used to retrofit support for new registers in to old instructions. The X86_REX_*() macros are defined to return actual bit values: #define X86_REX_R(rex) ((rex) & 4) *not* boolean values. However, the MPX code was checking for them like they were booleans. This might have led to us mis-decoding the "REX prefix" and giving false information out to userspace about bounds violations. X86_REX_B() actually is bit 1, so this is really only broken for the X86_REX_X() case. Fix the conditionals up to tolerate the non-boolean values. Fixes: fcc7ffd67991 "x86, mpx: Decode MPX instruction to get bound violation information" Reported-by: Dan Carpenter Signed-off-by: Dave Hansen Cc: x86@kernel.org Cc: Dave Hansen Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20151201003113.D800C1E0@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/mpx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index 1202d5ca2fb5..b2fd67da1701 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -101,19 +101,19 @@ static int get_reg_offset(struct insn *insn, struct pt_regs *regs, switch (type) { case REG_TYPE_RM: regno = X86_MODRM_RM(insn->modrm.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_INDEX: regno = X86_SIB_INDEX(insn->sib.value); - if (X86_REX_X(insn->rex_prefix.value) == 1) + if (X86_REX_X(insn->rex_prefix.value)) regno += 8; break; case REG_TYPE_BASE: regno = X86_SIB_BASE(insn->sib.value); - if (X86_REX_B(insn->rex_prefix.value) == 1) + if (X86_REX_B(insn->rex_prefix.value)) regno += 8; break; -- cgit v1.2.3 From 8dd3303001976aa8583bf20f6b93590c74114308 Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Fri, 4 Dec 2015 14:07:05 +0100 Subject: x86/mm: Introduce max_possible_pfn max_possible_pfn will be used for tracking max possible PFN for memory that isn't present in E820 table and could be hotplugged later. By default max_possible_pfn is initialized with max_pfn, but later it could be updated with highest PFN of hotpluggable memory ranges declared in ACPI SRAT table if any present. Signed-off-by: Igor Mammedov Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: akataria@vmware.com Cc: fujita.tomonori@lab.ntt.co.jp Cc: konrad.wilk@oracle.com Cc: pbonzini@redhat.com Cc: revers@redhat.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1449234426-273049-2-git-send-email-imammedo@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ arch/x86/mm/srat.c | 2 ++ include/linux/bootmem.h | 4 ++++ mm/bootmem.c | 1 + mm/nobootmem.c | 1 + 5 files changed, 10 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 29db25f9a745..16a846548ece 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1048,6 +1048,8 @@ void __init setup_arch(char **cmdline_p) if (mtrr_trim_uncached_memory(max_pfn)) max_pfn = e820_end_of_ram_pfn(); + max_possible_pfn = max_pfn; + #ifdef CONFIG_X86_32 /* max_low_pfn get updated here */ find_low_pfn_range(); diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index c2aea63bee20..b5f821881465 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -203,6 +203,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) pr_warn("SRAT: Failed to mark hotplug range [mem %#010Lx-%#010Lx] in memblock\n", (unsigned long long)start, (unsigned long long)end - 1); + max_possible_pfn = max(max_possible_pfn, PFN_UP(end - 1)); + return 0; out_err_bad_srat: bad_srat(); diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index f589222bfa87..35b22f94d2d2 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -19,6 +19,10 @@ extern unsigned long min_low_pfn; * highest page */ extern unsigned long max_pfn; +/* + * highest possible page + */ +extern unsigned long long max_possible_pfn; #ifndef CONFIG_NO_BOOTMEM /* diff --git a/mm/bootmem.c b/mm/bootmem.c index 3b6380784c28..91e32bc8517f 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -33,6 +33,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index e57cf24babd6..99feb2b07fc5 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -31,6 +31,7 @@ EXPORT_SYMBOL(contig_page_data); unsigned long max_low_pfn; unsigned long min_low_pfn; unsigned long max_pfn; +unsigned long long max_possible_pfn; static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, u64 goal, u64 limit) -- cgit v1.2.3 From 173ae9ba63349194ae180622f05fb49f55a4df46 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 15 Dec 2015 10:15:57 -0800 Subject: Fix user-visible spelling error Pavel Machek reports a warning about W+X pages found in the "Persisent" kmap area. After grepping for it (using the correct spelling), and not finding it, I noticed how the debug printk was just misspelled. Fix it. The actual mapping bug that Pavel reported is still open. It's apparently a separate issue from the known EFI page tables, looks like it's related to the HIGHMEM mappings. Signed-off-by: Linus Torvalds --- arch/x86/mm/dump_pagetables.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a035c2aa7801..0f1c6fc3ddd8 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -89,7 +89,7 @@ static struct addr_marker address_markers[] = { { 0/* VMALLOC_START */, "vmalloc() Area" }, { 0/*VMALLOC_END*/, "vmalloc() End" }, # ifdef CONFIG_HIGHMEM - { 0/*PKMAP_BASE*/, "Persisent kmap() Area" }, + { 0/*PKMAP_BASE*/, "Persistent kmap() Area" }, # endif { 0/*FIXADDR_START*/, "Fixmap Area" }, #endif -- cgit v1.2.3 From 362f924b64ba0f4be2ee0cb697690c33d40be721 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 7 Dec 2015 10:39:41 +0100 Subject: x86/cpufeature: Remove unused and seldomly used cpu_has_xx macros Those are stupid and code should use static_cpu_has_safe() or boot_cpu_has() instead. Kill the least used and unused ones. The remaining ones need more careful inspection before a conversion can happen. On the TODO. Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1449481182-27541-4-git-send-email-bp@alien8.de Cc: David Sterba Cc: Herbert Xu Cc: Peter Zijlstra Cc: Matt Mackall Cc: Chris Mason Cc: Josef Bacik Signed-off-by: Thomas Gleixner --- arch/x86/crypto/chacha20_glue.c | 2 +- arch/x86/crypto/crc32c-intel_glue.c | 2 +- arch/x86/include/asm/cmpxchg_32.h | 2 +- arch/x86/include/asm/cmpxchg_64.h | 2 +- arch/x86/include/asm/cpufeature.h | 37 ++++------------------------- arch/x86/include/asm/xor_32.h | 2 +- arch/x86/kernel/cpu/amd.c | 4 ++-- arch/x86/kernel/cpu/common.c | 4 +++- arch/x86/kernel/cpu/intel.c | 3 ++- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++--- arch/x86/kernel/cpu/mtrr/generic.c | 2 +- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/perf_event_amd.c | 4 ++-- arch/x86/kernel/cpu/perf_event_amd_uncore.c | 11 +++++---- arch/x86/kernel/fpu/init.c | 4 ++-- arch/x86/kernel/hw_breakpoint.c | 6 +++-- arch/x86/kernel/smpboot.c | 2 +- arch/x86/kernel/vm86_32.c | 4 +++- arch/x86/mm/setup_nx.c | 4 ++-- drivers/char/hw_random/via-rng.c | 5 ++-- drivers/crypto/padlock-aes.c | 2 +- drivers/crypto/padlock-sha.c | 2 +- drivers/iommu/intel_irq_remapping.c | 2 +- fs/btrfs/disk-io.c | 2 +- 24 files changed, 48 insertions(+), 68 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c index 722bacea040e..8baaff5af0b5 100644 --- a/arch/x86/crypto/chacha20_glue.c +++ b/arch/x86/crypto/chacha20_glue.c @@ -125,7 +125,7 @@ static struct crypto_alg alg = { static int __init chacha20_simd_mod_init(void) { - if (!cpu_has_ssse3) + if (!boot_cpu_has(X86_FEATURE_SSSE3)) return -ENODEV; #ifdef CONFIG_AS_AVX2 diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 81a595d75cf5..0e9871693f24 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -257,7 +257,7 @@ static int __init crc32c_intel_mod_init(void) if (!x86_match_cpu(crc32c_cpu_id)) return -ENODEV; #ifdef CONFIG_X86_64 - if (cpu_has_pclmulqdq) { + if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index f7e142926481..e4959d023af8 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -109,6 +109,6 @@ static inline u64 __cmpxchg64_local(volatile u64 *ptr, u64 old, u64 new) #endif -#define system_has_cmpxchg_double() cpu_has_cx8 +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX8) #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 1af94697aae5..caa23a34c963 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -18,6 +18,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) cmpxchg_local((ptr), (o), (n)); \ }) -#define system_has_cmpxchg_double() cpu_has_cx16 +#define system_has_cmpxchg_double() boot_cpu_has(X86_FEATURE_CX16) #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 35401fef0d75..144b042c0872 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -385,58 +385,29 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; } while (0) #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) -#define cpu_has_de boot_cpu_has(X86_FEATURE_DE) #define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) #define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC) #define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE) #define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC) -#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP) -#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR) -#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX) #define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR) #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) -#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) -#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) #define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_avx2 boot_cpu_has(X86_FEATURE_AVX2) -#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) -#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) -#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE) -#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN) -#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT) -#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN) -#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2) -#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN) -#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE) -#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN) -#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM) -#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN) -#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS) -#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS) #define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLUSH) -#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS) #define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES) #define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON) #define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT) -#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1) -#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) -#define cpu_has_xsaveopt boot_cpu_has(X86_FEATURE_XSAVEOPT) #define cpu_has_xsaves boot_cpu_has(X86_FEATURE_XSAVES) #define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) -#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) -#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) -#define cpu_has_perfctr_nb boot_cpu_has(X86_FEATURE_PERFCTR_NB) -#define cpu_has_perfctr_l2 boot_cpu_has(X86_FEATURE_PERFCTR_L2) -#define cpu_has_cx8 boot_cpu_has(X86_FEATURE_CX8) -#define cpu_has_cx16 boot_cpu_has(X86_FEATURE_CX16) -#define cpu_has_eager_fpu boot_cpu_has(X86_FEATURE_EAGER_FPU) -#define cpu_has_topoext boot_cpu_has(X86_FEATURE_TOPOEXT) -#define cpu_has_bpext boot_cpu_has(X86_FEATURE_BPEXT) +/* + * Do not add any more of those clumsy macros - use static_cpu_has_safe() for + * fast paths and boot_cpu_has() otherwise! + */ #if __GNUC__ >= 4 extern void warn_pre_alternatives(void); diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 5a08bc8bff33..c54beb44c4c1 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -553,7 +553,7 @@ do { \ if (cpu_has_xmm) { \ xor_speed(&xor_block_pIII_sse); \ xor_speed(&xor_block_sse_pf64); \ - } else if (cpu_has_mmx) { \ + } else if (boot_cpu_has(X86_FEATURE_MMX)) { \ xor_speed(&xor_block_pII_mmx); \ xor_speed(&xor_block_p5_mmx); \ } else { \ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a8816b325162..34c3ad608dd4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -304,7 +304,7 @@ static void amd_get_topology(struct cpuinfo_x86 *c) int cpu = smp_processor_id(); /* get information required for multi-node processors */ - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { u32 eax, ebx, ecx, edx; cpuid(0x8000001e, &eax, &ebx, &ecx, &edx); @@ -922,7 +922,7 @@ static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum) void set_dr_addr_mask(unsigned long mask, int dr) { - if (!cpu_has_bpext) + if (!boot_cpu_has(X86_FEATURE_BPEXT)) return; switch (dr) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e14d5bd8671f..4d5279c95d5f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1445,7 +1445,9 @@ void cpu_init(void) printk(KERN_INFO "Initializing CPU#%d\n", cpu); - if (cpu_feature_enabled(X86_FEATURE_VME) || cpu_has_tsc || cpu_has_de) + if (cpu_feature_enabled(X86_FEATURE_VME) || + cpu_has_tsc || + boot_cpu_has(X86_FEATURE_DE)) cr4_clear_bits(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_current_idt(); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 209ac1e7d1f0..565648bc1a0a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -445,7 +445,8 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (cpu_has_ds) { + + if (boot_cpu_has(X86_FEATURE_DS)) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index e38d338a6447..0b6c52388cf4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -591,7 +591,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) unsigned edx; if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (cpu_has_topoext) + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &edx); else @@ -637,7 +637,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) void init_amd_cacheinfo(struct cpuinfo_x86 *c) { - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { num_cache_leaves = find_num_cache_leaves(c); } else if (c->extended_cpuid_level >= 0x80000006) { if (cpuid_edx(0x80000006) & 0xf000) @@ -809,7 +809,7 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, struct cacheinfo *this_leaf; int i, sibling; - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3b533cf37c74..c870af161008 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -349,7 +349,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (cpu_has_mtrr) + if (boot_cpu_has(X86_FEATURE_MTRR)) get_fixed_ranges(mtrr_state.fixed_ranges); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index f891b4750f04..5c3d149ee91c 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -682,7 +682,7 @@ void __init mtrr_bp_init(void) phys_addr = 32; - if (cpu_has_mtrr) { + if (boot_cpu_has(X86_FEATURE_MTRR)) { mtrr_if = &generic_mtrr_ops; size_or_mask = SIZE_OR_MASK_BITS(36); size_and_mask = 0x00f00000; diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index 1cee5d2d7ece..3ea177cb7366 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -160,7 +160,7 @@ static inline int amd_pmu_addr_offset(int index, bool eventsel) if (offset) return offset; - if (!cpu_has_perfctr_core) + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) offset = index; else offset = index << 1; @@ -652,7 +652,7 @@ static __initconst const struct x86_pmu amd_pmu = { static int __init amd_core_pmu_init(void) { - if (!cpu_has_perfctr_core) + if (!boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) return 0; switch (boot_cpu_data.x86) { diff --git a/arch/x86/kernel/cpu/perf_event_amd_uncore.c b/arch/x86/kernel/cpu/perf_event_amd_uncore.c index cc6cedb8f25d..49742746a6c9 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_amd_uncore.c @@ -523,10 +523,10 @@ static int __init amd_uncore_init(void) if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) goto fail_nodev; - if (!cpu_has_topoext) + if (!boot_cpu_has(X86_FEATURE_TOPOEXT)) goto fail_nodev; - if (cpu_has_perfctr_nb) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) { amd_uncore_nb = alloc_percpu(struct amd_uncore *); if (!amd_uncore_nb) { ret = -ENOMEM; @@ -540,7 +540,7 @@ static int __init amd_uncore_init(void) ret = 0; } - if (cpu_has_perfctr_l2) { + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) { amd_uncore_l2 = alloc_percpu(struct amd_uncore *); if (!amd_uncore_l2) { ret = -ENOMEM; @@ -583,10 +583,11 @@ fail_online: /* amd_uncore_nb/l2 should have been freed by cleanup_cpu_online */ amd_uncore_nb = amd_uncore_l2 = NULL; - if (cpu_has_perfctr_l2) + + if (boot_cpu_has(X86_FEATURE_PERFCTR_L2)) perf_pmu_unregister(&amd_l2_pmu); fail_l2: - if (cpu_has_perfctr_nb) + if (boot_cpu_has(X86_FEATURE_PERFCTR_NB)) perf_pmu_unregister(&amd_nb_pmu); if (amd_uncore_l2) free_percpu(amd_uncore_l2); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index be39b5fde4b9..22abea04731e 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -12,7 +12,7 @@ */ static void fpu__init_cpu_ctx_switch(void) { - if (!cpu_has_eager_fpu) + if (!boot_cpu_has(X86_FEATURE_EAGER_FPU)) stts(); else clts(); @@ -287,7 +287,7 @@ static void __init fpu__init_system_ctx_switch(void) current_thread_info()->status = 0; /* Auto enable eagerfpu for xsaveopt */ - if (cpu_has_xsaveopt && eagerfpu != DISABLE) + if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; if (xfeatures_mask & XFEATURE_MASK_EAGER) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index 50a3fad5b89f..2bcfb5f2bc44 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -300,6 +300,10 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; if (bp->attr.bp_addr & (bp->attr.bp_len - 1)) return -EINVAL; + + if (!boot_cpu_has(X86_FEATURE_BPEXT)) + return -EOPNOTSUPP; + /* * It's impossible to use a range breakpoint to fake out * user vs kernel detection because bp_len - 1 can't @@ -307,8 +311,6 @@ static int arch_build_bp_info(struct perf_event *bp) * breakpoints, then we'll have to check for kprobe-blacklisted * addresses anywhere in the range. */ - if (!cpu_has_bpext) - return -EOPNOTSUPP; info->mask = bp->attr.bp_len - 1; info->len = X86_BREAKPOINT_LEN_1; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f2281e9cfdbe..24d57f77b3c1 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -304,7 +304,7 @@ do { \ static bool match_smt(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { - if (cpu_has_topoext) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { int cpu1 = c->cpu_index, cpu2 = o->cpu_index; if (c->phys_proc_id == o->phys_proc_id && diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 524619351961..483231ebbb0b 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -357,8 +357,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) tss = &per_cpu(cpu_tss, get_cpu()); /* make room for real-mode segments */ tsk->thread.sp0 += 16; - if (cpu_has_sep) + + if (static_cpu_has_safe(X86_FEATURE_SEP)) tsk->thread.sysenter_cs = 0; + load_sp0(tss, &tsk->thread); put_cpu(); diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c index 90555bf60aa4..92e2eacb3321 100644 --- a/arch/x86/mm/setup_nx.c +++ b/arch/x86/mm/setup_nx.c @@ -31,7 +31,7 @@ early_param("noexec", noexec_setup); void x86_configure_nx(void) { - if (cpu_has_nx && !disable_nx) + if (boot_cpu_has(X86_FEATURE_NX) && !disable_nx) __supported_pte_mask |= _PAGE_NX; else __supported_pte_mask &= ~_PAGE_NX; @@ -39,7 +39,7 @@ void x86_configure_nx(void) void __init x86_report_nx(void) { - if (!cpu_has_nx) { + if (!boot_cpu_has(X86_FEATURE_NX)) { printk(KERN_NOTICE "Notice: NX (Execute Disable) protection " "missing in CPU!\n"); } else { diff --git a/drivers/char/hw_random/via-rng.c b/drivers/char/hw_random/via-rng.c index 0c98a9d51a24..44ce80606944 100644 --- a/drivers/char/hw_random/via-rng.c +++ b/drivers/char/hw_random/via-rng.c @@ -140,7 +140,7 @@ static int via_rng_init(struct hwrng *rng) * RNG configuration like it used to be the case in this * register */ if ((c->x86 == 6) && (c->x86_model >= 0x0f)) { - if (!cpu_has_xstore_enabled) { + if (!boot_cpu_has(X86_FEATURE_XSTORE_EN)) { pr_err(PFX "can't enable hardware RNG " "if XSTORE is not enabled\n"); return -ENODEV; @@ -200,8 +200,9 @@ static int __init mod_init(void) { int err; - if (!cpu_has_xstore) + if (!boot_cpu_has(X86_FEATURE_XSTORE)) return -ENODEV; + pr_info("VIA RNG detected\n"); err = hwrng_register(&via_rng); if (err) { diff --git a/drivers/crypto/padlock-aes.c b/drivers/crypto/padlock-aes.c index da2d6777bd09..97a364694bfc 100644 --- a/drivers/crypto/padlock-aes.c +++ b/drivers/crypto/padlock-aes.c @@ -515,7 +515,7 @@ static int __init padlock_init(void) if (!x86_match_cpu(padlock_cpu_id)) return -ENODEV; - if (!cpu_has_xcrypt_enabled) { + if (!boot_cpu_has(X86_FEATURE_XCRYPT_EN)) { printk(KERN_NOTICE PFX "VIA PadLock detected, but not enabled. Hmm, strange...\n"); return -ENODEV; } diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index 4e154c9b9206..8c5f90647b7a 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c @@ -540,7 +540,7 @@ static int __init padlock_init(void) struct shash_alg *sha1; struct shash_alg *sha256; - if (!x86_match_cpu(padlock_sha_ids) || !cpu_has_phe_enabled) + if (!x86_match_cpu(padlock_sha_ids) || !boot_cpu_has(X86_FEATURE_PHE_EN)) return -ENODEV; /* Register the newly added algorithm module if on * diff --git a/drivers/iommu/intel_irq_remapping.c b/drivers/iommu/intel_irq_remapping.c index 1fae1881648c..c12ba4516df2 100644 --- a/drivers/iommu/intel_irq_remapping.c +++ b/drivers/iommu/intel_irq_remapping.c @@ -753,7 +753,7 @@ static inline void set_irq_posting_cap(void) * should have X86_FEATURE_CX16 support, this has been confirmed * with Intel hardware guys. */ - if ( cpu_has_cx16 ) + if (boot_cpu_has(X86_FEATURE_CX16)) intel_irq_remap_ops.capability |= 1 << IRQ_POSTING_CAP; for_each_iommu(iommu, drhd) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 974be09e7556..42a378a4eefb 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -923,7 +923,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (cpu_has_xmm4_2) + if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) return 0; #endif return 1; -- cgit v1.2.3 From 9abb0ecdee69a2577560cc283368e490da974934 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Mon, 21 Dec 2015 12:01:14 -0800 Subject: x86/mm: Drop WARN from multi-BAR check ioremapping multiple BARs produces a warning with a message "Your kernel is fine". This message mostly serves to comfort kernel developers. Users do not read the message, they only see the big scary warning which means something must be horribly broken with their system. Less dramatically, the warn also sets the taint flag which makes it difficult to differentiate problems. If the kernel is actually fine as the warning claims it doesn't make sense for it to be tainted. Change the WARN_ONCE to a pr_warn with the caller of the ioremap. Signed-off-by: Laura Abbott Link: http://lkml.kernel.org/r/1450728074-31029-1-git-send-email-labbott@fedoraproject.org Signed-off-by: Thomas Gleixner --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9c78f3bcd67..0d8d53d1f5cc 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -194,8 +194,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, * Check if the request spans more than any BAR in the iomem resource * tree. */ - WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size), - KERN_INFO "Info: mapping multiple BARs. Your kernel is fine."); + if (iomem_map_sanity_check(unaligned_phys_addr, unaligned_size)) + pr_warn("caller %pS mapping multiple BARs\n", caller); return ret_addr; err_free_area: -- cgit v1.2.3 From d9fe4fab11976e56b2e992980bf6ce948bdf02ac Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 22 Dec 2015 17:54:23 -0700 Subject: x86/mm/pat: Add untrack_pfn_moved for mremap mremap() with MREMAP_FIXED on a VM_PFNMAP range causes the following WARN_ON_ONCE() message in untrack_pfn(). WARNING: CPU: 1 PID: 3493 at arch/x86/mm/pat.c:985 untrack_pfn+0xbd/0xd0() Call Trace: [] dump_stack+0x45/0x57 [] warn_slowpath_common+0x86/0xc0 [] warn_slowpath_null+0x1a/0x20 [] untrack_pfn+0xbd/0xd0 [] unmap_single_vma+0x80e/0x860 [] unmap_vmas+0x55/0xb0 [] unmap_region+0xac/0x120 [] do_munmap+0x28a/0x460 [] move_vma+0x1b3/0x2e0 [] SyS_mremap+0x3b3/0x510 [] entry_SYSCALL_64_fastpath+0x12/0x71 MREMAP_FIXED moves a pfnmap from old vma to new vma. untrack_pfn() is called with the old vma after its pfnmap page table has been removed, which causes follow_phys() to fail. The new vma has a new pfnmap to the same pfn & cache type with VM_PAT set. Therefore, we only need to clear VM_PAT from the old vma in this case. Add untrack_pfn_moved(), which clears VM_PAT from a given old vma. move_vma() is changed to call this function with the old vma when VM_PFNMAP is set. move_vma() then calls do_munmap(), and untrack_pfn() is a no-op since VM_PAT is cleared. Reported-by: Stas Sergeev Signed-off-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Borislav Petkov Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1450832064-10093-2-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 10 ++++++++++ include/asm-generic/pgtable.h | 10 +++++++++- mm/mremap.c | 4 ++++ 3 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 188e3e07eeeb..1aca073ba571 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -992,6 +992,16 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, vma->vm_flags &= ~VM_PAT; } +/* + * untrack_pfn_moved is called, while mremapping a pfnmap for a new region, + * with the old vma after its pfnmap page table has been removed. The new + * vma has a new pfnmap to the same pfn & cache type with VM_PAT set. + */ +void untrack_pfn_moved(struct vm_area_struct *vma) +{ + vma->vm_flags &= ~VM_PAT; +} + pgprot_t pgprot_writecombine(pgprot_t prot) { return __pgprot(pgprot_val(prot) | diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 14b0ff32fb9f..3a6803cb0ec9 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -569,7 +569,7 @@ static inline int track_pfn_copy(struct vm_area_struct *vma) } /* - * untrack_pfn_vma is called while unmapping a pfnmap for a region. + * untrack_pfn is called while unmapping a pfnmap for a region. * untrack can be called for a specific region indicated by pfn and size or * can be for the entire vma (in which case pfn, size are zero). */ @@ -577,6 +577,13 @@ static inline void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size) { } + +/* + * untrack_pfn_moved is called while mremapping a pfnmap for a new region. + */ +static inline void untrack_pfn_moved(struct vm_area_struct *vma) +{ +} #else extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, @@ -586,6 +593,7 @@ extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); +extern void untrack_pfn_moved(struct vm_area_struct *vma); #endif #ifdef __HAVE_COLOR_ZERO_PAGE diff --git a/mm/mremap.c b/mm/mremap.c index c25bc6268e46..de824e72c3e8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -319,6 +319,10 @@ static unsigned long move_vma(struct vm_area_struct *vma, hiwater_vm = mm->hiwater_vm; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); + /* Tell pfnmap has moved from this vma */ + if (unlikely(vma->vm_flags & VM_PFNMAP)) + untrack_pfn_moved(vma); + if (do_munmap(mm, old_addr, old_len) < 0) { /* OOM: unable to split vma, just get accounts right */ vm_unacct_memory(excess >> PAGE_SHIFT); -- cgit v1.2.3 From 2039e6acaf94d83ec6b6d9f3d0bce7ea1f099918 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 22 Dec 2015 17:54:24 -0700 Subject: x86/mm/pat: Change free_memtype() to support shrinking case Using mremap() to shrink the map size of a VM_PFNMAP range causes the following error message, and leaves the pfn range allocated. x86/PAT: test:3493 freeing invalid memtype [mem 0x483200000-0x4863fffff] This is because rbt_memtype_erase(), called from free_memtype() with spin_lock held, only supports to free a whole memtype node in memtype_rbroot. Therefore, this patch changes rbt_memtype_erase() to support a request that shrinks the size of a memtype node for mremap(). memtype_rb_exact_match() is renamed to memtype_rb_match(), and is enhanced to support EXACT_MATCH and END_MATCH in @match_type. Since the memtype_rbroot tree allows overlapping ranges, rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free a whole node for the munmap case. If no such entry is found, it then checks with END_MATCH, i.e. shrink the size of a node from the end for the mremap case. On the mremap case, rbt_memtype_erase() proceeds in two steps, 1) remove the node, and then 2) insert the updated node. This allows proper update of augmented values, subtree_max_end, in the tree. Signed-off-by: Toshi Kani Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Borislav Petkov Cc: stsp@list.ru Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1450832064-10093-3-git-send-email-toshi.kani@hpe.com Signed-off-by: Thomas Gleixner --- arch/x86/mm/pat.c | 2 +- arch/x86/mm/pat_rbtree.c | 52 +++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 44 insertions(+), 10 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 1aca073ba571..031782e74231 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -586,7 +586,7 @@ int free_memtype(u64 start, u64 end) entry = rbt_memtype_erase(start, end); spin_unlock(&memtype_lock); - if (!entry) { + if (IS_ERR(entry)) { pr_info("x86/PAT: %s:%d freeing invalid memtype [mem %#010Lx-%#010Lx]\n", current->comm, current->pid, start, end - 1); return -EINVAL; diff --git a/arch/x86/mm/pat_rbtree.c b/arch/x86/mm/pat_rbtree.c index 63931080366a..2f7702253ccf 100644 --- a/arch/x86/mm/pat_rbtree.c +++ b/arch/x86/mm/pat_rbtree.c @@ -98,8 +98,13 @@ static struct memtype *memtype_rb_lowest_match(struct rb_root *root, return last_lower; /* Returns NULL if there is no overlap */ } -static struct memtype *memtype_rb_exact_match(struct rb_root *root, - u64 start, u64 end) +enum { + MEMTYPE_EXACT_MATCH = 0, + MEMTYPE_END_MATCH = 1 +}; + +static struct memtype *memtype_rb_match(struct rb_root *root, + u64 start, u64 end, int match_type) { struct memtype *match; @@ -107,7 +112,12 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, while (match != NULL && match->start < end) { struct rb_node *node; - if (match->start == start && match->end == end) + if ((match_type == MEMTYPE_EXACT_MATCH) && + (match->start == start) && (match->end == end)) + return match; + + if ((match_type == MEMTYPE_END_MATCH) && + (match->start < start) && (match->end == end)) return match; node = rb_next(&match->rb); @@ -117,7 +127,7 @@ static struct memtype *memtype_rb_exact_match(struct rb_root *root, match = NULL; } - return NULL; /* Returns NULL if there is no exact match */ + return NULL; /* Returns NULL if there is no match */ } static int memtype_rb_check_conflict(struct rb_root *root, @@ -210,12 +220,36 @@ struct memtype *rbt_memtype_erase(u64 start, u64 end) { struct memtype *data; - data = memtype_rb_exact_match(&memtype_rbroot, start, end); - if (!data) - goto out; + /* + * Since the memtype_rbroot tree allows overlapping ranges, + * rbt_memtype_erase() checks with EXACT_MATCH first, i.e. free + * a whole node for the munmap case. If no such entry is found, + * it then checks with END_MATCH, i.e. shrink the size of a node + * from the end for the mremap case. + */ + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_EXACT_MATCH); + if (!data) { + data = memtype_rb_match(&memtype_rbroot, start, end, + MEMTYPE_END_MATCH); + if (!data) + return ERR_PTR(-EINVAL); + } + + if (data->start == start) { + /* munmap: erase this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + } else { + /* mremap: update the end value of this node */ + rb_erase_augmented(&data->rb, &memtype_rbroot, + &memtype_rb_augment_cb); + data->end = start; + data->subtree_max_end = data->end; + memtype_rb_insert(&memtype_rbroot, data); + return NULL; + } - rb_erase_augmented(&data->rb, &memtype_rbroot, &memtype_rb_augment_cb); -out: return data; } -- cgit v1.2.3 From 1f1a89ac05f6e88aa341e86e57435fdbb1177c0c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Fri, 8 Jan 2016 09:55:33 +0000 Subject: x86/mm: Micro-optimise clflush_cache_range() Whilst inspecting the asm for clflush_cache_range() and some perf profiles that required extensive flushing of single cachelines (from part of the intel-gpu-tools GPU benchmarks), we noticed that gcc was reloading boot_cpu_data.x86_clflush_size on every iteration of the loop. We can manually hoist that read which perf regarded as taking ~25% of the function time for a single cacheline flush. Signed-off-by: Chris Wilson Reviewed-by: Ross Zwisler Acked-by: "H. Peter Anvin" Cc: Toshi Kani Cc: Borislav Petkov Cc: Luis R. Rodriguez Cc: Stephen Rothwell Cc: Sai Praneeth Link: http://lkml.kernel.org/r/1452246933-10890-1-git-send-email-chris@chris-wilson.co.uk Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a3137a4feed1..6000ad7f560c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -129,14 +129,16 @@ within(unsigned long addr, unsigned long start, unsigned long end) */ void clflush_cache_range(void *vaddr, unsigned int size) { - unsigned long clflush_mask = boot_cpu_data.x86_clflush_size - 1; + const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; + void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); void *vend = vaddr + size; - void *p; + + if (p >= vend) + return; mb(); - for (p = (void *)((unsigned long)vaddr & ~clflush_mask); - p < vend; p += boot_cpu_data.x86_clflush_size) + for (; p < vend; p += clflush_size) clflushopt(p); mb(); -- cgit v1.2.3 From 71b3c126e61177eb693423f2e18a1914205b165e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 6 Jan 2016 12:21:01 -0800 Subject: x86/mm: Add barriers and document switch_mm()-vs-flush synchronization When switch_mm() activates a new PGD, it also sets a bit that tells other CPUs that the PGD is in use so that TLB flush IPIs will be sent. In order for that to work correctly, the bit needs to be visible prior to loading the PGD and therefore starting to fill the local TLB. Document all the barriers that make this work correctly and add a couple that were missing. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mmu_context.h | 33 ++++++++++++++++++++++++++++++++- arch/x86/mm/tlb.c | 29 ++++++++++++++++++++++++++--- 2 files changed, 58 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 379cd3658799..1edc9cd198b8 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, #endif cpumask_set_cpu(cpu, mm_cpumask(next)); - /* Re-load page tables */ + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs much + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. This barrier synchronizes with + * remote TLB flushers. Fortunately, load_cr3 is + * serializing and thus acts as a full barrier. + * + */ load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); /* Stop flush ipis for the previous mm */ @@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, * schedule, protecting us from simultaneous changes. */ cpumask_set_cpu(cpu, mm_cpumask(next)); + /* * We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. + * + * As above, this is a barrier that forces + * TLB repopulation to be ordered after the + * store to mm_cpumask. */ load_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8ddb5d0d66fb..8f4cc3dfac32 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -161,7 +161,10 @@ void flush_tlb_current_task(void) preempt_disable(); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + + /* This is an implicit full barrier that synchronizes with switch_mm. */ local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); - if (current->active_mm != mm) + if (current->active_mm != mm) { + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; + } if (!current->mm) { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + goto out; } if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) base_pages_to_flush = (end - start) >> PAGE_SHIFT; + /* + * Both branches below are implicit full barriers (MOV to CR or + * INVLPG) that synchronize with switch_mm. + */ if (base_pages_to_flush > tlb_single_page_flush_ceiling) { base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); @@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) preempt_disable(); if (current->active_mm == mm) { - if (current->mm) + if (current->mm) { + /* + * Implicit full barrier (INVLPG) that synchronizes + * with switch_mm. + */ __flush_tlb_one(start); - else + } else { leave_mm(smp_processor_id()); + + /* Synchronize with switch_mm. */ + smp_mb(); + } } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) -- cgit v1.2.3 From c9e0d39126af9e338495b719e9d565ca9ebcd4c5 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 11 Jan 2016 12:04:28 -0500 Subject: x86/mm/pat: Make split_page_count() check for empty levels to fix /proc/meminfo output In CONFIG_PAGEALLOC_DEBUG=y builds, we disable 2M pages. Unfortunatly when we split up mappings during boot, split_page_count() doesn't take this into account, and starts decrementing an empty direct_pages_count[] level. This results in /proc/meminfo showing crazy things like: DirectMap2M: 18446744073709543424 kB Signed-off-by: Dave Jones Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6000ad7f560c..fc6a4c8f6e2a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages) static void split_page_count(int level) { + if (direct_pages_count[level] == 0) + return; + direct_pages_count[level]--; direct_pages_count[level - 1] += PTRS_PER_PTE; } -- cgit v1.2.3 From b500f77baea576b74c3961613b67eaffcdf65c57 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 12 Jan 2016 10:19:30 +0800 Subject: x86/mm: Use PAGE_ALIGNED instead of IS_ALIGNED Use PAGE_ALIGEND macro in to simplify code. Signed-off-by: Kefeng Wang Cc: Cc: Alexander Kuleshov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1452565170-11083-1-git-send-email-wangkefeng.wang@huawei.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec081fe0ce2c..8829482d69ec 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, if (phys_addr < (phys_addr_t)0x40000000) return; - if (IS_ALIGNED(addr, PAGE_SIZE) && - IS_ALIGNED(next, PAGE_SIZE)) { + if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) { /* * Do not free direct mapping pages since they were * freed when offlining, or simplely not in use. -- cgit v1.2.3 From 9e08f57d684ac2f40685f55f659564bfd91a971e Mon Sep 17 00:00:00 2001 From: Daniel Cashman Date: Thu, 14 Jan 2016 15:20:06 -0800 Subject: x86: mm: support ARCH_MMAP_RND_BITS x86: arch_mmap_rnd() uses hard-coded values, 8 for 32-bit and 28 for 64-bit, to generate the random offset for the mmap base address. This value represents a compromise between increased ASLR effectiveness and avoiding address-space fragmentation. Replace it with a Kconfig option, which is sensibly bounded, so that platform developers may choose where to place this compromise. Keep default values as new minimums. Signed-off-by: Daniel Cashman Cc: Russell King Acked-by: Kees Cook Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Don Zickus Cc: Eric W. Biederman Cc: Heinrich Schuchardt Cc: Josh Poimboeuf Cc: Kirill A. Shutemov Cc: Naoya Horiguchi Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Thomas Gleixner Cc: David Rientjes Cc: Mark Salyzyn Cc: Jeff Vander Stoep Cc: Nick Kralevich Cc: Catalin Marinas Cc: Will Deacon Cc: "H. Peter Anvin" Cc: Hector Marco-Gisbert Cc: Borislav Petkov Cc: Ralf Baechle Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 16 ++++++++++++++++ arch/x86/mm/mmap.c | 12 ++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5d2293417946..24f362bf3ec6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -83,6 +83,8 @@ config X86 select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_ARCH_KGDB select HAVE_ARCH_KMEMCHECK + select HAVE_ARCH_MMAP_RND_BITS if MMU + select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK @@ -184,6 +186,20 @@ config HAVE_LATENCYTOP_SUPPORT config MMU def_bool y +config ARCH_MMAP_RND_BITS_MIN + default 28 if 64BIT + default 8 + +config ARCH_MMAP_RND_BITS_MAX + default 32 if 64BIT + default 16 + +config ARCH_MMAP_RND_COMPAT_BITS_MIN + default 8 + +config ARCH_MMAP_RND_COMPAT_BITS_MAX + default 16 + config SBUS bool diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 844b06d67df4..96bd1e2bffaf 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -69,14 +69,14 @@ unsigned long arch_mmap_rnd(void) { unsigned long rnd; - /* - * 8 bits of randomness in 32bit mmaps, 20 address space bits - * 28 bits of randomness in 64bit mmaps, 40 address space bits - */ if (mmap_is_ia32()) - rnd = (unsigned long)get_random_int() % (1<<8); +#ifdef CONFIG_COMPAT + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_compat_bits) - 1); +#else + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); +#endif else - rnd = (unsigned long)get_random_int() % (1<<28); + rnd = (unsigned long)get_random_int() & ((1 << mmap_rnd_bits) - 1); return rnd << PAGE_SHIFT; } -- cgit v1.2.3 From ddc58f27f9eee9117219936f77e90ad5b2e00e96 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:52:56 -0800 Subject: mm: drop tail page refcounting Tail page refcounting is utterly complicated and painful to support. It uses ->_mapcount on tail pages to store how many times this page is pinned. get_page() bumps ->_mapcount on tail page in addition to ->_count on head. This information is required by split_huge_page() to be able to distribute pins from head of compound page to tails during the split. We will need ->_mapcount to account PTE mappings of subpages of the compound page. We eliminate need in current meaning of ->_mapcount in tail pages by forbidding split entirely if the page is pinned. The only user of tail page refcounting is THP which is marked BROKEN for now. Let's drop all this mess. It makes get_page() and put_page() much simpler. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Tested-by: Aneesh Kumar K.V Acked-by: Vlastimil Babka Acked-by: Jerome Marchand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/gup.c | 4 - arch/powerpc/mm/hugetlbpage.c | 13 +- arch/s390/mm/gup.c | 13 +- arch/sparc/mm/gup.c | 14 +-- arch/x86/mm/gup.c | 4 - include/linux/mm.h | 47 ++------ include/linux/mm_types.h | 17 +-- mm/gup.c | 34 +----- mm/huge_memory.c | 41 +------ mm/hugetlb.c | 2 +- mm/internal.h | 44 ------- mm/swap.c | 273 +++--------------------------------------- 12 files changed, 40 insertions(+), 466 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/mips/mm/gup.c b/arch/mips/mm/gup.c index 349995d19c7f..36a35115dc2e 100644 --- a/arch/mips/mm/gup.c +++ b/arch/mips/mm/gup.c @@ -87,8 +87,6 @@ static int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -153,8 +151,6 @@ static int gup_huge_pud(pud_t pud, unsigned long addr, unsigned long end, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 61b8b7ccea4f..6bd3afa775d3 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -999,7 +999,7 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, { unsigned long mask; unsigned long pte_end; - struct page *head, *page, *tail; + struct page *head, *page; pte_t pte; int refs; @@ -1022,7 +1022,6 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, head = pte_page(pte); page = head + ((addr & (sz-1)) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -1044,15 +1043,5 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, return 0; } - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 21c74a71e2ab..1a2cad52babf 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -55,7 +55,7 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { unsigned long mask, result; - struct page *head, *page, *tail; + struct page *head, *page; int refs; result = write ? 0 : _SEGMENT_ENTRY_PROTECT; @@ -67,7 +67,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -88,16 +87,6 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 0; } - /* - * Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/arch/sparc/mm/gup.c b/arch/sparc/mm/gup.c index 2e5c4fc2daa9..9091c5daa2e1 100644 --- a/arch/sparc/mm/gup.c +++ b/arch/sparc/mm/gup.c @@ -56,8 +56,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(head); return 0; } - if (head != page) - get_huge_page_tail(page); pages[*nr] = page; (*nr)++; @@ -70,7 +68,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (!(pmd_val(pmd) & _PAGE_VALID)) @@ -82,7 +80,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, refs = 0; head = pmd_page(pmd); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -103,15 +100,6 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, return 0; } - /* Any tail page need their mapcount reference taken before we - * return. - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index ae9a37bf1371..65b0261d4bf3 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -136,8 +136,6 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; @@ -212,8 +210,6 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; - if (PageTail(page)) - get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/include/linux/mm.h b/include/linux/mm.h index 839d9e9a1c38..34387351930c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -466,44 +466,9 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline bool __compound_tail_refcounted(struct page *page) -{ - return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page); -} - -/* - * This takes a head page as parameter and tells if the - * tail page reference counting can be skipped. - * - * For this to be safe, PageSlab and PageHeadHuge must remain true on - * any given page where they return true here, until all tail pins - * have been released. - */ -static inline bool compound_tail_refcounted(struct page *page) -{ - VM_BUG_ON_PAGE(!PageHead(page), page); - return __compound_tail_refcounted(page); -} - -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run from under us. - */ - VM_BUG_ON_PAGE(!PageTail(page), page); - VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - if (compound_tail_refcounted(compound_head(page))) - atomic_inc(&page->_mapcount); -} - -extern bool __get_page_tail(struct page *page); - static inline void get_page(struct page *page) { - if (unlikely(PageTail(page))) - if (likely(__get_page_tail(page))) - return; + page = compound_head(page); /* * Getting a normal page or the head of a compound page * requires to already have an elevated page->_count. @@ -528,7 +493,15 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } -void put_page(struct page *page); +void __put_page(struct page *page); + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + if (put_page_testzero(page)) + __put_page(page); +} + void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6bc9a0ce2253..faf6fe88d6b3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -81,20 +81,9 @@ struct page { union { /* - * Count of ptes mapped in - * mms, to show when page is - * mapped & limit reverse map - * searches. - * - * Used also for tail pages - * refcounting instead of - * _count. Tail pages cannot - * be mapped and keeping the - * tail page _count zero at - * all times guarantees - * get_page_unless_zero() will - * never succeed on tail - * pages. + * Count of ptes mapped in mms, to show + * when page is mapped & limit reverse + * map searches. */ atomic_t _mapcount; diff --git a/mm/gup.c b/mm/gup.c index 1c50b506888d..7017abea9fd6 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -130,7 +130,7 @@ retry: } if (flags & FOLL_GET) - get_page_foll(page); + get_page(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -1153,7 +1153,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pmd_write(orig)) @@ -1162,7 +1162,6 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1183,24 +1182,13 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } - /* - * Any tail pages need their mapcount reference taken before we - * return. (This allows the THP code to bump their ref count when - * they are split into base pages). - */ - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - struct page *head, *page, *tail; + struct page *head, *page; int refs; if (write && !pud_write(orig)) @@ -1209,7 +1197,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1230,12 +1217,6 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } @@ -1244,7 +1225,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, struct page **pages, int *nr) { int refs; - struct page *head, *page, *tail; + struct page *head, *page; if (write && !pgd_write(orig)) return 0; @@ -1252,7 +1233,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); - tail = page; do { VM_BUG_ON_PAGE(compound_head(page) != head, page); pages[*nr] = page; @@ -1273,12 +1253,6 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } - while (refs--) { - if (PageTail(tail)) - get_huge_page_tail(tail); - tail++; - } - return 1; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 016b70ab5ed4..72fd53fe2b61 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1038,37 +1038,6 @@ unlock: spin_unlock(ptl); } -/* - * Save CONFIG_DEBUG_PAGEALLOC from faulting falsely on tail pages - * during copy_user_huge_page()'s copy_page_rep(): in the case when - * the source page gets split and a tail freed before copy completes. - * Called under pmd_lock of checked pmd, so safe from splitting itself. - */ -static void get_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - atomic_add(HPAGE_PMD_NR, &page->_count); - while (++page < endpage) - get_huge_page_tail(page); - } else { - get_page(page); - } -} - -static void put_user_huge_page(struct page *page) -{ - if (IS_ENABLED(CONFIG_DEBUG_PAGEALLOC)) { - struct page *endpage = page + HPAGE_PMD_NR; - - while (page < endpage) - put_page(page++); - } else { - put_page(page); - } -} - static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -1221,7 +1190,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, ret |= VM_FAULT_WRITE; goto out_unlock; } - get_user_huge_page(page); + get_page(page); spin_unlock(ptl); alloc: if (transparent_hugepage_enabled(vma) && @@ -1242,7 +1211,7 @@ alloc: split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; } - put_user_huge_page(page); + put_page(page); } count_vm_event(THP_FAULT_FALLBACK); goto out; @@ -1253,7 +1222,7 @@ alloc: put_page(new_page); if (page) { split_huge_pmd(vma, pmd, address); - put_user_huge_page(page); + put_page(page); } else split_huge_pmd(vma, pmd, address); ret |= VM_FAULT_FALLBACK; @@ -1275,7 +1244,7 @@ alloc: spin_lock(ptl); if (page) - put_user_huge_page(page); + put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); mem_cgroup_cancel_charge(new_page, memcg, true); @@ -1360,7 +1329,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page), page); if (flags & FOLL_GET) - get_page_foll(page); + get_page(page); out: return page; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e924529f7b38..84af842e828d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3865,7 +3865,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, same_page: if (pages) { pages[i] = mem_map_offset(page, pfn_offset); - get_page_foll(pages[i]); + get_page(pages[i]); } if (vmas) diff --git a/mm/internal.h b/mm/internal.h index 38e24b89e4c4..569facd1f6da 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -66,50 +66,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __get_page_tail_foll(struct page *page, - bool get_page_head) -{ - /* - * If we're getting a tail page, the elevated page->_count is - * required only in the head page and we will elevate the head - * page->_count and tail page->_mapcount. - * - * We elevate page_tail->_mapcount for tail pages to force - * page_tail->_count to be zero at all times to avoid getting - * false positives from get_page_unless_zero() with - * speculative page access (like in - * page_cache_get_speculative()) on tail pages. - */ - VM_BUG_ON_PAGE(atomic_read(&compound_head(page)->_count) <= 0, page); - if (get_page_head) - atomic_inc(&compound_head(page)->_count); - get_huge_page_tail(page); -} - -/* - * This is meant to be called as the FOLL_GET operation of - * follow_page() and it must be called while holding the proper PT - * lock while the pte (or pmd_trans_huge) is still mapping the page. - */ -static inline void get_page_foll(struct page *page) -{ - if (unlikely(PageTail(page))) - /* - * This is safe only because - * __split_huge_page_refcount() can't run under - * get_page_foll() because we hold the proper PT lock. - */ - __get_page_tail_foll(page, true); - else { - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); - } -} - extern unsigned long highest_memmap_pfn; /* diff --git a/mm/swap.c b/mm/swap.c index 39395fb549c0..3d65480422e8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -89,260 +89,14 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } -/** - * Two special cases here: we could avoid taking compound_lock_irqsave - * and could skip the tail refcounting(in _mapcount). - * - * 1. Hugetlbfs page: - * - * PageHeadHuge will remain true until the compound page - * is released and enters the buddy allocator, and it could - * not be split by __split_huge_page_refcount(). - * - * So if we see PageHeadHuge set, and we have the tail page pin, - * then we could safely put head page. - * - * 2. Slab THP page: - * - * PG_slab is cleared before the slab frees the head page, and - * tail pin cannot be the last reference left on the head page, - * because the slab code is free to reuse the compound page - * after a kfree/kmem_cache_free without having to check if - * there's any tail pin left. In turn all tail pinsmust be always - * released while the head is still pinned by the slab code - * and so we know PG_slab will be still set too. - * - * So if we see PageSlab set, and we have the tail page pin, - * then we could safely put head page. - */ -static __always_inline -void put_unrefcounted_compound_page(struct page *page_head, struct page *page) -{ - /* - * If @page is a THP tail, we must read the tail page - * flags after the head page flags. The - * __split_huge_page_refcount side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here, see the comment above this function. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab THP page, - * the tail pin must not be the last reference - * held on the page, because the PG_slab cannot - * be cleared before all tail pins (which skips - * the _mapcount tail refcounting) have been - * released. - * - * If this is the tail of a hugetlbfs page, - * the tail pin may be the last reference on - * the page instead, because PageHeadHuge will - * not go away until the compound page enters - * the buddy allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - } else - /* - * __split_huge_page_refcount run before us, - * @page was a THP tail. The split @page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - if (put_page_testzero(page)) - __put_single_page(page); -} - -static __always_inline -void put_refcounted_compound_page(struct page *page_head, struct page *page) -{ - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * @page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. - */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The @page_head may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != compound_head(page), page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. - */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* @page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } -} - -static void put_compound_page(struct page *page) -{ - struct page *page_head; - - /* - * We see the PageCompound set and PageTail not set, so @page maybe: - * 1. hugetlbfs head page, or - * 2. THP head page. - */ - if (likely(!PageTail(page))) { - if (put_page_testzero(page)) { - /* - * By the time all refcounts have been released - * split_huge_page cannot run anymore from under us. - */ - if (PageHead(page)) - __put_compound_page(page); - else - __put_single_page(page); - } - return; - } - - /* - * We see the PageCompound set and PageTail set, so @page maybe: - * 1. a tail hugetlbfs page, or - * 2. a tail THP page, or - * 3. a split THP page. - * - * Case 3 is possible, as we may race with - * __split_huge_page_refcount tearing down a THP page. - */ - page_head = compound_head(page); - if (!__compound_tail_refcounted(page_head)) - put_unrefcounted_compound_page(page_head, page); - else - put_refcounted_compound_page(page_head, page); -} - -void put_page(struct page *page) +void __put_page(struct page *page) { if (unlikely(PageCompound(page))) - put_compound_page(page); - else if (put_page_testzero(page)) + __put_compound_page(page); + else __put_single_page(page); } -EXPORT_SYMBOL(put_page); - -/* - * This function is exported but must not be called by anything other - * than get_page(). It implements the slow path of get_page(). - */ -bool __get_page_tail(struct page *page) -{ - /* - * This takes care of get_page() if run on a tail page - * returned by one of the get_user_pages/follow_page variants. - * get_user_pages/follow_page itself doesn't need the compound - * lock because it runs __get_page_tail_foll() under the - * proper PT lock that already serializes against - * split_huge_page(). - */ - unsigned long flags; - bool got; - struct page *page_head = compound_head(page); - - /* Ref to put_compound_page() comment. */ - if (!__compound_tail_refcounted(page_head)) { - smp_rmb(); - if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - __get_page_tail_foll(page, true); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - return false; - } - } - - got = false; - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. - */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); - } - return got; -} -EXPORT_SYMBOL(__get_page_tail); +EXPORT_SYMBOL(__put_page); /** * put_pages_list() - release a list of pages @@ -918,15 +672,6 @@ void release_pages(struct page **pages, int nr, bool cold) for (i = 0; i < nr; i++) { struct page *page = pages[i]; - if (unlikely(PageCompound(page))) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; - } - put_compound_page(page); - continue; - } - /* * Make sure the IRQ-safe lock-holding time does not get * excessive with a continuous string of pages from the @@ -937,9 +682,19 @@ void release_pages(struct page **pages, int nr, bool cold) zone = NULL; } + page = compound_head(page); if (!put_page_testzero(page)) continue; + if (PageCompound(page)) { + if (zone) { + spin_unlock_irqrestore(&zone->lru_lock, flags); + zone = NULL; + } + __put_compound_page(page); + continue; + } + if (PageLRU(page)) { struct zone *pagezone = page_zone(page); -- cgit v1.2.3 From 1f19617d775879c0863e697fb648154e56918051 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 15 Jan 2016 16:53:35 -0800 Subject: x86, thp: remove infrastructure for handling splitting PMDs With new refcounting we don't need to mark PMDs splitting. Let's drop code to handle this. Signed-off-by: Kirill A. Shutemov Tested-by: Sasha Levin Acked-by: Jerome Marchand Cc: Aneesh Kumar K.V Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Dave Hansen Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: Steve Capper Cc: Johannes Weiner Cc: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 9 --------- arch/x86/include/asm/pgtable_types.h | 2 -- arch/x86/mm/gup.c | 13 +------------ arch/x86/mm/pgtable.c | 13 ------------- 4 files changed, 1 insertion(+), 36 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d3eee663c41f..61e706c9f710 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -162,11 +162,6 @@ static inline int pmd_large(pmd_t pte) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_splitting(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_SPLITTING; -} - static inline int pmd_trans_huge(pmd_t pmd) { return pmd_val(pmd) & _PAGE_PSE; @@ -808,10 +803,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - #define __HAVE_ARCH_PMD_WRITE static inline int pmd_write(pmd_t pmd) { diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a471cadb9630..d1b76f88ccd1 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -22,7 +22,6 @@ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 -#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ #define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ @@ -46,7 +45,6 @@ #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) #define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) #define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) -#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING) #define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_KMEMCHECK diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 65b0261d4bf3..f8cb3e8ac250 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -156,18 +156,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, pmd_t pmd = *pmdp; next = pmd_addr_end(addr, end); - /* - * The pmd_trans_splitting() check below explains why - * pmdp_splitting_flush has to flush the tlb, to stop - * this gup-fast code from running while we set the - * splitting bit in the pmd. Returning zero will take - * the slow path that will call wait_split_huge_page() - * if the pmd is still in splitting state. gup-fast - * can't because it has irq disabled and - * wait_split_huge_page() would never return as the - * tlb flush IPI wouldn't run. - */ - if (pmd_none(pmd) || pmd_trans_splitting(pmd)) + if (pmd_none(pmd)) return 0; if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ee9c2e3a7199..4eb287e25043 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -505,19 +505,6 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma, return young; } - -void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - int set; - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)pmdp); - if (set) { - /* need tlb flush only to serialize against gup-fast */ - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); - } -} #endif /** -- cgit v1.2.3 From 4b94ffdc4163bae1ec73b6e977ffb7a7da3d06d3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:22 -0800 Subject: x86, mm: introduce vmem_altmap to augment vmemmap_populate() In support of providing struct page for large persistent memory capacities, use struct vmem_altmap to change the default policy for allocating memory for the memmap array. The default vmemmap_populate() allocates page table storage area from the page allocator. Given persistent memory capacities relative to DRAM it may not be feasible to store the memmap in 'System Memory'. Instead vmem_altmap represents pre-allocated "device pages" to satisfy vmemmap_alloc_block_buf() requests. Signed-off-by: Dan Williams Reported-by: kbuild test robot Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 33 ++++++++++++++---- drivers/nvdimm/pmem.c | 6 ++-- include/linux/memory_hotplug.h | 3 +- include/linux/memremap.h | 39 +++++++++++++++++++--- include/linux/mm.h | 9 ++++- kernel/memremap.c | 72 +++++++++++++++++++++++++++++++++++++-- mm/memory_hotplug.c | 67 ++++++++++++++++++++++++++----------- mm/page_alloc.c | 11 +++++- mm/sparse-vmemmap.c | 76 ++++++++++++++++++++++++++++++++++++++++-- mm/sparse.c | 8 +++-- 10 files changed, 282 insertions(+), 42 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8829482d69ec..5488d21123bd 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -714,6 +715,12 @@ static void __meminit free_pagetable(struct page *page, int order) { unsigned long magic; unsigned int nr_pages = 1 << order; + struct vmem_altmap *altmap = to_vmem_altmap((unsigned long) page); + + if (altmap) { + vmem_altmap_free(altmap, nr_pages); + return; + } /* bootmem page has reserved flag */ if (PageReserved(page)) { @@ -1017,13 +1024,19 @@ int __ref arch_remove_memory(u64 start, u64 size) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; + struct page *page = pfn_to_page(start_pfn); + struct vmem_altmap *altmap; struct zone *zone; int ret; - zone = page_zone(pfn_to_page(start_pfn)); - kernel_physical_mapping_remove(start, start + size); + /* With altmap the first mapped page is offset from @start */ + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + page += vmem_altmap_offset(altmap); + zone = page_zone(page); ret = __remove_pages(zone, start_pfn, nr_pages); WARN_ON_ONCE(ret); + kernel_physical_mapping_remove(start, start + size); return ret; } @@ -1235,7 +1248,7 @@ static void __meminitdata *p_start, *p_end; static int __meminitdata node_start; static int __meminit vmemmap_populate_hugepages(unsigned long start, - unsigned long end, int node) + unsigned long end, int node, struct vmem_altmap *altmap) { unsigned long addr; unsigned long next; @@ -1258,7 +1271,7 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, if (pmd_none(*pmd)) { void *p; - p = vmemmap_alloc_block_buf(PMD_SIZE, node); + p = __vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); if (p) { pte_t entry; @@ -1279,7 +1292,8 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, addr_end = addr + PMD_SIZE; p_end = p + PMD_SIZE; continue; - } + } else if (altmap) + return -ENOMEM; /* no fallback */ } else if (pmd_large(*pmd)) { vmemmap_verify((pte_t *)pmd, node, addr, next); continue; @@ -1293,11 +1307,16 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) { + struct vmem_altmap *altmap = to_vmem_altmap(start); int err; if (cpu_has_pse) - err = vmemmap_populate_hugepages(start, end, node); - else + err = vmemmap_populate_hugepages(start, end, node, altmap); + else if (altmap) { + pr_err_once("%s: no cpu support for altmap allocations\n", + __func__); + err = -ENOMEM; + } else err = vmemmap_populate_basepages(start, end, node); if (!err) sync_global_pgds(start, end - 1, 0); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 904629b97c4f..be3f8547b702 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -178,7 +178,8 @@ static struct pmem_device *pmem_alloc(struct device *dev, pmem->pfn_flags = PFN_DEV; if (pmem_should_map_pages(dev)) { - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, res, + NULL); pmem->pfn_flags |= PFN_MAP; } else pmem->virt_addr = (void __pmem *) devm_memremap(dev, @@ -387,7 +388,8 @@ static int nvdimm_namespace_attach_pfn(struct nd_namespace_common *ndns) /* establish pfn range for lookup, and switch to direct map */ pmem = dev_get_drvdata(dev); devm_memunmap(dev, (void __force *) pmem->virt_addr); - pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res); + pmem->virt_addr = (void __pmem *) devm_memremap_pages(dev, &nsio->res, + NULL); pmem->pfn_flags |= PFN_MAP; if (IS_ERR(pmem->virt_addr)) { rc = PTR_ERR(pmem->virt_addr); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2ea574ff9714..43405992d027 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -275,7 +275,8 @@ extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); -extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); +extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d90721c178bb..aa3e82a80d7b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -4,21 +4,53 @@ struct resource; struct device; + +/** + * struct vmem_altmap - pre-allocated storage for vmemmap_populate + * @base_pfn: base of the entire dev_pagemap mapping + * @reserve: pages mapped, but reserved for driver use (relative to @base) + * @free: free pages set aside in the mapping for memmap storage + * @align: pages reserved to meet allocation alignments + * @alloc: track pages consumed, private to vmemmap_populate() + */ +struct vmem_altmap { + const unsigned long base_pfn; + const unsigned long reserve; + unsigned long free; + unsigned long align; + unsigned long alloc; +}; + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); + +#if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_ZONE_DEVICE) +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); +#else +static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + return NULL; +} +#endif + /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings + * @altmap: pre-allocated/reserved memory for vmemmap allocations * @dev: host device of the mapping for debug */ struct dev_pagemap { - /* TODO: vmem_altmap and percpu_ref count */ + struct vmem_altmap *altmap; + const struct resource *res; struct device *dev; }; #ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res); +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct vmem_altmap *altmap); struct dev_pagemap *find_dev_pagemap(resource_size_t phys); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res) + struct resource *res, struct vmem_altmap *altmap) { /* * Fail attempts to call devm_memremap_pages() without @@ -34,5 +66,4 @@ static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return NULL; } #endif - #endif /* _LINUX_MEMREMAP_H_ */ diff --git a/include/linux/mm.h b/include/linux/mm.h index d9fe12d45c21..8bb0907a3603 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2217,7 +2217,14 @@ pud_t *vmemmap_pud_populate(pgd_t *pgd, unsigned long addr, int node); pmd_t *vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node); pte_t *vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node); void *vmemmap_alloc_block(unsigned long size, int node); -void *vmemmap_alloc_block_buf(unsigned long size, int node); +struct vmem_altmap; +void *__vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap); +static inline void *vmemmap_alloc_block_buf(unsigned long size, int node) +{ + return __vmemmap_alloc_block_buf(size, node, NULL); +} + void vmemmap_verify(pte_t *, int, unsigned long, unsigned long); int vmemmap_populate_basepages(unsigned long start, unsigned long end, int node); diff --git a/kernel/memremap.c b/kernel/memremap.c index 61cfbf4d3054..562f6471fe90 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -166,6 +166,7 @@ struct page_map { struct resource res; struct percpu_ref *ref; struct dev_pagemap pgmap; + struct vmem_altmap altmap; }; static void pgmap_radix_release(struct resource *res) @@ -183,6 +184,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) struct page_map *page_map = data; struct resource *res = &page_map->res; resource_size_t align_start, align_size; + struct dev_pagemap *pgmap = &page_map->pgmap; pgmap_radix_release(res); @@ -190,6 +192,8 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, + "%s: failed to free all reserved pages\n", __func__); } /* assumes rcu_read_lock() held at entry */ @@ -203,11 +207,23 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) return page_map ? &page_map->pgmap : NULL; } -void *devm_memremap_pages(struct device *dev, struct resource *res) +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @res: "host memory" address range + * @altmap: optional descriptor for allocating the memmap from @res + * + * Note, the expectation is that @res is a host memory range that could + * feasibly be treated as a "System RAM" range, i.e. not a device mmio + * range, but this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); resource_size_t key, align_start, align_size; + struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid; @@ -220,14 +236,27 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); + if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { + dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", + __func__); + return ERR_PTR(-ENXIO); + } + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); + pgmap = &page_map->pgmap; memcpy(&page_map->res, res, sizeof(*res)); - page_map->pgmap.dev = dev; + pgmap->dev = dev; + if (altmap) { + memcpy(&page_map->altmap, altmap, sizeof(*altmap)); + pgmap->altmap = &page_map->altmap; + } + pgmap->res = &page_map->res; + mutex_lock(&pgmap_lock); error = 0; for (key = res->start; key <= res->end; key += SECTION_SIZE) { @@ -273,4 +302,43 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + return altmap->reserve + altmap->free; +} + +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + /* + * 'memmap_start' is the virtual address for the first "struct + * page" in this range of the vmemmap array. In the case of + * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * pointer arithmetic, so we can perform this to_vmem_altmap() + * conversion without concern for the initialization state of + * the struct page fields. + */ + struct page *page = (struct page *) memmap_start; + struct dev_pagemap *pgmap; + + /* + * Uncoditionally retrieve a dev_pagemap associated with the + * given physical address, this is only for use in the + * arch_{add|remove}_memory() for setting up and tearing down + * the memmap. + */ + rcu_read_lock(); + pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + rcu_read_unlock(); + + return pgmap ? pgmap->altmap : NULL; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 92f95952692b..4af58a3a8ffa 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -506,10 +507,25 @@ int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn, unsigned long i; int err = 0; int start_sec, end_sec; + struct vmem_altmap *altmap; + /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); + altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); + if (altmap) { + /* + * Validate altmap is within bounds of the total request + */ + if (altmap->base_pfn != phys_start_pfn + || vmem_altmap_offset(altmap) > nr_pages) { + pr_warn_once("memory add fail, invalid altmap\n"); + return -EINVAL; + } + altmap->alloc = 0; + } + for (i = start_sec; i <= end_sec; i++) { err = __add_section(nid, zone, section_nr_to_pfn(i)); @@ -731,7 +747,8 @@ static void __remove_zone(struct zone *zone, unsigned long start_pfn) pgdat_resize_unlock(zone->zone_pgdat, &flags); } -static int __remove_section(struct zone *zone, struct mem_section *ms) +static int __remove_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { unsigned long start_pfn; int scn_nr; @@ -748,7 +765,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms) start_pfn = section_nr_to_pfn(scn_nr); __remove_zone(zone, start_pfn); - sparse_remove_one_section(zone, ms); + sparse_remove_one_section(zone, ms, map_offset); return 0; } @@ -767,9 +784,32 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, unsigned long nr_pages) { unsigned long i; - int sections_to_remove; - resource_size_t start, size; - int ret = 0; + unsigned long map_offset = 0; + int sections_to_remove, ret = 0; + + /* In the ZONE_DEVICE case device driver owns the memory region */ + if (is_dev_zone(zone)) { + struct page *page = pfn_to_page(phys_start_pfn); + struct vmem_altmap *altmap; + + altmap = to_vmem_altmap((unsigned long) page); + if (altmap) + map_offset = vmem_altmap_offset(altmap); + } else { + resource_size_t start, size; + + start = phys_start_pfn << PAGE_SHIFT; + size = nr_pages * PAGE_SIZE; + + ret = release_mem_region_adjustable(&iomem_resource, start, + size); + if (ret) { + resource_size_t endres = start + size - 1; + + pr_warn("Unable to release resource <%pa-%pa> (%d)\n", + &start, &endres, ret); + } + } /* * We can only remove entire sections @@ -777,23 +817,12 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - start = phys_start_pfn << PAGE_SHIFT; - size = nr_pages * PAGE_SIZE; - - /* in the ZONE_DEVICE case device driver owns the memory region */ - if (!is_dev_zone(zone)) - ret = release_mem_region_adjustable(&iomem_resource, start, size); - if (ret) { - resource_size_t endres = start + size - 1; - - pr_warn("Unable to release resource <%pa-%pa> (%d)\n", - &start, &endres, ret); - } - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; - ret = __remove_section(zone, __pfn_to_section(pfn)); + + ret = __remove_section(zone, __pfn_to_section(pfn), map_offset); + map_offset = 0; if (ret) break; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 25409714160e..7ca3fe6ef92d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include @@ -4485,8 +4486,9 @@ static inline unsigned long wait_table_bits(unsigned long size) void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn, enum memmap_context context) { - pg_data_t *pgdat = NODE_DATA(nid); + struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); unsigned long end_pfn = start_pfn + size; + pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; struct zone *z; unsigned long nr_initialised = 0; @@ -4494,6 +4496,13 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (highest_memmap_pfn < end_pfn - 1) highest_memmap_pfn = end_pfn - 1; + /* + * Honor reservation requested by the driver for this ZONE_DEVICE + * memory + */ + if (altmap && start_pfn == altmap->base_pfn) + start_pfn += altmap->reserve; + z = &pgdat->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4cba9c2783a1..b60802b3e5ea 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -70,7 +71,7 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) } /* need to make sure size is all the same during early stage */ -void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +static void * __meminit alloc_block_buf(unsigned long size, int node) { void *ptr; @@ -87,6 +88,77 @@ void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) return ptr; } +static unsigned long __meminit vmem_altmap_next_pfn(struct vmem_altmap *altmap) +{ + return altmap->base_pfn + altmap->reserve + altmap->alloc + + altmap->align; +} + +static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap) +{ + unsigned long allocated = altmap->alloc + altmap->align; + + if (altmap->free > allocated) + return altmap->free - allocated; + return 0; +} + +/** + * vmem_altmap_alloc - allocate pages from the vmem_altmap reservation + * @altmap - reserved page pool for the allocation + * @nr_pfns - size (in pages) of the allocation + * + * Allocations are aligned to the size of the request + */ +static unsigned long __meminit vmem_altmap_alloc(struct vmem_altmap *altmap, + unsigned long nr_pfns) +{ + unsigned long pfn = vmem_altmap_next_pfn(altmap); + unsigned long nr_align; + + nr_align = 1UL << find_first_bit(&nr_pfns, BITS_PER_LONG); + nr_align = ALIGN(pfn, nr_align) - pfn; + + if (nr_pfns + nr_align > vmem_altmap_nr_free(altmap)) + return ULONG_MAX; + altmap->alloc += nr_pfns; + altmap->align += nr_align; + return pfn + nr_align; +} + +static void * __meminit altmap_alloc_block_buf(unsigned long size, + struct vmem_altmap *altmap) +{ + unsigned long pfn, nr_pfns; + void *ptr; + + if (size & ~PAGE_MASK) { + pr_warn_once("%s: allocations must be multiple of PAGE_SIZE (%ld)\n", + __func__, size); + return NULL; + } + + nr_pfns = size >> PAGE_SHIFT; + pfn = vmem_altmap_alloc(altmap, nr_pfns); + if (pfn < ULONG_MAX) + ptr = __va(__pfn_to_phys(pfn)); + else + ptr = NULL; + pr_debug("%s: pfn: %#lx alloc: %ld align: %ld nr: %#lx\n", + __func__, pfn, altmap->alloc, altmap->align, nr_pfns); + + return ptr; +} + +/* need to make sure size is all the same during early stage */ +void * __meminit __vmemmap_alloc_block_buf(unsigned long size, int node, + struct vmem_altmap *altmap) +{ + if (altmap) + return altmap_alloc_block_buf(size, altmap); + return alloc_block_buf(size, node); +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -103,7 +175,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); + void *p = alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); diff --git a/mm/sparse.c b/mm/sparse.c index d1b48b691ac8..3717ceed4177 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -748,7 +748,7 @@ static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) if (!memmap) return; - for (i = 0; i < PAGES_PER_SECTION; i++) { + for (i = 0; i < nr_pages; i++) { if (PageHWPoison(&memmap[i])) { atomic_long_sub(1, &num_poisoned_pages); ClearPageHWPoison(&memmap[i]); @@ -788,7 +788,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) free_map_bootmem(memmap); } -void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) +void sparse_remove_one_section(struct zone *zone, struct mem_section *ms, + unsigned long map_offset) { struct page *memmap = NULL; unsigned long *usemap = NULL, flags; @@ -804,7 +805,8 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) } pgdat_resize_unlock(pgdat, &flags); - clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); + clear_hwpoisoned_pages(memmap + map_offset, + PAGES_PER_SECTION - map_offset); free_section_usemap(memmap, usemap); } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3 From f25748e3c34eb8bb54853e9adba2d3dcf030503c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:43 -0800 Subject: mm, dax: convert vmf_insert_pfn_pmd() to pfn_t Similar to the conversion of vm_insert_mixed() use pfn_t in the vmf_insert_pfn_pmd() to tag the resulting pte with _PAGE_DEVICE when the pfn is backed by a devm_memremap_pages() mapping. Signed-off-by: Dan Williams Cc: Dave Hansen Cc: Matthew Wilcox Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 5 +++++ arch/x86/mm/pat.c | 5 +++-- fs/dax.c | 2 +- include/asm-generic/pgtable.h | 6 ++++-- include/linux/huge_mm.h | 2 +- include/linux/pfn_t.h | 8 ++++++++ mm/huge_memory.c | 11 +++++++---- mm/memory.c | 2 +- 8 files changed, 30 insertions(+), 11 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4c668f15a532..6585a8b10fea 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -286,6 +286,11 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); } +static inline pmd_t pmd_mkdevmap(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DEVMAP); +} + static inline pmd_t pmd_mkhuge(pmd_t pmd) { return pmd_set_flags(pmd, _PAGE_PSE); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 031782e74231..f4ae536b0914 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -949,7 +950,7 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, } int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn) + pfn_t pfn) { enum page_cache_mode pcm; @@ -957,7 +958,7 @@ int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, return 0; /* Set prot based on lookup */ - pcm = lookup_memtype((resource_size_t)pfn << PAGE_SHIFT); + pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(vma->vm_page_prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); diff --git a/fs/dax.c b/fs/dax.c index 574763eed8a3..96ac3072463d 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -693,7 +693,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, dax_unmap_atomic(bdev, &dax); result |= vmf_insert_pfn_pmd(vma, address, pmd, - pfn_t_to_pfn(dax.pfn), write); + dax.pfn, write); } out: diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index af0a6cc81635..0b3c0d39ef75 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -1,6 +1,8 @@ #ifndef _ASM_GENERIC_PGTABLE_H #define _ASM_GENERIC_PGTABLE_H +#include + #ifndef __ASSEMBLY__ #ifdef CONFIG_MMU @@ -549,7 +551,7 @@ static inline int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, * by vm_insert_pfn(). */ static inline int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn) + pfn_t pfn) { return 0; } @@ -584,7 +586,7 @@ extern int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size); extern int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - unsigned long pfn); + pfn_t pfn); extern int track_pfn_copy(struct vm_area_struct *vma); extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, unsigned long size); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 0160201993d4..8ca35a131904 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -37,7 +37,7 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa); int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, - unsigned long pfn, bool write); + pfn_t pfn, bool write); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index bdaa275d7623..0703b5360d31 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -77,6 +77,13 @@ static inline pte_t pfn_t_pte(pfn_t pfn, pgprot_t pgprot) } #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pfn_t_pmd(pfn_t pfn, pgprot_t pgprot) +{ + return pfn_pmd(pfn_t_to_pfn(pfn), pgprot); +} +#endif + #ifdef __HAVE_ARCH_PTE_DEVMAP static inline bool pfn_t_devmap(pfn_t pfn) { @@ -90,5 +97,6 @@ static inline bool pfn_t_devmap(pfn_t pfn) return false; } pte_t pte_mkdevmap(pte_t pte); +pmd_t pmd_mkdevmap(pmd_t pmd); #endif #endif /* _LINUX_PFN_T_H_ */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 996e86dbeb43..d93706013a55 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -931,14 +932,16 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, pgprot_t prot, bool write) + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; spinlock_t *ptl; ptl = pmd_lock(mm, pmd); - entry = pmd_mkhuge(pfn_pmd(pfn, prot)); + entry = pmd_mkhuge(pfn_t_pmd(pfn, prot)); + if (pfn_t_devmap(pfn)) + entry = pmd_mkdevmap(entry); if (write) { entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); @@ -949,7 +952,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, } int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned long pfn, bool write) + pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; /* @@ -961,7 +964,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); - BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); + BUG_ON(!pfn_t_devmap(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; diff --git a/mm/memory.c b/mm/memory.c index 7f03652723ea..552ae3d69435 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1567,7 +1567,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_insert(vma, &pgprot, pfn)) + if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV))) return -EINVAL; ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot); -- cgit v1.2.3 From 3565fce3a6597e91b8dee3e8e36ebf70f8b7ef9b Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 15 Jan 2016 16:56:55 -0800 Subject: mm, x86: get_user_pages() for dax mappings A dax mapping establishes a pte with _PAGE_DEVMAP set when the driver has established a devm_memremap_pages() mapping, i.e. when the pfn_t return from ->direct_access() has PFN_DEV and PFN_MAP set. Later, when encountering _PAGE_DEVMAP during a page table walk we lookup and pin a struct dev_pagemap instance to keep the result of pfn_to_page() valid until put_page(). Signed-off-by: Dan Williams Tested-by: Logan Gunthorpe Cc: Dave Hansen Cc: Mel Gorman Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 7 ++++ arch/x86/mm/gup.c | 57 ++++++++++++++++++++++++++++++-- include/linux/huge_mm.h | 10 +++++- include/linux/mm.h | 59 +++++++++++++++++++++++---------- kernel/memremap.c | 12 +++++++ mm/gup.c | 30 +++++++++++++++-- mm/huge_memory.c | 75 +++++++++++++++++++++++++++++++++--------- mm/swap.c | 1 + 8 files changed, 212 insertions(+), 39 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6a0ad82c8d0f..0687c4748b8f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -479,6 +479,13 @@ static inline int pte_present(pte_t a) return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); } +#ifdef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t a) +{ + return (pte_flags(a) & _PAGE_DEVMAP) == _PAGE_DEVMAP; +} +#endif + #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index f8cb3e8ac250..6d5eb5900372 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -9,6 +9,7 @@ #include #include #include +#include #include @@ -63,6 +64,16 @@ retry: #endif } +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -71,7 +82,9 @@ retry: static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; unsigned long mask; + int nr_start = *nr; pte_t *ptep; mask = _PAGE_PRESENT|_PAGE_USER; @@ -89,13 +102,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, return 0; } - if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { + page = pte_page(pte); + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + pte_unmap(ptep); + return 0; + } + } else if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; } VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); get_page(page); + put_dev_pagemap(pgmap); SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -114,6 +135,32 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + unsigned long pfn = pmd_pfn(pmd); + struct dev_pagemap *pgmap = NULL; + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -126,9 +173,13 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, mask |= _PAGE_RW; if ((pmd_flags(pmd) & mask) != mask) return 0; + + VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); + if (pmd_devmap(pmd)) + return __gup_device_huge_pmd(pmd, addr, end, pages, nr); + /* hugepages are never "special" */ VM_BUG_ON(pmd_flags(pmd) & _PAGE_SPECIAL); - VM_BUG_ON(!pfn_valid(pmd_pfn(pmd))); refs = 0; head = pmd_page(pmd); diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d39fa60bd6bf..cfe81e10bd54 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -38,7 +38,6 @@ extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, int prot_numa); int vmf_insert_pfn_pmd(struct vm_area_struct *, unsigned long addr, pmd_t *, pfn_t pfn, bool write); - enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, @@ -55,6 +54,9 @@ enum transparent_hugepage_flag { #define HPAGE_PMD_NR (1< #include #include +#include #include #include #include @@ -465,17 +466,6 @@ static inline int page_count(struct page *page) return atomic_read(&compound_head(page)->_count); } -static inline void get_page(struct page *page) -{ - page = compound_head(page); - /* - * Getting a normal page or the head of a compound page - * requires to already have an elevated page->_count. - */ - VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); - atomic_inc(&page->_count); -} - static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); @@ -494,13 +484,6 @@ static inline void init_page_count(struct page *page) void __put_page(struct page *page); -static inline void put_page(struct page *page) -{ - page = compound_head(page); - if (put_page_testzero(page)) - __put_page(page); -} - void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); @@ -682,17 +665,50 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE +void get_zone_device_page(struct page *page); +void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else +static inline void get_zone_device_page(struct page *page) +{ +} +static inline void put_zone_device_page(struct page *page) +{ +} static inline bool is_zone_device_page(const struct page *page) { return false; } #endif +static inline void get_page(struct page *page) +{ + page = compound_head(page); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON_PAGE(atomic_read(&page->_count) <= 0, page); + atomic_inc(&page->_count); + + if (unlikely(is_zone_device_page(page))) + get_zone_device_page(page); +} + +static inline void put_page(struct page *page) +{ + page = compound_head(page); + + if (put_page_testzero(page)) + __put_page(page); + + if (unlikely(is_zone_device_page(page))) + put_zone_device_page(page); +} + #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif @@ -1444,6 +1460,13 @@ static inline void sync_mm_rss(struct mm_struct *mm) } #endif +#ifndef __HAVE_ARCH_PTE_DEVMAP +static inline int pte_devmap(pte_t pte) +{ + return 0; +} +#endif + int vma_wants_writenotify(struct vm_area_struct *vma); extern pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, diff --git a/kernel/memremap.c b/kernel/memremap.c index 3eb8944265d5..e517a16cb426 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -169,6 +169,18 @@ struct page_map { struct vmem_altmap altmap; }; +void get_zone_device_page(struct page *page) +{ + percpu_ref_get(page->pgmap->ref); +} +EXPORT_SYMBOL(get_zone_device_page); + +void put_zone_device_page(struct page *page) +{ + put_dev_pagemap(page->pgmap); +} +EXPORT_SYMBOL(put_zone_device_page); + static void pgmap_radix_release(struct resource *res) { resource_size_t key; diff --git a/mm/gup.c b/mm/gup.c index e95b0cb6ed81..aa21c4b865a5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -62,6 +63,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap = NULL; struct page *page; spinlock_t *ptl; pte_t *ptep, pte; @@ -98,7 +100,17 @@ retry: } page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { + if (!page && pte_devmap(pte) && (flags & FOLL_GET)) { + /* + * Only return device mapping pages in the FOLL_GET case since + * they are only valid while holding the pgmap reference. + */ + pgmap = get_dev_pagemap(pte_pfn(pte), NULL); + if (pgmap) + page = pte_page(pte); + else + goto no_page; + } else if (unlikely(!page)) { if (flags & FOLL_DUMP) { /* Avoid special (like zero) pages in core dumps */ page = ERR_PTR(-EFAULT); @@ -129,8 +141,15 @@ retry: goto retry; } - if (flags & FOLL_GET) + if (flags & FOLL_GET) { get_page(page); + + /* drop the pgmap reference now that we hold the page */ + if (pgmap) { + put_dev_pagemap(pgmap); + pgmap = NULL; + } + } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) @@ -237,6 +256,13 @@ struct page *follow_page_mask(struct vm_area_struct *vma, } if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) return no_page_table(vma, flags); + if (pmd_devmap(*pmd)) { + ptl = pmd_lock(mm, pmd); + page = follow_devmap_pmd(vma, address, pmd, flags); + spin_unlock(ptl); + if (page) + return page; + } if (likely(!pmd_trans_huge(*pmd))) return follow_page_pte(vma, address, pmd, flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 82bed2bec3ed..b2db98136af9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -974,6 +975,63 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, return VM_FAULT_NOPAGE; } +static void touch_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd) +{ + pmd_t _pmd; + + /* + * We should set the dirty bit only for FOLL_WRITE but for now + * the dirty bit in the pmd is meaningless. And if the dirty + * bit will become meaningful and we'll only set it with + * FOLL_WRITE, an atomic set_bit will be required on the pmd to + * set the young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); +} + +struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, int flags) +{ + unsigned long pfn = pmd_pfn(*pmd); + struct mm_struct *mm = vma->vm_mm; + struct dev_pagemap *pgmap; + struct page *page; + + assert_spin_locked(pmd_lockptr(mm, pmd)); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + return NULL; + + if (pmd_present(*pmd) && pmd_devmap(*pmd)) + /* pass */; + else + return NULL; + + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); + + /* + * device mapped pages can only be returned if the + * caller will manage the page reference count. + */ + if (!(flags & FOLL_GET)) + return ERR_PTR(-EEXIST); + + pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT; + pgmap = get_dev_pagemap(pfn, NULL); + if (!pgmap) + return ERR_PTR(-EFAULT); + page = pfn_to_page(pfn); + get_page(page); + put_dev_pagemap(pgmap); + + return page; +} + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) @@ -1331,21 +1389,8 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page = pmd_page(*pmd); VM_BUG_ON_PAGE(!PageHead(page), page); - if (flags & FOLL_TOUCH) { - pmd_t _pmd; - /* - * We should set the dirty bit only for FOLL_WRITE but - * for now the dirty bit in the pmd is meaningless. - * And if the dirty bit will become meaningful and - * we'll only set it with FOLL_WRITE, an atomic - * set_bit will be required on the pmd to set the - * young bit, instead of the current set_pmd_at. - */ - _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, - pmd, _pmd, 1)) - update_mmu_cache_pmd(vma, addr, pmd); - } + if (flags & FOLL_TOUCH) + touch_pmd(vma, addr, pmd); if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { /* * We don't mlock() pte-mapped THPs. This way we can avoid diff --git a/mm/swap.c b/mm/swap.c index 674e2c93da4e..09fe5e97714a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 742563777e8da62197d6cb4b99f4027f59454735 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 29 Jan 2016 11:36:10 +0000 Subject: x86/mm/pat: Avoid truncation when converting cpa->numpages to address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a couple of nasty truncation bugs lurking in the pageattr code that can be triggered when mapping EFI regions, e.g. when we pass a cpa->pgd pointer. Because cpa->numpages is a 32-bit value, shifting left by PAGE_SHIFT will truncate the resultant address to 32-bits. Viorel-Cătălin managed to trigger this bug on his Dell machine that provides a ~5GB EFI region which requires 1236992 pages to be mapped. When calling populate_pud() the end of the region gets calculated incorrectly in the following buggy expression, end = start + (cpa->numpages << PAGE_SHIFT); And only 188416 pages are mapped. Next, populate_pud() gets invoked for a second time because of the loop in __change_page_attr_set_clr(), only this time no pages get mapped because shifting the remaining number of pages (1048576) by PAGE_SHIFT is zero. At which point the loop in __change_page_attr_set_clr() spins forever because we fail to map progress. Hitting this bug depends very much on the virtual address we pick to map the large region at and how many pages we map on the initial run through the loop. This explains why this issue was only recently hit with the introduction of commit a5caa209ba9c ("x86/efi: Fix boot crash by mapping EFI memmap entries bottom-up at runtime, instead of top-down") It's interesting to note that safe uses of cpa->numpages do exist in the pageattr code. If instead of shifting ->numpages we multiply by PAGE_SIZE, no truncation occurs because PAGE_SIZE is a UL value, and so the result is unsigned long. To avoid surprises when users try to convert very large cpa->numpages values to addresses, change the data type from 'int' to 'unsigned long', thereby making it suitable for shifting by PAGE_SHIFT without any type casting. The alternative would be to make liberal use of casting, but that is far more likely to cause problems in the future when someone adds more code and fails to cast properly; this bug was difficult enough to track down in the first place. Reported-and-tested-by: Viorel-Cătălin Răpițeanu Acked-by: Borislav Petkov Cc: Sai Praneeth Prakhya Cc: Signed-off-by: Matt Fleming Link: https://bugzilla.kernel.org/show_bug.cgi?id=110131 Link: http://lkml.kernel.org/r/1454067370-10374-1-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Thomas Gleixner --- arch/x86/mm/pageattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index fc6a4c8f6e2a..2440814b0069 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -33,7 +33,7 @@ struct cpa_data { pgd_t *pgd; pgprot_t mask_set; pgprot_t mask_clr; - int numpages; + unsigned long numpages; int flags; unsigned long pfn; unsigned force_split : 1; @@ -1350,7 +1350,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. */ - BUG_ON(cpa->numpages > numpages); + BUG_ON(cpa->numpages > numpages || !cpa->numpages); numpages -= cpa->numpages; if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) cpa->curpage++; -- cgit v1.2.3