From 85c9f4b04a08f6bc770b77530c22d04103468b8f Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Mon, 13 Oct 2014 15:51:01 -0700 Subject: mm/slab: fix unaligned access on sparc64 Commit bf0dea23a9c0 ("mm/slab: use percpu allocator for cpu cache") changed the allocation method for the cpu cache array from the slab allocator to the percpu allocator. In the percpu allocator case an alignment must be provided to get aligned memory, but that commit mistakenly set this alignment to 0, so the percpu allocator returns an unaligned memory address. This doesn't cause any problem on x86, which permits unaligned access, but it causes problems on sparc64, which needs a strong guarantee of alignment. The following bug report is from David Miller: I'm getting tons of the following on sparc64: [603965.383447] Kernel unaligned access at TPC[546b58] free_block+0x98/0x1a0 [603965.396987] Kernel unaligned access at TPC[546b60] free_block+0xa0/0x1a0 ... [603970.554394] log_unaligned: 333 callbacks suppressed ... This patch provides a proper alignment parameter when allocating cpu cache to fix this unaligned memory access problem on sparc64. 
Reported-by: David Miller Tested-by: David Miller Tested-by: Meelis Roos Signed-off-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 154aac8411c5..eb2b2ea30130 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1992,7 +1992,7 @@ static struct array_cache __percpu *alloc_kmem_cache_cpus( struct array_cache __percpu *cpu_cache; size = sizeof(void *) * entries + sizeof(struct array_cache); - cpu_cache = __alloc_percpu(size, 0); + cpu_cache = __alloc_percpu(size, sizeof(void *)); if (!cpu_cache) return NULL; -- cgit v1.2.3 From 68faed630fc151a7a1c4853df00fb3dcacf782b4 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Mon, 13 Oct 2014 15:51:03 -0700 Subject: mm/cma: fix cma bitmap aligned mask computing The current cma bitmap aligned mask computation is incorrect. It could cause an unexpected alignment when using cma_alloc() if the wanted align order is larger than cma->order_per_bit. Take kvm for example (PAGE_SHIFT = 12): kvm_cma->order_per_bit is set to 6. When kvm_alloc_rma() tries to allocate kvm_rma_pages, it uses 15 as the expected align value. With the current implementation, however, we get 0 as the cma bitmap aligned mask rather than 511. This patch fixes the cma bitmap aligned mask calculation. 
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Weijie Yang Acked-by: Michal Nazarewicz Cc: Joonsoo Kim Cc: "Aneesh Kumar K.V" Cc: [3.17] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/cma.c b/mm/cma.c index 474c644a0dc6..a951a3b3ed36 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -58,7 +58,9 @@ unsigned long cma_get_size(struct cma *cma) static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) { - return (1UL << (align_order >> cma->order_per_bit)) - 1; + if (align_order <= cma->order_per_bit) + return 0; + return (1UL << (align_order - cma->order_per_bit)) - 1; } static unsigned long cma_bitmap_maxno(struct cma *cma) -- cgit v1.2.3 From de9e14eebf33a60712a52a0bc6e08c043c0aba53 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Mon, 13 Oct 2014 15:51:09 -0700 Subject: drivers: dma-contiguous: add initialization from device tree Add a function to create CMA region from previously reserved memory and add support for handling 'shared-dma-pool' reserved-memory device tree nodes. 
Based on previous code provided by Josh Cartwright Signed-off-by: Marek Szyprowski Cc: Arnd Bergmann Cc: Michal Nazarewicz Cc: Grant Likely Cc: Laura Abbott Cc: Josh Cartwright Cc: Joonsoo Kim Cc: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/dma-contiguous.c | 66 +++++++++++++++++++++++++++++++++++++++++++ include/linux/cma.h | 3 ++ mm/cma.c | 62 ++++++++++++++++++++++++++++++++-------- 3 files changed, 120 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 6606abdf880c..473ff4892401 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -211,3 +211,69 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, { return cma_release(dev_get_cma_area(dev), pages, count); } + +/* + * Support for reserved memory regions defined in device tree + */ +#ifdef CONFIG_OF_RESERVED_MEM +#include <linux/of.h> +#include <linux/of_fdt.h> +#include <linux/of_reserved_mem.h> + +#undef pr_fmt +#define pr_fmt(fmt) fmt + +static void rmem_cma_device_init(struct reserved_mem *rmem, struct device *dev) +{ + dev_set_cma_area(dev, rmem->priv); +} + +static void rmem_cma_device_release(struct reserved_mem *rmem, + struct device *dev) +{ + dev_set_cma_area(dev, NULL); +} + +static const struct reserved_mem_ops rmem_cma_ops = { + .device_init = rmem_cma_device_init, + .device_release = rmem_cma_device_release, +}; + +static int __init rmem_cma_setup(struct reserved_mem *rmem) +{ + phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + phys_addr_t mask = align - 1; + unsigned long node = rmem->fdt_node; + struct cma *cma; + int err; + + if (!of_get_flat_dt_prop(node, "reusable", NULL) || + of_get_flat_dt_prop(node, "no-map", NULL)) + return -EINVAL; + + if ((rmem->base & mask) || (rmem->size & mask)) { + pr_err("Reserved memory: incorrect alignment of CMA region\n"); + return -EINVAL; + } + + err = cma_init_reserved_mem(rmem->base, rmem->size, 0, &cma); + if (err) { 
+ pr_err("Reserved memory: unable to setup CMA region\n"); + return err; + } + /* Architecture specific contiguous memory fixup. */ + dma_contiguous_early_fixup(rmem->base, rmem->size); + + if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + dma_contiguous_set_default(cma); + + rmem->ops = &rmem_cma_ops; + rmem->priv = cma; + + pr_info("Reserved memory: created CMA memory pool at %pa, size %ld MiB\n", + &rmem->base, (unsigned long)rmem->size / SZ_1M); + + return 0; +} +RESERVEDMEM_OF_DECLARE(cma, "shared-dma-pool", rmem_cma_setup); +#endif diff --git a/include/linux/cma.h b/include/linux/cma.h index 371b93042520..0430ed05d3b9 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -22,6 +22,9 @@ extern int __init cma_declare_contiguous(phys_addr_t size, phys_addr_t base, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma); +extern int cma_init_reserved_mem(phys_addr_t size, + phys_addr_t base, int order_per_bit, + struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); extern bool cma_release(struct cma *cma, struct page *pages, int count); #endif diff --git a/mm/cma.c b/mm/cma.c index a951a3b3ed36..963bc4add9af 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -142,6 +142,54 @@ static int __init cma_init_reserved_areas(void) } core_initcall(cma_init_reserved_areas); +/** + * cma_init_reserved_mem() - create custom contiguous area from reserved memory + * @base: Base address of the reserved area + * @size: Size of the reserved area (in bytes), + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @res_cma: Pointer to store the created cma region. + * + * This function creates custom contiguous area from already reserved memory. 
+ */ +int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, + int order_per_bit, struct cma **res_cma) +{ + struct cma *cma; + phys_addr_t alignment; + + /* Sanity checks */ + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size || !memblock_is_region_reserved(base, size)) + return -EINVAL; + + /* ensure minimal alignment required by mm core */ + alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); + + /* alignment should be aligned with order_per_bit */ + if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + if (ALIGN(base, alignment) != base || ALIGN(size, alignment) != size) + return -EINVAL; + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma = &cma_areas[cma_area_count]; + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + + return 0; +} + /** * cma_declare_contiguous() - reserve custom contiguous area * @base: Base address of the reserved area optional, use 0 for any @@ -165,7 +213,6 @@ int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t alignment, unsigned int order_per_bit, bool fixed, struct cma **res_cma) { - struct cma *cma; phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t highmem_start = __pa(high_memory); int ret = 0; @@ -237,16 +284,9 @@ int __init cma_declare_contiguous(phys_addr_t base, } } - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. 
- */ - cma = &cma_areas[cma_area_count]; - cma->base_pfn = PFN_DOWN(base); - cma->count = size >> PAGE_SHIFT; - cma->order_per_bit = order_per_bit; - *res_cma = cma; - cma_area_count++; + ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); + if (ret) + goto err; pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, (unsigned long)base); -- cgit v1.2.3 From 64e455079e1bd7787cc47be30b7f601ce682a5f6 Mon Sep 17 00:00:00 2001 From: Peter Feiner Date: Mon, 13 Oct 2014 15:55:46 -0700 Subject: mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared For VMAs that don't want write notifications, PTEs created for read faults have their write bit set. If the read fault happens after VM_SOFTDIRTY is cleared, then the PTE's softdirty bit will remain clear after subsequent writes. Here's a simple code snippet to demonstrate the bug: char* m = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0); system("echo 4 > /proc/$PPID/clear_refs"); /* clear VM_SOFTDIRTY */ assert(*m == '\0'); /* new PTE allows write access */ assert(!soft_dirty(x)); *m = 'x'; /* should dirty the page */ assert(soft_dirty(x)); /* fails */ With this patch, write notifications are enabled when VM_SOFTDIRTY is cleared. Furthermore, to avoid unnecessary faults, write notifications are disabled when VM_SOFTDIRTY is set. As a side effect of enabling and disabling write notifications with care, this patch fixes a bug in mprotect where vm_page_prot bits set by drivers were zapped on mprotect. An analogous bug was fixed in mmap by commit c9d0bf241451 ("mm: uncached vma support with writenotify"). Signed-off-by: Peter Feiner Reported-by: Peter Feiner Suggested-by: Kirill A. 
Shutemov Cc: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Jamie Liu Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: Bjorn Helgaas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 19 +++++++++++++----- include/asm-generic/pgtable.h | 14 ++++++++++++++ include/linux/mm.h | 5 +++++ mm/memory.c | 3 ++- mm/mmap.c | 45 +++++++++++++++++++++++++++---------------- mm/mprotect.c | 20 +++++-------------- 6 files changed, 68 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index b7a7dc963a35..4e0388cffe3d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -827,8 +827,21 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .private = &cp, }; down_read(&mm->mmap_sem); - if (type == CLEAR_REFS_SOFT_DIRTY) + if (type == CLEAR_REFS_SOFT_DIRTY) { + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); + } + downgrade_write(&mm->mmap_sem); + break; + } mmu_notifier_invalidate_range_start(mm, 0, -1); + } for (vma = mm->mmap; vma; vma = vma->vm_next) { cp.vma = vma; if (is_vm_hugetlb_page(vma)) @@ -848,10 +861,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, continue; if (type == CLEAR_REFS_MAPPED && !vma->vm_file) continue; - if (type == CLEAR_REFS_SOFT_DIRTY) { - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - } walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 081ff8826bf6..752e30d63904 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -253,6 +253,20 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #define pgprot_device pgprot_noncached #endif +#ifndef pgprot_modify +#define 
pgprot_modify pgprot_modify +static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) +{ + if (pgprot_val(oldprot) == pgprot_val(pgprot_noncached(oldprot))) + newprot = pgprot_noncached(newprot); + if (pgprot_val(oldprot) == pgprot_val(pgprot_writecombine(oldprot))) + newprot = pgprot_writecombine(newprot); + if (pgprot_val(oldprot) == pgprot_val(pgprot_device(oldprot))) + newprot = pgprot_device(newprot); + return newprot; +} +#endif + /* * When walking page tables, get the address of the next boundary, * or the end address of the range if that comes earlier. Although no diff --git a/include/linux/mm.h b/include/linux/mm.h index 4cd45cb95e6d..02d11ee7f19d 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1974,11 +1974,16 @@ static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); +void vma_set_page_prot(struct vm_area_struct *vma); #else static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) { return __pgprot(0); } +static inline void vma_set_page_prot(struct vm_area_struct *vma) +{ + vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); +} #endif #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/memory.c b/mm/memory.c index e229970e4223..1cc6bfbd872e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2053,7 +2053,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { /* - * VM_MIXEDMAP !pfn_valid() case + * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a + * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. 
* Just mark the pages writable as we can't do any dirty diff --git a/mm/mmap.c b/mm/mmap.c index 93d28c7e5420..7f855206e7fb 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -89,6 +89,25 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) } EXPORT_SYMBOL(vm_get_page_prot); +static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) +{ + return pgprot_modify(oldprot, vm_get_page_prot(vm_flags)); +} + +/* Update vma->vm_page_prot to reflect vma->vm_flags. */ +void vma_set_page_prot(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags); + if (vma_wants_writenotify(vma)) { + vm_flags &= ~VM_SHARED; + vma->vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, + vm_flags); + } +} + + int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio __read_mostly = 50; /* default is 50% */ unsigned long sysctl_overcommit_kbytes __read_mostly; @@ -1475,11 +1494,16 @@ int vma_wants_writenotify(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->page_mkwrite) return 1; - /* The open routine did something to the protections already? */ + /* The open routine did something to the protections that pgprot_modify + * won't preserve? */ if (pgprot_val(vma->vm_page_prot) != - pgprot_val(vm_get_page_prot(vm_flags))) + pgprot_val(vm_pgprot_modify(vma->vm_page_prot, vm_flags))) return 0; + /* Do we need to track softdirty? */ + if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY)) + return 1; + /* Specialty mapping? */ if (vm_flags & VM_PFNMAP) return 0; @@ -1615,21 +1639,6 @@ munmap_back: goto free_vma; } - if (vma_wants_writenotify(vma)) { - pgprot_t pprot = vma->vm_page_prot; - - /* Can vma->vm_page_prot have changed?? - * - * Answer: Yes, drivers may have changed it in their - * f_op->mmap method. - * - * Ensures that vmas marked as uncached stay that way. 
- */ - vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); - if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - } - vma_link(mm, vma, prev, rb_link, rb_parent); /* Once vma denies write, undo our temporary denial count */ if (file) { @@ -1663,6 +1672,8 @@ out: */ vma->vm_flags |= VM_SOFTDIRTY; + vma_set_page_prot(vma); + return addr; unmap_and_free_vma: diff --git a/mm/mprotect.c b/mm/mprotect.c index c43d557941f8..ace93454ce8e 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,13 +29,6 @@ #include #include -#ifndef pgprot_modify -static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) -{ - return newprot; -} -#endif - /* * For a prot_numa update we only hold mmap_sem for read so there is a * potential race with faulting where a pmd was temporarily none. This @@ -93,7 +86,9 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * Avoid taking write faults for pages we * know to be dirty. */ - if (dirty_accountable && pte_dirty(ptent)) + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) ptent = pte_mkwrite(ptent); ptep_modify_prot_commit(mm, addr, pte, ptent); updated = true; @@ -320,13 +315,8 @@ success: * held in write mode. */ vma->vm_flags = newflags; - vma->vm_page_prot = pgprot_modify(vma->vm_page_prot, - vm_get_page_prot(newflags)); - - if (vma_wants_writenotify(vma)) { - vma->vm_page_prot = vm_get_page_prot(newflags & ~VM_SHARED); - dirty_accountable = 1; - } + dirty_accountable = vma_wants_writenotify(vma); + vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); -- cgit v1.2.3