From 155433cb365ee4666bdf7c3c7bc2978b17be36a4 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 10 Mar 2017 20:32:22 +0000 Subject: arm64: cache: Remove support for ASID-tagged VIVT I-caches As a recent change to ARMv8, ASID-tagged VIVT I-caches are removed retrospectively from the architecture. Consequently, we don't need to support them in Linux either. Acked-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/context.c | 3 --- arch/arm64/mm/flush.c | 2 -- 2 files changed, 5 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 68634c630cdd..ab9f5f0fb2c7 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -119,9 +119,6 @@ static void flush_context(unsigned int cpu) /* Queue a TLB invalidate and flush the I-cache if necessary. */ cpumask_setall(&tlb_flush_pending); - - if (icache_is_aivivt()) - __flush_icache_all(); } static bool check_update_reserved_asid(u64 asid, u64 newasid) diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 554a2558c12e..1e968222a544 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -65,8 +65,6 @@ void __sync_icache_dcache(pte_t pte, unsigned long addr) if (!test_and_set_bit(PG_dcache_clean, &page->flags)) sync_icache_aliases(page_address(page), PAGE_SIZE << compound_order(page)); - else if (icache_is_aivivt()) - __flush_icache_all(); } /* -- cgit v1.2.3 From 02f7760e6e5c3d726cd9622749cdae17c571b9a3 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 10 Mar 2017 20:32:23 +0000 Subject: arm64: cache: Merge cachetype.h into cache.h cachetype.h and cache.h are small and both obviously related to caches. Merge them together to reduce clutter. Acked-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cache.h | 31 ++++++++++++++++++++- arch/arm64/include/asm/cachetype.h | 55 -------------------------------------- arch/arm64/include/asm/kvm_mmu.h | 2 +- arch/arm64/kernel/cpuinfo.c | 2 +- arch/arm64/mm/flush.c | 2 +- 5 files changed, 33 insertions(+), 59 deletions(-) delete mode 100644 arch/arm64/include/asm/cachetype.h (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h index 5082b30bc2c0..7acb52634299 100644 --- a/arch/arm64/include/asm/cache.h +++ b/arch/arm64/include/asm/cache.h @@ -16,7 +16,17 @@ #ifndef __ASM_CACHE_H #define __ASM_CACHE_H -#include +#include + +#define CTR_L1IP_SHIFT 14 +#define CTR_L1IP_MASK 3 +#define CTR_CWG_SHIFT 24 +#define CTR_CWG_MASK 15 + +#define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK) + +#define ICACHE_POLICY_VIPT 2 +#define ICACHE_POLICY_PIPT 3 #define L1_CACHE_SHIFT 7 #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) @@ -32,6 +42,25 @@ #ifndef __ASSEMBLY__ +#include + +#define ICACHEF_ALIASING 0 +extern unsigned long __icache_flags; + +/* + * Whilst the D-side always behaves as PIPT on AArch64, aliasing is + * permitted in the I-cache. 
+ */ +static inline int icache_is_aliasing(void) +{ + return test_bit(ICACHEF_ALIASING, &__icache_flags); +} + +static inline u32 cache_type_cwg(void) +{ + return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK; +} + #define __read_mostly __attribute__((__section__(".data..read_mostly"))) static inline int cache_line_size(void) diff --git a/arch/arm64/include/asm/cachetype.h b/arch/arm64/include/asm/cachetype.h deleted file mode 100644 index fbab37c669a0..000000000000 --- a/arch/arm64/include/asm/cachetype.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#ifndef __ASM_CACHETYPE_H -#define __ASM_CACHETYPE_H - -#include - -#define CTR_L1IP_SHIFT 14 -#define CTR_L1IP_MASK 3 -#define CTR_CWG_SHIFT 24 -#define CTR_CWG_MASK 15 - -#define ICACHE_POLICY_VIPT 2 -#define ICACHE_POLICY_PIPT 3 - -#ifndef __ASSEMBLY__ - -#include - -#define CTR_L1IP(ctr) (((ctr) >> CTR_L1IP_SHIFT) & CTR_L1IP_MASK) - -#define ICACHEF_ALIASING 0 - -extern unsigned long __icache_flags; - -/* - * Whilst the D-side always behaves as PIPT on AArch64, aliasing is - * permitted in the I-cache. - */ -static inline int icache_is_aliasing(void) -{ - return test_bit(ICACHEF_ALIASING, &__icache_flags); -} - -static inline u32 cache_type_cwg(void) -{ - return (read_cpuid_cachetype() >> CTR_CWG_SHIFT) & CTR_CWG_MASK; -} - -#endif /* __ASSEMBLY__ */ - -#endif /* __ASM_CACHETYPE_H */ diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 4be5773d4606..dc3624d8b9db 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -108,7 +108,7 @@ alternative_else_nop_endif #else #include -#include +#include #include #include #include diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index efe74ecc9738..260b54f415b8 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -15,7 +15,7 @@ * along with this program. If not, see . */ #include -#include +#include #include #include #include diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 1e968222a544..21a8d828cbf4 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -22,7 +22,7 @@ #include #include -#include +#include #include void sync_icache_aliases(void *kaddr, unsigned long len) -- cgit v1.2.3 From 44176bb38fa48726a9658b2ccf407ea94113c760 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 7 Mar 2017 18:43:32 +0100 Subject: arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to IOMMU Add support for allocating physically contiguous DMA buffers on arm64 systems with an IOMMU. This can be useful when two or more devices with different memory requirements are involved in buffer sharing. Note that as this uses the CMA allocator, setting the DMA_ATTR_FORCE_CONTIGUOUS attribute has a runtime-dependency on CONFIG_DMA_CMA, just like on arm32. 
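For illustration only (not part of this patch), a driver wanting such a buffer would pass the attribute through dma_alloc_attrs(); the device pointer, size and helper name below are placeholders:

	#include <linux/dma-mapping.h>

	/* Hypothetical helper: request an IOMMU-mapped buffer that is also
	 * physically contiguous (which, per the note above, depends on
	 * CONFIG_DMA_CMA being enabled). */
	static void *alloc_forced_contiguous(struct device *dev, size_t size,
					     dma_addr_t *iova)
	{
		return dma_alloc_attrs(dev, size, iova, GFP_KERNEL,
				       DMA_ATTR_FORCE_CONTIGUOUS);
	}

The buffer would be released with dma_free_attrs(), passing the same attribute.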
For arm64 systems using swiotlb, no changes are needed to support the allocation of physically contiguous DMA buffers: - swiotlb always uses physically contiguous buffers (up to IO_TLB_SEGSIZE = 128 pages), - arm64's __dma_alloc_coherent() already calls dma_alloc_from_contiguous() when CMA is available. Signed-off-by: Geert Uytterhoeven Acked-by: Laurent Pinchart Reviewed-by: Robin Murphy Signed-off-by: Catalin Marinas --- arch/arm64/mm/dma-mapping.c | 63 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 15 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 81cdb2e844ed..f7b54019ef55 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -584,20 +584,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, */ gfp |= __GFP_ZERO; - if (gfpflags_allow_blocking(gfp)) { - struct page **pages; - pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent); - - pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot, - handle, flush_page); - if (!pages) - return NULL; - - addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot, - __builtin_return_address(0)); - if (!addr) - iommu_dma_free(dev, pages, iosize, handle); - } else { + if (!gfpflags_allow_blocking(gfp)) { struct page *page; /* * In atomic context we can't remap anything, so we'll only @@ -621,6 +608,45 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size, __free_from_pool(addr, size); addr = NULL; } + } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent); + struct page *page; + + page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT, + get_order(size), gfp); + if (!page) + return NULL; + + *handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot); + if (iommu_dma_mapping_error(dev, *handle)) { + dma_release_from_contiguous(dev, page, + size >> PAGE_SHIFT); + return NULL; + } + if (!coherent) + __dma_flush_area(page_to_virt(page), iosize); + + addr = dma_common_contiguous_remap(page, size, VM_USERMAP, + prot, + __builtin_return_address(0)); + if (!addr) { + iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs); + dma_release_from_contiguous(dev, page, + size >> PAGE_SHIFT); + } + } else { + pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent); + struct page **pages; + + pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot, + handle, flush_page); + if (!pages) + return NULL; + + addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot, + __builtin_return_address(0)); + if (!addr) + iommu_dma_free(dev, pages, iosize, handle); } return addr; } @@ -632,7 +658,8 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr, size = PAGE_ALIGN(size); /* - * @cpu_addr will be one of 3 things depending on how it was allocated: + * @cpu_addr will be one of 4 things depending on how it was allocated: + * - A remapped array of pages for contiguous allocations. * - A remapped array of pages from iommu_dma_alloc(), for all * non-atomic allocations. 
* - A non-cacheable alias from the atomic pool, for atomic @@ -644,6 +671,12 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr, if (__in_atomic_pool(cpu_addr, size)) { iommu_dma_unmap_page(dev, handle, iosize, 0, 0); __free_from_pool(cpu_addr, size); + } else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + struct page *page = vmalloc_to_page(cpu_addr); + + iommu_dma_unmap_page(dev, handle, iosize, 0, attrs); + dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT); + dma_common_free_remap(cpu_addr, size, VM_USERMAP); } else if (is_vmalloc_addr(cpu_addr)){ struct vm_struct *area = find_vm_area(cpu_addr); -- cgit v1.2.3 From aa8c09be7a6f58e1b0dd19413d872fffdb054677 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:00 +0100 Subject: arm64: mmu: move TLB maintenance from callers to create_mapping_late() In preparation of refactoring the kernel mapping logic so that text regions are never mapped writable, which would require adding explicit TLB maintenance to new call sites of create_mapping_late() (which is currently invoked twice from the same function), move the TLB maintenance from the call site into create_mapping_late() itself, and change it from a full TLB flush into a flush by VA, which is more appropriate here. Also, given that create_mapping_late() has evolved into a routine that only updates protection bits on existing mappings, rename it to update_mapping_prot() Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index d28dbcf596b6..6cafd8723d1a 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -319,17 +319,20 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgd_pgtable_alloc, page_mappings_only); } -static void create_mapping_late(phys_addr_t phys, unsigned long virt, - phys_addr_t size, pgprot_t prot) +static void update_mapping_prot(phys_addr_t phys, unsigned long virt, + phys_addr_t size, pgprot_t prot) { if (virt < VMALLOC_START) { - pr_warn("BUG: not creating mapping for %pa at 0x%016lx - outside kernel range\n", + pr_warn("BUG: not updating mapping for %pa at 0x%016lx - outside kernel range\n", &phys, virt); return; } __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, debug_pagealloc_enabled()); + + /* flush the TLBs after updating live kernel mappings */ + flush_tlb_kernel_range(virt, virt + size); } static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) @@ -402,19 +405,16 @@ void mark_rodata_ro(void) unsigned long section_size; section_size = (unsigned long)_etext - (unsigned long)_text; - create_mapping_late(__pa_symbol(_text), (unsigned long)_text, + update_mapping_prot(__pa_symbol(_text), (unsigned long)_text, section_size, PAGE_KERNEL_ROX); /* * mark .rodata as read only. Use __init_begin rather than __end_rodata * to cover NOTES and EXCEPTION_TABLE. 
*/ section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; - create_mapping_late(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, + update_mapping_prot(__pa_symbol(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); - /* flush the TLBs after updating live kernel mappings */ - flush_tlb_all(); - debug_checkwx(); } -- cgit v1.2.3 From 5ea5306c3235a157f06040c59730b1133115ed26 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:01 +0100 Subject: arm64: alternatives: apply boot time fixups via the linear mapping One important rule of thumb when desiging a secure software system is that memory should never be writable and executable at the same time. We mostly adhere to this rule in the kernel, except at boot time, when regions may be mapped RWX until after we are done applying alternatives or making other one-off changes. For the alternative patching, we can improve the situation by applying the fixups via the linear mapping, which is never mapped with executable permissions. So map the linear alias of .text with RW- permissions initially, and remove the write permissions as soon as alternative patching has completed. Reviewed-by: Laura Abbott Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu.h | 1 + arch/arm64/kernel/alternative.c | 11 ++++++----- arch/arm64/kernel/smp.c | 1 + arch/arm64/mm/mmu.c | 22 +++++++++++++++++----- 4 files changed, 25 insertions(+), 10 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 47619411f0ff..5468c834b072 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -37,5 +37,6 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys); +extern void mark_linear_text_alias_ro(void); #endif diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 06d650f61da7..8840c109c5d6 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -105,11 +105,11 @@ static u32 get_alt_insn(struct alt_instr *alt, u32 *insnptr, u32 *altinsnptr) return insn; } -static void __apply_alternatives(void *alt_region) +static void __apply_alternatives(void *alt_region, bool use_linear_alias) { struct alt_instr *alt; struct alt_region *region = alt_region; - u32 *origptr, *replptr; + u32 *origptr, *replptr, *updptr; for (alt = region->begin; alt < region->end; alt++) { u32 insn; @@ -124,11 +124,12 @@ static void __apply_alternatives(void *alt_region) origptr = ALT_ORIG_PTR(alt); replptr = ALT_REPL_PTR(alt); + updptr = use_linear_alias ? 
(u32 *)lm_alias(origptr) : origptr; nr_inst = alt->alt_len / sizeof(insn); for (i = 0; i < nr_inst; i++) { insn = get_alt_insn(alt, origptr + i, replptr + i); - *(origptr + i) = cpu_to_le32(insn); + updptr[i] = cpu_to_le32(insn); } flush_icache_range((uintptr_t)origptr, @@ -155,7 +156,7 @@ static int __apply_alternatives_multi_stop(void *unused) isb(); } else { BUG_ON(patched); - __apply_alternatives(®ion); + __apply_alternatives(®ion, true); /* Barriers provided by the cache flushing */ WRITE_ONCE(patched, 1); } @@ -176,5 +177,5 @@ void apply_alternatives(void *start, size_t length) .end = start + length, }; - __apply_alternatives(®ion); + __apply_alternatives(®ion, false); } diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index ef1caae02110..d4739552da28 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -434,6 +434,7 @@ void __init smp_cpus_done(unsigned int max_cpus) setup_cpu_features(); hyp_mode_check(); apply_alternatives_all(); + mark_linear_text_alias_ro(); } void __init smp_prepare_boot_cpu(void) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 6cafd8723d1a..df377fbe464e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -372,16 +372,28 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end debug_pagealloc_enabled()); /* - * Map the linear alias of the [_text, __init_begin) interval as - * read-only/non-executable. This makes the contents of the - * region accessible to subsystems such as hibernate, but - * protects it from inadvertent modification or execution. + * Map the linear alias of the [_text, __init_begin) interval + * as non-executable now, and remove the write permission in + * mark_linear_text_alias_ro() below (which will be called after + * alternative patching has completed). This makes the contents + * of the region accessible to subsystems such as hibernate, + * but protects it from inadvertent modification or execution. */ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), - kernel_end - kernel_start, PAGE_KERNEL_RO, + kernel_end - kernel_start, PAGE_KERNEL, early_pgtable_alloc, debug_pagealloc_enabled()); } +void __init mark_linear_text_alias_ro(void) +{ + /* + * Remove the write permissions from the linear alias of .text/.rodata + */ + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), + (unsigned long)__init_begin - (unsigned long)_text, + PAGE_KERNEL_RO); +} + static void __init map_mem(pgd_t *pgd) { struct memblock_region *reg; -- cgit v1.2.3 From 28b066da69b937ea4104341cb5d9324109faab7f Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:02 +0100 Subject: arm64: mmu: map .text as read-only from the outset Now that alternatives patching code no longer relies on the primary mapping of .text being writable, we can remove the code that removes the writable permissions post-init time, and map it read-only from the outset. To preserve the existing behavior under rodata=off, which is relied upon by external debuggers to manage software breakpoints (as pointed out by Mark), add an early_param() check for rodata=, and use RWX permissions if it set to 'off'. 
Reviewed-by: Laura Abbott Reviewed-by: Kees Cook Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index df377fbe464e..300e98e8cd63 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -416,9 +416,6 @@ void mark_rodata_ro(void) { unsigned long section_size; - section_size = (unsigned long)_etext - (unsigned long)_text; - update_mapping_prot(__pa_symbol(_text), (unsigned long)_text, - section_size, PAGE_KERNEL_ROX); /* * mark .rodata as read only. Use __init_begin rather than __end_rodata * to cover NOTES and EXCEPTION_TABLE. @@ -451,6 +448,12 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, vm_area_add_early(vma); } +static int __init parse_rodata(char *arg) +{ + return strtobool(arg, &rodata_enabled); +} +early_param("rodata", parse_rodata); + /* * Create fine-grained mappings for the kernel. */ @@ -458,7 +461,14 @@ static void __init map_kernel(pgd_t *pgd) { static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; - map_kernel_segment(pgd, _text, _etext, PAGE_KERNEL_EXEC, &vmlinux_text); + /* + * External debuggers may need to write directly to the text + * mapping to install SW breakpoints. Allow this (only) when + * explicitly requested with rodata=off. + */ + pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; + + map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text); map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, &vmlinux_init); -- cgit v1.2.3 From 2ebe088b73a87c8fc70231b468c719337f258bf0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:03 +0100 Subject: arm64: mmu: apply strict permissions to .init.text and .init.data To avoid having mappings that are writable and executable at the same time, split the init region into a .init.text region that is mapped read-only, and a .init.data region that is mapped non-executable. This is possible now that the alternative patching occurs via the linear mapping, and the linear alias of the init region is always mapped writable (but never executable). Since the alternatives descriptions themselves are read-only data, move those into the .init.text region. 
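Taken together with the previous two patches, the intended segment protections come out roughly as follows (a throwaway stand-alone summary sketch, not kernel code; it assumes the default rodata=on, and .rodata only becomes read-only once mark_rodata_ro() runs):

	#include <stdio.h>

	int main(void)
	{
		static const struct { const char *seg, *prot; } map[] = {
			{ ".text",      "read-only, executable"                        },
			{ ".rodata",    "read-write at boot, read-only later, never executable" },
			{ ".init.text", "read-only, executable"                        },
			{ ".init.data", "read-write, non-executable"                   },
			{ ".data/.bss", "read-write, non-executable"                   },
		};

		for (unsigned int i = 0; i < sizeof(map) / sizeof(map[0]); i++)
			printf("%-12s %s\n", map[i].seg, map[i].prot);
		return 0;
	}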
Reviewed-by: Laura Abbott Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/sections.h | 2 ++ arch/arm64/kernel/vmlinux.lds.S | 25 ++++++++++++++++--------- arch/arm64/mm/mmu.c | 12 ++++++++---- 3 files changed, 26 insertions(+), 13 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/sections.h b/arch/arm64/include/asm/sections.h index 4e7e7067afdb..941267caa39c 100644 --- a/arch/arm64/include/asm/sections.h +++ b/arch/arm64/include/asm/sections.h @@ -24,6 +24,8 @@ extern char __hibernate_exit_text_start[], __hibernate_exit_text_end[]; extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[]; extern char __hyp_text_start[], __hyp_text_end[]; extern char __idmap_text_start[], __idmap_text_end[]; +extern char __initdata_begin[], __initdata_end[]; +extern char __inittext_begin[], __inittext_end[]; extern char __irqentry_text_start[], __irqentry_text_end[]; extern char __mmuoff_data_start[], __mmuoff_data_end[]; diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index b8deffa9e1bf..2c93d259046c 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -143,12 +143,27 @@ SECTIONS . = ALIGN(SEGMENT_ALIGN); __init_begin = .; + __inittext_begin = .; INIT_TEXT_SECTION(8) .exit.text : { ARM_EXIT_KEEP(EXIT_TEXT) } + . = ALIGN(4); + .altinstructions : { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + .altinstr_replacement : { + *(.altinstr_replacement) + } + + . = ALIGN(PAGE_SIZE); + __inittext_end = .; + __initdata_begin = .; + .init.data : { INIT_DATA INIT_SETUP(16) @@ -164,15 +179,6 @@ SECTIONS PERCPU_SECTION(L1_CACHE_BYTES) - . = ALIGN(4); - .altinstructions : { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : { - *(.altinstr_replacement) - } .rela : ALIGN(8) { *(.rela .rela*) } @@ -181,6 +187,7 @@ SECTIONS __rela_size = SIZEOF(.rela); . = ALIGN(SEGMENT_ALIGN); + __initdata_end = .; __init_end = .; _data = .; diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 300e98e8cd63..75e21c33caff 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -459,7 +459,8 @@ early_param("rodata", parse_rodata); */ static void __init map_kernel(pgd_t *pgd) { - static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_init, vmlinux_data; + static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext, + vmlinux_initdata, vmlinux_data; /* * External debuggers may need to write directly to the text @@ -469,9 +470,12 @@ static void __init map_kernel(pgd_t *pgd) pgprot_t text_prot = rodata_enabled ? 
PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text); - map_kernel_segment(pgd, __start_rodata, __init_begin, PAGE_KERNEL, &vmlinux_rodata); - map_kernel_segment(pgd, __init_begin, __init_end, PAGE_KERNEL_EXEC, - &vmlinux_init); + map_kernel_segment(pgd, __start_rodata, __inittext_begin, PAGE_KERNEL, + &vmlinux_rodata); + map_kernel_segment(pgd, __inittext_begin, __inittext_end, text_prot, + &vmlinux_inittext); + map_kernel_segment(pgd, __initdata_begin, __initdata_end, PAGE_KERNEL, + &vmlinux_initdata); map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { -- cgit v1.2.3 From e393cf40aecfe7e872ea630b5b9ecb8c05a78c7c Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:04 +0100 Subject: arm64/mmu: align alloc_init_pte prototype with pmd/pud versions Align the function prototype of alloc_init_pte() with its pmd and pud counterparts by replacing the pfn parameter with the equivalent physical address. Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 75e21c33caff..bb9179084217 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -107,7 +107,7 @@ static bool pgattr_change_is_safe(u64 old, u64 new) } static void alloc_init_pte(pmd_t *pmd, unsigned long addr, - unsigned long end, unsigned long pfn, + unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void)) { @@ -128,8 +128,7 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, do { pte_t old_pte = *pte; - set_pte(pte, pfn_pte(pfn, prot)); - pfn++; + set_pte(pte, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we @@ -137,6 +136,7 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte))); + phys += PAGE_SIZE; } while (pte++, addr += PAGE_SIZE, addr != end); pte_clear_fixmap(); @@ -182,7 +182,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), pmd_val(*pmd))); } else { - alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), + alloc_init_pte(pmd, addr, next, phys, prot, pgtable_alloc); BUG_ON(pmd_val(old_pmd) != 0 && -- cgit v1.2.3 From eccc1bff1b0d168a136ecd51c6091cf0ba02151b Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:05 +0100 Subject: arm64/mmu: ignore debug_pagealloc for kernel segments The debug_pagealloc facility manipulates kernel mappings in the linear region at page granularity to detect out of bounds or use-after-free accesses. Since the kernel segments are not allocated dynamically, there is no point in taking the debug_pagealloc_enabled flag into account for them, and we can use block mappings unconditionally. Note that this applies equally to the linear alias of text/rodata: we will never have dynamic allocations there given that the same memory is statically in use by the kernel image. 
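Using block mappings unconditionally for the kernel segments keeps the page-table footprint and TLB pressure down; a rough stand-alone arithmetic sketch of the difference (4 KiB granule, 512 MiB region picked arbitrarily):

	#include <stdio.h>

	int main(void)
	{
		unsigned long size = 512UL << 20;	/* 512 MiB, example figure */

		printf("4 KiB page mappings : %lu PTE entries\n",
		       size / (4UL << 10));
		printf("2 MiB block mappings: %lu PMD entries\n",
		       size / (2UL << 20));
		return 0;
	}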
Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index bb9179084217..ec23aec6433f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -328,8 +328,7 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, - NULL, debug_pagealloc_enabled()); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -381,7 +380,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end */ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), kernel_end - kernel_start, PAGE_KERNEL, - early_pgtable_alloc, debug_pagealloc_enabled()); + early_pgtable_alloc, false); } void __init mark_linear_text_alias_ro(void) @@ -437,7 +436,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, debug_pagealloc_enabled()); + early_pgtable_alloc, false); vma->addr = va_start; vma->phys_addr = pa_start; -- cgit v1.2.3 From 141d1497aae004808ecb156ca2bfb5e0bc460a2e Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:06 +0100 Subject: arm64/mmu: add contiguous bit to sanity bug check A mapping with the contiguous bit cannot be safely manipulated while live, regardless of whether the bit changes between the old and new mapping. So take this into account when deciding whether the change is safe. Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ec23aec6433f..382ebd6ef46f 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -103,7 +103,15 @@ static bool pgattr_change_is_safe(u64 old, u64 new) */ static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE; - return old == 0 || new == 0 || ((old ^ new) & ~mask) == 0; + /* creating or taking down mappings is always safe */ + if (old == 0 || new == 0) + return true; + + /* live contiguous mappings may not be manipulated at all */ + if ((old | new) & PTE_CONT) + return false; + + return ((old ^ new) & ~mask) == 0; } static void alloc_init_pte(pmd_t *pmd, unsigned long addr, -- cgit v1.2.3 From c0951366d4b7e00d2f60f6daae7069b240d370c0 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:07 +0100 Subject: arm64/mmu: replace 'page_mappings_only' parameter with flags argument In preparation of extending the policy for manipulating kernel mappings with whether or not contiguous hints may be used in the page tables, replace the bool 'page_mappings_only' with a flags field and a flag NO_BLOCK_MAPPINGS. 
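This is the usual bit-flags pattern; a stand-alone sketch of how call sites read once the policy is a flags word (only the NO_* names come from this series -- NO_CONT_MAPPINGS arrives in a later patch -- the rest is illustrative):

	#include <stdio.h>

	#define NO_BLOCK_MAPPINGS	(1 << 0)
	#define NO_CONT_MAPPINGS	(1 << 1)	/* added later in this series */

	/* Illustrative stand-in for __create_pgd_mapping(). */
	static void map_region(const char *what, int flags)
	{
		printf("%s: %s block mappings, %s contiguous hints\n", what,
		       (flags & NO_BLOCK_MAPPINGS) ? "no" : "may use",
		       (flags & NO_CONT_MAPPINGS) ? "no" : "may use");
	}

	int main(void)
	{
		map_region("linear map (debug_pagealloc)", NO_BLOCK_MAPPINGS);
		map_region("kernel segments", 0);
		return 0;
	}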
Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 382ebd6ef46f..bc9d5eb7bfa2 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -43,6 +43,8 @@ #include #include +#define NO_BLOCK_MAPPINGS BIT(0) + u64 idmap_t0sz = TCR_T0SZ(VA_BITS); u64 kimage_voffset __ro_after_init; @@ -153,7 +155,7 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool page_mappings_only) + int flags) { pmd_t *pmd; unsigned long next; @@ -180,7 +182,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0 && - !page_mappings_only) { + (flags & NO_BLOCK_MAPPINGS) == 0) { pmd_set_huge(pmd, phys, prot); /* @@ -217,7 +219,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool page_mappings_only) + int flags) { pud_t *pud; unsigned long next; @@ -239,7 +241,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys) && !page_mappings_only) { + if (use_1G_block(addr, next, phys) && + (flags & NO_BLOCK_MAPPINGS) == 0) { pud_set_huge(pud, phys, prot); /* @@ -250,7 +253,7 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, pud_val(*pud))); } else { alloc_init_pmd(pud, addr, next, phys, prot, - pgtable_alloc, page_mappings_only); + pgtable_alloc, flags); BUG_ON(pud_val(old_pud) != 0 && pud_val(old_pud) != pud_val(*pud)); @@ -265,7 +268,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool page_mappings_only) + int flags) { unsigned long addr, length, end, next; pgd_t *pgd = pgd_offset_raw(pgdir, virt); @@ -285,7 +288,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc, - page_mappings_only); + flags); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -314,17 +317,22 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, bool page_mappings_only) { + int flags = 0; + BUG_ON(mm == &init_mm); + if (page_mappings_only) + flags = NO_BLOCK_MAPPINGS; + __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, page_mappings_only); + pgd_pgtable_alloc, flags); } static void update_mapping_prot(phys_addr_t phys, unsigned long virt, @@ -336,7 +344,7 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false); + 
__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -346,6 +354,10 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end { phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); + int flags = 0; + + if (debug_pagealloc_enabled()) + flags = NO_BLOCK_MAPPINGS; /* * Take care not to create a writable alias for the @@ -356,8 +368,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end if (end < kernel_start || start >= kernel_end) { __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); + early_pgtable_alloc, flags); return; } @@ -369,14 +380,12 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __create_pgd_mapping(pgd, start, __phys_to_virt(start), kernel_start - start, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); + early_pgtable_alloc, flags); if (kernel_end < end) __create_pgd_mapping(pgd, kernel_end, __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); + early_pgtable_alloc, flags); /* * Map the linear alias of the [_text, __init_begin) interval @@ -388,7 +397,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end */ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), kernel_end - kernel_start, PAGE_KERNEL, - early_pgtable_alloc, false); + early_pgtable_alloc, 0); } void __init mark_linear_text_alias_ro(void) @@ -444,7 +453,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, false); + early_pgtable_alloc, 0); vma->addr = va_start; vma->phys_addr = pa_start; -- cgit v1.2.3 From 5bd466947160d1e0517747b63216806ea768c791 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:08 +0100 Subject: arm64/mm: remove pointless map/unmap sequences when creating page tables The routines __pud_populate and __pmd_populate only create a table entry at their respective level which refers to the next level page by its physical address, so there is no reason to map this page and then unmap it immediately after. 
Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/mmu.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index bc9d5eb7bfa2..85ab82f5a0bc 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -128,9 +128,7 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, phys_addr_t pte_phys; BUG_ON(!pgtable_alloc); pte_phys = pgtable_alloc(); - pte = pte_set_fixmap(pte_phys); __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); - pte_clear_fixmap(); } BUG_ON(pmd_bad(*pmd)); @@ -168,9 +166,7 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t pmd_phys; BUG_ON(!pgtable_alloc); pmd_phys = pgtable_alloc(); - pmd = pmd_set_fixmap(pmd_phys); __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); - pmd_clear_fixmap(); } BUG_ON(pud_bad(*pud)); -- cgit v1.2.3 From d27cfa1fc823d35a6cf45ba51f5623db8a14a9b9 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 9 Mar 2017 21:52:09 +0100 Subject: arm64: mm: set the contiguous bit for kernel mappings where appropriate This is the third attempt at enabling the use of contiguous hints for kernel mappings. The most recent attempt 0bfc445dec9d was reverted after it turned out that updating permission attributes on live contiguous ranges may result in TLB conflicts. So this time, the contiguous hint is not set for .rodata or for the linear alias of .text/.rodata, both of which are mapped read-write initially, and remapped read-only at a later stage. (Note that the latter region could also be unmapped and remapped again with updated permission attributes, given that the region, while live, is only mapped for the convenience of the hibernation code, but that also means the TLB footprint is negligible anyway, so why bother) This enables the following contiguous range sizes for the virtual mapping of the kernel image, and for the linear mapping: granule size | cont PTE | cont PMD | -------------+------------+------------+ 4 KB | 64 KB | 32 MB | 16 KB | 2 MB | 1 GB* | 64 KB | 2 MB | 16 GB* | * Only when built for 3 or more levels of translation. This is due to the fact that a 2 level configuration only consists of PGDs and PTEs, and the added complexity of dealing with folded PMDs is not justified considering that 16 GB contiguous ranges are likely to be ignored by the hardware (and 16k/2 levels is a niche configuration) Reviewed-by: Mark Rutland Tested-by: Mark Rutland Signed-off-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 10 +++ arch/arm64/mm/mmu.c | 140 +++++++++++++++++++++++++++------------ 2 files changed, 107 insertions(+), 43 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 0eef6064bf3b..c213fdbd056c 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -74,6 +74,16 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN)) #define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT)) +#define pte_cont_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + CONT_PTE_SIZE) & CONT_PTE_MASK; \ + (__boundary - 1 < (end) - 1) ? __boundary : (end); \ +}) + +#define pmd_cont_addr_end(addr, end) \ +({ unsigned long __boundary = ((addr) + CONT_PMD_SIZE) & CONT_PMD_MASK; \ + (__boundary - 1 < (end) - 1) ? 
__boundary : (end); \ +}) + #ifdef CONFIG_ARM64_HW_AFDBM #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) #else diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 85ab82f5a0bc..91502e36e6d9 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -44,6 +44,7 @@ #include #define NO_BLOCK_MAPPINGS BIT(0) +#define NO_CONT_MAPPINGS BIT(1) u64 idmap_t0sz = TCR_T0SZ(VA_BITS); @@ -116,22 +117,11 @@ static bool pgattr_change_is_safe(u64 old, u64 new) return ((old ^ new) & ~mask) == 0; } -static void alloc_init_pte(pmd_t *pmd, unsigned long addr, - unsigned long end, phys_addr_t phys, - pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) +static void init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot) { pte_t *pte; - BUG_ON(pmd_sect(*pmd)); - if (pmd_none(*pmd)) { - phys_addr_t pte_phys; - BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(); - __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); - } - BUG_ON(pmd_bad(*pmd)); - pte = pte_set_fixmap_offset(pmd, addr); do { pte_t old_pte = *pte; @@ -150,25 +140,45 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, pte_clear_fixmap(); } -static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), - int flags) +static void alloc_init_cont_pte(pmd_t *pmd, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), + int flags) { - pmd_t *pmd; unsigned long next; - /* - * Check for initial section mappings in the pgd/pud and remove them. - */ - BUG_ON(pud_sect(*pud)); - if (pud_none(*pud)) { - phys_addr_t pmd_phys; + BUG_ON(pmd_sect(*pmd)); + if (pmd_none(*pmd)) { + phys_addr_t pte_phys; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(); - __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); + pte_phys = pgtable_alloc(); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); } - BUG_ON(pud_bad(*pud)); + BUG_ON(pmd_bad(*pmd)); + + do { + pgprot_t __prot = prot; + + next = pte_cont_addr_end(addr, end); + + /* use a contiguous mapping if the range is suitably aligned */ + if ((((addr | next | phys) & ~CONT_PTE_MASK) == 0) && + (flags & NO_CONT_MAPPINGS) == 0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + init_pte(pmd, addr, next, phys, __prot); + + phys += next - addr; + } while (addr = next, addr != end); +} + +static void init_pmd(pud_t *pud, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), int flags) +{ + unsigned long next; + pmd_t *pmd; pmd = pmd_set_fixmap_offset(pud, addr); do { @@ -188,8 +198,8 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), pmd_val(*pmd))); } else { - alloc_init_pte(pmd, addr, next, phys, - prot, pgtable_alloc); + alloc_init_cont_pte(pmd, addr, next, phys, prot, + pgtable_alloc, flags); BUG_ON(pmd_val(old_pmd) != 0 && pmd_val(old_pmd) != pmd_val(*pmd)); @@ -200,6 +210,41 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, pmd_clear_fixmap(); } +static void alloc_init_cont_pmd(pud_t *pud, unsigned long addr, + unsigned long end, phys_addr_t phys, + pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), int flags) +{ + unsigned long next; + + /* + * Check for initial section mappings in the pgd/pud. 
+ */ + BUG_ON(pud_sect(*pud)); + if (pud_none(*pud)) { + phys_addr_t pmd_phys; + BUG_ON(!pgtable_alloc); + pmd_phys = pgtable_alloc(); + __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); + } + BUG_ON(pud_bad(*pud)); + + do { + pgprot_t __prot = prot; + + next = pmd_cont_addr_end(addr, end); + + /* use a contiguous mapping if the range is suitably aligned */ + if ((((addr | next | phys) & ~CONT_PMD_MASK) == 0) && + (flags & NO_CONT_MAPPINGS) == 0) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + + init_pmd(pud, addr, next, phys, __prot, pgtable_alloc, flags); + + phys += next - addr; + } while (addr = next, addr != end); +} + static inline bool use_1G_block(unsigned long addr, unsigned long next, unsigned long phys) { @@ -248,8 +293,8 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), pud_val(*pud))); } else { - alloc_init_pmd(pud, addr, next, phys, prot, - pgtable_alloc, flags); + alloc_init_cont_pmd(pud, addr, next, phys, prot, + pgtable_alloc, flags); BUG_ON(pud_val(old_pud) != 0 && pud_val(old_pud) != pud_val(*pud)); @@ -313,7 +358,8 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, @@ -325,7 +371,7 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, BUG_ON(mm == &init_mm); if (page_mappings_only) - flags = NO_BLOCK_MAPPINGS; + flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(mm->pgd, phys, virt, size, prot, pgd_pgtable_alloc, flags); @@ -340,7 +386,8 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, 0); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -353,7 +400,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end int flags = 0; if (debug_pagealloc_enabled()) - flags = NO_BLOCK_MAPPINGS; + flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* * Take care not to create a writable alias for the @@ -390,10 +437,12 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end * alternative patching has completed). This makes the contents * of the region accessible to subsystems such as hibernate, * but protects it from inadvertent modification or execution. + * Note that contiguous mappings cannot be remapped in this way, + * so we should avoid them here. 
*/ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), kernel_end - kernel_start, PAGE_KERNEL, - early_pgtable_alloc, 0); + early_pgtable_alloc, NO_CONT_MAPPINGS); } void __init mark_linear_text_alias_ro(void) @@ -440,7 +489,8 @@ void mark_rodata_ro(void) } static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, - pgprot_t prot, struct vm_struct *vma) + pgprot_t prot, struct vm_struct *vma, + int flags) { phys_addr_t pa_start = __pa_symbol(va_start); unsigned long size = va_end - va_start; @@ -449,7 +499,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, 0); + early_pgtable_alloc, flags); vma->addr = va_start; vma->phys_addr = pa_start; @@ -481,14 +531,18 @@ static void __init map_kernel(pgd_t *pgd) */ pgprot_t text_prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; - map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text); + /* + * Only rodata will be remapped with different permissions later on, + * all other segments are allowed to use contiguous mappings. + */ + map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text, 0); map_kernel_segment(pgd, __start_rodata, __inittext_begin, PAGE_KERNEL, - &vmlinux_rodata); + &vmlinux_rodata, NO_CONT_MAPPINGS); map_kernel_segment(pgd, __inittext_begin, __inittext_end, text_prot, - &vmlinux_inittext); + &vmlinux_inittext, 0); map_kernel_segment(pgd, __initdata_begin, __initdata_end, PAGE_KERNEL, - &vmlinux_initdata); - map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data); + &vmlinux_initdata, 0); + map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data, 0); if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { /* -- cgit v1.2.3 From 8f579b1c4e347b23bfa747bc2cc0a55dd1b7e5fa Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 3 Apr 2017 11:24:31 +0900 Subject: arm64: limit memory regions based on DT property, usable-memory-range Crash dump kernel uses only a limited range of available memory as System RAM. On arm64 kdump, This memory range is advertised to crash dump kernel via a device-tree property under /chosen, linux,usable-memory-range = Crash dump kernel reads this property at boot time and calls memblock_cap_memory_range() to limit usable memory which are listed either in UEFI memory map table or "memory" nodes of a device tree blob. 
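For reference, a stand-alone model (not kernel code) of how such a cell pair folds into a 64-bit base and size, assuming the root node's #address-cells and #size-cells are both 2 as is typical on arm64; the property value used here is made up (3 GiB base, 256 MiB size):

	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>

	/* Fold big-endian 32-bit cells into one 64-bit value. */
	static uint64_t read_cells(int cells, const uint32_t **p)
	{
		uint64_t v = 0;

		while (cells--)
			v = (v << 32) | ntohl(*(*p)++);
		return v;
	}

	int main(void)
	{
		const uint32_t prop[] = { htonl(0x0), htonl(0xc0000000),
					  htonl(0x0), htonl(0x10000000) };
		const uint32_t *p = prop;
		uint64_t base = read_cells(2, &p);
		uint64_t size = read_cells(2, &p);

		printf("usable RAM: [0x%llx, 0x%llx)\n",
		       (unsigned long long)base,
		       (unsigned long long)(base + size));
		return 0;
	}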
Signed-off-by: AKASHI Takahiro Reviewed-by: Geoff Levand Acked-by: Catalin Marinas Acked-by: Mark Rutland Reviewed-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/mm/init.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e19e06593e37..290794b1a0f1 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -188,10 +188,45 @@ static int __init early_mem(char *p) } early_param("mem", early_mem); +static int __init early_init_dt_scan_usablemem(unsigned long node, + const char *uname, int depth, void *data) +{ + struct memblock_region *usablemem = data; + const __be32 *reg; + int len; + + if (depth != 1 || strcmp(uname, "chosen") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,usable-memory-range", &len); + if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells))) + return 1; + + usablemem->base = dt_mem_next_cell(dt_root_addr_cells, ®); + usablemem->size = dt_mem_next_cell(dt_root_size_cells, ®); + + return 1; +} + +static void __init fdt_enforce_memory_region(void) +{ + struct memblock_region reg = { + .size = 0, + }; + + of_scan_flat_dt(early_init_dt_scan_usablemem, ®); + + if (reg.size) + memblock_cap_memory_range(reg.base, reg.size); +} + void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; + /* Handle linux,usable-memory-range property */ + fdt_enforce_memory_region(); + /* * Ensure that the linear region takes up exactly half of the kernel * virtual address space. This way, we can distinguish a linear address -- cgit v1.2.3 From 764b51ead10d5f428cb5f167bf98e336bdc23f8c Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 3 Apr 2017 11:24:32 +0900 Subject: arm64: kdump: reserve memory for crash dump kernel "crashkernel=" kernel parameter specifies the size (and optionally the start address) of the system ram to be used by crash dump kernel. reserve_crashkernel() will allocate and reserve that memory at boot time of primary kernel. The memory range will be exposed to userspace as a resource named "Crash kernel" in /proc/iomem. Signed-off-by: AKASHI Takahiro Signed-off-by: Mark Salter Signed-off-by: Pratyush Anand Reviewed-by: James Morse Acked-by: Catalin Marinas Reviewed-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/setup.c | 7 ++++- arch/arm64/mm/init.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 42274bda0ccb..28855ec1be95 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -226,6 +225,12 @@ static void __init request_standard_resources(void) if (kernel_data.start >= res->start && kernel_data.end <= res->end) request_resource(res, &kernel_data); +#ifdef CONFIG_KEXEC_CORE + /* Userspace will find "Crash kernel" region in /proc/iomem. 
*/ + if (crashk_res.end && crashk_res.start >= res->start && + crashk_res.end <= res->end) + request_resource(res, &crashk_res); +#endif } } diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 290794b1a0f1..09d19207362d 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ #include #include #include +#include #include #include @@ -77,6 +79,67 @@ static int __init early_initrd(char *p) early_param("initrd", early_initrd); #endif +#ifdef CONFIG_KEXEC_CORE +/* + * reserve_crashkernel() - reserves memory for crash kernel + * + * This function reserves memory area given in "crashkernel=" kernel command + * line parameter. The memory reserved is used by dump capture kernel when + * primary kernel is crashing. + */ +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_base, crash_size; + int ret; + + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base); + /* no crashkernel= or invalid value specified */ + if (ret || !crash_size) + return; + + crash_size = PAGE_ALIGN(crash_size); + + if (crash_base == 0) { + /* Current arm64 boot protocol requires 2MB alignment */ + crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT, + crash_size, SZ_2M); + if (crash_base == 0) { + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + } else { + /* User specifies base address explicitly. */ + if (!memblock_is_region_memory(crash_base, crash_size)) { + pr_warn("cannot reserve crashkernel: region is not memory\n"); + return; + } + + if (memblock_is_region_reserved(crash_base, crash_size)) { + pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n"); + return; + } + + if (!IS_ALIGNED(crash_base, SZ_2M)) { + pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n"); + return; + } + } + memblock_reserve(crash_base, crash_size); + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} +#else +static void __init reserve_crashkernel(void) +{ +} +#endif /* CONFIG_KEXEC_CORE */ + /* * Return the maximum physical address for ZONE_DMA (DMA_BIT_MASK(32)). It * currently assumes that for memory starting above 4G, 32-bit devices will @@ -332,6 +395,9 @@ void __init arm64_memblock_init(void) arm64_dma_phys_limit = max_zone_dma_phys(); else arm64_dma_phys_limit = PHYS_MASK + 1; + + reserve_crashkernel(); + dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); -- cgit v1.2.3 From 9b0aa14e31555b64460c1ee323d0e19bf891958b Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 3 Apr 2017 11:24:33 +0900 Subject: arm64: mm: add set_memory_valid() This function validates and invalidates PTE entries, and will be utilized in kdump to protect loaded crash dump kernel image. 
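"Valid" here is simply the valid bit of the descriptor (bit 0 on arm64): __change_memory_common() is asked to set or clear PTE_VALID across the range. A throwaway user-space illustration of that single-bit flip (the descriptor value is made up):

	#include <stdint.h>
	#include <stdio.h>

	#define PTE_VALID	(1ULL << 0)

	static uint64_t set_valid(uint64_t pte, int enable)
	{
		return enable ? (pte | PTE_VALID) : (pte & ~PTE_VALID);
	}

	int main(void)
	{
		uint64_t pte = 0x00e8000080200f03ULL;	/* made-up descriptor */

		printf("invalidated: 0x%016llx\n",
		       (unsigned long long)set_valid(pte, 0));
		printf("restored   : 0x%016llx\n",
		       (unsigned long long)set_valid(pte, 1));
		return 0;
	}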
Signed-off-by: AKASHI Takahiro Reviewed-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cacheflush.h | 1 + arch/arm64/mm/pageattr.c | 15 +++++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 5a2a6ee65f65..728f933cef8c 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -154,5 +154,6 @@ int set_memory_ro(unsigned long addr, int numpages); int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +int set_memory_valid(unsigned long addr, unsigned long size, int enable); #endif diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 8def55e7249b..3212ee0558f6 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -125,20 +125,23 @@ int set_memory_x(unsigned long addr, int numpages) } EXPORT_SYMBOL_GPL(set_memory_x); -#ifdef CONFIG_DEBUG_PAGEALLOC -void __kernel_map_pages(struct page *page, int numpages, int enable) +int set_memory_valid(unsigned long addr, int numpages, int enable) { - unsigned long addr = (unsigned long) page_address(page); - if (enable) - __change_memory_common(addr, PAGE_SIZE * numpages, + return __change_memory_common(addr, PAGE_SIZE * numpages, __pgprot(PTE_VALID), __pgprot(0)); else - __change_memory_common(addr, PAGE_SIZE * numpages, + return __change_memory_common(addr, PAGE_SIZE * numpages, __pgprot(0), __pgprot(PTE_VALID)); } + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + set_memory_valid((unsigned long)page_address(page), numpages, enable); +} #ifdef CONFIG_HIBERNATION /* * When built with CONFIG_DEBUG_PAGEALLOC and CONFIG_HIBERNATION, this function -- cgit v1.2.3 From 98d2e1539b84abddce4b3c2ca8733f6aeacdee47 Mon Sep 17 00:00:00 2001 From: Takahiro Akashi Date: Mon, 3 Apr 2017 11:24:34 +0900 Subject: arm64: kdump: protect crash dump kernel memory arch_kexec_protect_crashkres() and arch_kexec_unprotect_crashkres() are meant to be called by kexec_load() in order to protect the memory allocated for crash dump kernel once the image is loaded. The protection is implemented by unmapping the relevant segments in crash dump kernel memory, rather than making it read-only as other archs do, to prevent coherency issues due to potential cache aliasing (with mismatched attributes). Page-level mappings are consistently used here so that we can change the attributes of segments in page granularity as well as shrink the region also in page granularity through /sys/kernel/kexec_crash_size, putting the freed memory back to buddy system. 
Signed-off-by: AKASHI Takahiro Reviewed-by: Ard Biesheuvel Signed-off-by: Catalin Marinas --- arch/arm64/kernel/machine_kexec.c | 32 +++++++++--- arch/arm64/mm/mmu.c | 103 ++++++++++++++++++++------------------ 2 files changed, 80 insertions(+), 55 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index bc96c8a7fc79..b63baa749609 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -14,7 +14,9 @@ #include #include +#include #include +#include #include "cpu-reset.h" @@ -22,8 +24,6 @@ extern const unsigned char arm64_relocate_new_kernel[]; extern const unsigned long arm64_relocate_new_kernel_size; -static unsigned long kimage_start; - /** * kexec_image_info - For debugging output. */ @@ -64,8 +64,6 @@ void machine_kexec_cleanup(struct kimage *kimage) */ int machine_kexec_prepare(struct kimage *kimage) { - kimage_start = kimage->start; - kexec_image_info(kimage); if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) { @@ -183,7 +181,7 @@ void machine_kexec(struct kimage *kimage) kexec_list_flush(kimage); /* Flush the new image if already in place. */ - if (kimage->head & IND_DONE) + if ((kimage != kexec_crash_image) && (kimage->head & IND_DONE)) kexec_segment_flush(kimage); pr_info("Bye!\n"); @@ -201,7 +199,7 @@ void machine_kexec(struct kimage *kimage) */ cpu_soft_restart(1, reboot_code_buffer_phys, kimage->head, - kimage_start, 0); + kimage->start, 0); BUG(); /* Should never get here. */ } @@ -210,3 +208,25 @@ void machine_crash_shutdown(struct pt_regs *regs) { /* Empty routine needed to avoid build errors. */ } + +void arch_kexec_protect_crashkres(void) +{ + int i; + + kexec_segment_flush(kexec_crash_image); + + for (i = 0; i < kexec_crash_image->nr_segments; i++) + set_memory_valid( + __phys_to_virt(kexec_crash_image->segment[i].mem), + kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 0); +} + +void arch_kexec_unprotect_crashkres(void) +{ + int i; + + for (i = 0; i < kexec_crash_image->nr_segments; i++) + set_memory_valid( + __phys_to_virt(kexec_crash_image->segment[i].mem), + kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 1); +} diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 91502e36e6d9..0c429ec6fde8 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -393,10 +395,28 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, flush_tlb_kernel_range(virt, virt + size); } -static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, + phys_addr_t end, pgprot_t prot, int flags) +{ + __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, + prot, early_pgtable_alloc, flags); +} + +void __init mark_linear_text_alias_ro(void) +{ + /* + * Remove the write permissions from the linear alias of .text/.rodata + */ + update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), + (unsigned long)__init_begin - (unsigned long)_text, + PAGE_KERNEL_RO); +} + +static void __init map_mem(pgd_t *pgd) { phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); + struct memblock_region *reg; int flags = 0; if (debug_pagealloc_enabled()) @@ -405,30 +425,28 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end /* * Take care not to create a writable alias for the * read-only 
text and rodata sections of the kernel image. + * So temporarily mark them as NOMAP to skip mappings in + * the following for-loop */ + memblock_mark_nomap(kernel_start, kernel_end - kernel_start); +#ifdef CONFIG_KEXEC_CORE + if (crashk_res.end) + memblock_mark_nomap(crashk_res.start, + resource_size(&crashk_res)); +#endif - /* No overlap with the kernel text/rodata */ - if (end < kernel_start || start >= kernel_end) { - __create_pgd_mapping(pgd, start, __phys_to_virt(start), - end - start, PAGE_KERNEL, - early_pgtable_alloc, flags); - return; - } + /* map all the memory banks */ + for_each_memblock(memory, reg) { + phys_addr_t start = reg->base; + phys_addr_t end = start + reg->size; - /* - * This block overlaps the kernel text/rodata mappings. - * Map the portion(s) which don't overlap. - */ - if (start < kernel_start) - __create_pgd_mapping(pgd, start, - __phys_to_virt(start), - kernel_start - start, PAGE_KERNEL, - early_pgtable_alloc, flags); - if (kernel_end < end) - __create_pgd_mapping(pgd, kernel_end, - __phys_to_virt(kernel_end), - end - kernel_end, PAGE_KERNEL, - early_pgtable_alloc, flags); + if (start >= end) + break; + if (memblock_is_nomap(reg)) + continue; + + __map_memblock(pgd, start, end, PAGE_KERNEL, flags); + } /* * Map the linear alias of the [_text, __init_begin) interval @@ -440,37 +458,24 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end * Note that contiguous mappings cannot be remapped in this way, * so we should avoid them here. */ - __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), - kernel_end - kernel_start, PAGE_KERNEL, - early_pgtable_alloc, NO_CONT_MAPPINGS); -} + __map_memblock(pgd, kernel_start, kernel_end, + PAGE_KERNEL, NO_CONT_MAPPINGS); + memblock_clear_nomap(kernel_start, kernel_end - kernel_start); -void __init mark_linear_text_alias_ro(void) -{ +#ifdef CONFIG_KEXEC_CORE /* - * Remove the write permissions from the linear alias of .text/.rodata + * Use page-level mappings here so that we can shrink the region + * in page granularity and put back unused memory to buddy system + * through /sys/kernel/kexec_crash_size interface. */ - update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), - (unsigned long)__init_begin - (unsigned long)_text, - PAGE_KERNEL_RO); -} - -static void __init map_mem(pgd_t *pgd) -{ - struct memblock_region *reg; - - /* map all the memory banks */ - for_each_memblock(memory, reg) { - phys_addr_t start = reg->base; - phys_addr_t end = start + reg->size; - - if (start >= end) - break; - if (memblock_is_nomap(reg)) - continue; - - __map_memblock(pgd, start, end); + if (crashk_res.end) { + __map_memblock(pgd, crashk_res.start, crashk_res.end + 1, + PAGE_KERNEL, + NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); + memblock_clear_nomap(crashk_res.start, + resource_size(&crashk_res)); } +#endif } void mark_rodata_ro(void) -- cgit v1.2.3 From 254a41c0ba0573fa23272945d3fbe39efcc5d07d Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 3 Apr 2017 11:24:35 +0900 Subject: arm64: hibernate: preserve kdump image around hibernation Since arch_kexec_protect_crashkres() removes a mapping for crash dump kernel image, the loaded data won't be preserved around hibernation. In this patch, helper functions, crash_prepare_suspend()/ crash_post_resume(), are additionally called before/after hibernation so that the relevant memory segments will be mapped again and preserved just as the others are. 
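As an illustration, the resulting ordering in swsusp_arch_suspend(), condensed
from the hibernate.c hunks below:

	if (__cpu_suspend_enter(&state)) {
		crash_prepare_suspend();	/* remap segments so swsusp_save() can copy them */
		ret = swsusp_save();
	} else {
		/* resume path */
		crash_post_resume();		/* image restored: unmap the segments again */
	}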
In addition, to minimize the size of hibernation image, crash_is_nosave() is added to pfn_is_nosave() in order to recognize only the pages that hold loaded crash dump kernel image as saveable. Hibernation excludes any pages that are marked as Reserved and yet "nosave." Signed-off-by: AKASHI Takahiro Reviewed-by: James Morse Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/kexec.h | 10 ++++++ arch/arm64/kernel/hibernate.c | 10 +++++- arch/arm64/kernel/machine_kexec.c | 71 +++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/init.c | 27 +++++++++++++++ 4 files changed, 117 insertions(+), 1 deletion(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/include/asm/kexec.h b/arch/arm64/include/asm/kexec.h index 04744dc5fb61..90aabbe893b7 100644 --- a/arch/arm64/include/asm/kexec.h +++ b/arch/arm64/include/asm/kexec.h @@ -43,6 +43,16 @@ static inline void crash_setup_regs(struct pt_regs *newregs, /* Empty routine needed to avoid build errors. */ } +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION) +extern bool crash_is_nosave(unsigned long pfn); +extern void crash_prepare_suspend(void); +extern void crash_post_resume(void); +#else +static inline bool crash_is_nosave(unsigned long pfn) {return false; } +static inline void crash_prepare_suspend(void) {} +static inline void crash_post_resume(void) {} +#endif + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 97a7384100f3..a44e13942d30 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -102,7 +103,8 @@ int pfn_is_nosave(unsigned long pfn) unsigned long nosave_begin_pfn = sym_to_pfn(&__nosave_begin); unsigned long nosave_end_pfn = sym_to_pfn(&__nosave_end - 1); - return (pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn); + return ((pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn)) || + crash_is_nosave(pfn); } void notrace save_processor_state(void) @@ -286,6 +288,9 @@ int swsusp_arch_suspend(void) local_dbg_save(flags); if (__cpu_suspend_enter(&state)) { + /* make the crash dump kernel image visible/saveable */ + crash_prepare_suspend(); + sleep_cpu = smp_processor_id(); ret = swsusp_save(); } else { @@ -297,6 +302,9 @@ int swsusp_arch_suspend(void) if (el2_reset_needed()) dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end); + /* make the crash dump kernel image protected again */ + crash_post_resume(); + /* * Tell the hibernation core that we've just restored * the memory diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c index b63baa749609..a6d66b98d795 100644 --- a/arch/arm64/kernel/machine_kexec.c +++ b/arch/arm64/kernel/machine_kexec.c @@ -10,6 +10,7 @@ */ #include +#include #include #include @@ -230,3 +231,73 @@ void arch_kexec_unprotect_crashkres(void) __phys_to_virt(kexec_crash_image->segment[i].mem), kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 1); } + +#ifdef CONFIG_HIBERNATION +/* + * To preserve the crash dump kernel image, the relevant memory segments + * should be mapped again around the hibernation. + */ +void crash_prepare_suspend(void) +{ + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); +} + +void crash_post_resume(void) +{ + if (kexec_crash_image) + arch_kexec_protect_crashkres(); +} + +/* + * crash_is_nosave + * + * Return true only if a page is part of reserved memory for crash dump kernel, + * but does not hold any data of loaded kernel image. 
+ * + * Note that all the pages in crash dump kernel memory have been initially + * marked as Reserved in kexec_reserve_crashkres_pages(). + * + * In hibernation, the pages which are Reserved and yet "nosave" are excluded + * from the hibernation iamge. crash_is_nosave() does thich check for crash + * dump kernel and will reduce the total size of hibernation image. + */ + +bool crash_is_nosave(unsigned long pfn) +{ + int i; + phys_addr_t addr; + + if (!crashk_res.end) + return false; + + /* in reserved memory? */ + addr = __pfn_to_phys(pfn); + if ((addr < crashk_res.start) || (crashk_res.end < addr)) + return false; + + if (!kexec_crash_image) + return true; + + /* not part of loaded kernel image? */ + for (i = 0; i < kexec_crash_image->nr_segments; i++) + if (addr >= kexec_crash_image->segment[i].mem && + addr < (kexec_crash_image->segment[i].mem + + kexec_crash_image->segment[i].memsz)) + return false; + + return true; +} + +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + struct page *page; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + page = phys_to_page(addr); + ClearPageReserved(page); + free_reserved_page(page); + } +} +#endif /* CONFIG_HIBERNATION */ diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 09d19207362d..89ba3cd0fe44 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -134,10 +134,35 @@ static void __init reserve_crashkernel(void) crashk_res.start = crash_base; crashk_res.end = crash_base + crash_size - 1; } + +static void __init kexec_reserve_crashkres_pages(void) +{ +#ifdef CONFIG_HIBERNATION + phys_addr_t addr; + struct page *page; + + if (!crashk_res.end) + return; + + /* + * To reduce the size of hibernation image, all the pages are + * marked as Reserved initially. + */ + for (addr = crashk_res.start; addr < (crashk_res.end + 1); + addr += PAGE_SIZE) { + page = phys_to_page(addr); + SetPageReserved(page); + } +#endif +} #else static void __init reserve_crashkernel(void) { } + +static void __init kexec_reserve_crashkres_pages(void) +{ +} #endif /* CONFIG_KEXEC_CORE */ /* @@ -517,6 +542,8 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ free_all_bootmem(); + kexec_reserve_crashkres_pages(); + mem_init_print_info(NULL); #define MLK(b, t) b, t, ((t) - (b)) >> 10 -- cgit v1.2.3 From e62aaeac426ab1ddbdde524797b2a7835f606d91 Mon Sep 17 00:00:00 2001 From: AKASHI Takahiro Date: Mon, 3 Apr 2017 11:24:38 +0900 Subject: arm64: kdump: provide /proc/vmcore file Arch-specific functions are added to allow for implementing a crash dump file interface, /proc/vmcore, which can be viewed as a ELF file. A user space tool, like kexec-tools, is responsible for allocating a separate region for the core's ELF header within crash kdump kernel memory and filling it in when executing kexec_load(). Then, its location will be advertised to crash dump kernel via a new device-tree property, "linux,elfcorehdr", and crash dump kernel preserves the region for later use with reserve_elfcorehdr() at boot time. On crash dump kernel, /proc/vmcore will access the primary kernel's memory with copy_oldmem_page(), which feeds the data page-by-page by ioremap'ing it since it does not reside in linear mapping on crash dump kernel. Meanwhile, elfcorehdr_read() is simple as the region is always mapped. 
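For illustration, the property that user space (e.g. kexec-tools) is expected
to add to the crash dump kernel's device tree would look roughly like this
(the address and size values are hypothetical and are encoded with the root
node's #address-cells/#size-cells, as the parsing code below assumes):

	/chosen {
		linux,elfcorehdr = <0x0 0xdfb46000 0x0 0x10000>;
	};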
Signed-off-by: AKASHI Takahiro Reviewed-by: James Morse Acked-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 11 +++++++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/crash_dump.c | 71 ++++++++++++++++++++++++++++++++++++++++++ arch/arm64/mm/init.c | 53 +++++++++++++++++++++++++++++++ 4 files changed, 136 insertions(+) create mode 100644 arch/arm64/kernel/crash_dump.c (limited to 'arch/arm64/mm') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 3741859765cf..e7f043efff41 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -736,6 +736,17 @@ config KEXEC but it is independent of the system firmware. And like a reboot you can start any kernel with it, not just Linux. +config CRASH_DUMP + bool "Build kdump crash kernel" + help + Generate crash dump after being started by kexec. This should + be normally only set in special crash dump kernels which are + loaded in the main kernel with kexec-tools into a specially + reserved region and then later executed after a crash by + kdump/kexec. + + For more details see Documentation/kdump/kdump.txt + config XEN_DOM0 def_bool y depends on XEN diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index aaaf06b117e3..1dcb69d3d0e5 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -52,6 +52,7 @@ arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ cpu-reset.o arm64-obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o +arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c new file mode 100644 index 000000000000..f46d57c31443 --- /dev/null +++ b/arch/arm64/kernel/crash_dump.c @@ -0,0 +1,71 @@ +/* + * Routines for doing kexec-based kdump + * + * Copyright (C) 2017 Linaro Limited + * Author: AKASHI Takahiro + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +/** + * copy_oldmem_page() - copy one page from old kernel memory + * @pfn: page frame number to be copied + * @buf: buffer where the copied page is placed + * @csize: number of bytes to copy + * @offset: offset in bytes into the page + * @userbuf: if set, @buf is in a user address space + * + * This function copies one page from old kernel memory into buffer pointed by + * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes + * copied or negative error in case of failure. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, + int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB); + if (!vaddr) + return -ENOMEM; + + if (userbuf) { + if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { + memunmap(vaddr); + return -EFAULT; + } + } else { + memcpy(buf, vaddr + offset, csize); + } + + memunmap(vaddr); + + return csize; +} + +/** + * elfcorehdr_read - read from ELF core header + * @buf: buffer where the data is placed + * @csize: number of bytes to read + * @ppos: address in the memory + * + * This function reads @count bytes from elf core header which exists + * on crash dump kernel's memory. 
+ */ +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + return count; +} diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 89ba3cd0fe44..5960bef0170d 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -165,6 +166,56 @@ static void __init kexec_reserve_crashkres_pages(void) } #endif /* CONFIG_KEXEC_CORE */ +#ifdef CONFIG_CRASH_DUMP +static int __init early_init_dt_scan_elfcorehdr(unsigned long node, + const char *uname, int depth, void *data) +{ + const __be32 *reg; + int len; + + if (depth != 1 || strcmp(uname, "chosen") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len); + if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells))) + return 1; + + elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, ®); + elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, ®); + + return 1; +} + +/* + * reserve_elfcorehdr() - reserves memory for elf core header + * + * This function reserves the memory occupied by an elf core header + * described in the device tree. This region contains all the + * information about primary kernel's core image and is used by a dump + * capture kernel to access the system memory on primary kernel. + */ +static void __init reserve_elfcorehdr(void) +{ + of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL); + + if (!elfcorehdr_size) + return; + + if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) { + pr_warn("elfcorehdr is overlapped\n"); + return; + } + + memblock_reserve(elfcorehdr_addr, elfcorehdr_size); + + pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n", + elfcorehdr_size >> 10, elfcorehdr_addr); +} +#else +static void __init reserve_elfcorehdr(void) +{ +} +#endif /* CONFIG_CRASH_DUMP */ /* * Return the maximum physical address for ZONE_DMA (DMA_BIT_MASK(32)). It * currently assumes that for memory starting above 4G, 32-bit devices will @@ -423,6 +474,8 @@ void __init arm64_memblock_init(void) reserve_crashkernel(); + reserve_elfcorehdr(); + dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); -- cgit v1.2.3 From b824b930682308ce453b73b0e3ba9686e1f3a93a Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Wed, 5 Apr 2017 12:18:31 -0700 Subject: arm64: print a fault message when attempting to write RO memory If a page is marked read only we should print out that fact, instead of printing out that there was a page fault. Right now we get a cryptic error message that something went wrong with an unhandled fault, but we don't evaluate the esr to figure out that it was a read/write permission fault. Instead of seeing: Unable to handle kernel paging request at virtual address ffff000008e460d8 pgd = ffff800003504000 [ffff000008e460d8] *pgd=0000000083473003, *pud=0000000083503003, *pmd=0000000000000000 Internal error: Oops: 9600004f [#1] PREEMPT SMP we'll see: Unable to handle kernel write to read-only memory at virtual address ffff000008e760d8 pgd = ffff80003d3de000 [ffff000008e760d8] *pgd=0000000083472003, *pud=0000000083435003, *pmd=0000000000000000 Internal error: Oops: 9600004f [#1] PREEMPT SMP We also add a userspace address check into is_permission_fault() so that the function doesn't return true for ttbr0 PAN faults when it shouldn't. 
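For reference, the two syndrome fields that the reworked is_permission_fault()
and the new message selection rely on decode roughly as follows (a sketch
using the same ESR_ELx_* macros as the diff below):

	bool is_write = esr & ESR_ELx_WNR;				/* write-not-read bit */
	bool is_perm  = (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_PERM;	/* permission fault status */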
Reviewed-by: James Morse Tested-by: James Morse Acked-by: Laura Abbott Cc: Mark Rutland Signed-off-by: Stephen Boyd Signed-off-by: Catalin Marinas --- arch/arm64/mm/fault.c | 55 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 4bf899fb451b..61114b7b6b9e 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -161,12 +161,33 @@ static bool is_el1_instruction_abort(unsigned int esr) return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR; } +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs, + unsigned long addr) +{ + unsigned int ec = ESR_ELx_EC(esr); + unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; + + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (fsc_type == ESR_ELx_FSC_PERM) + return true; + + if (addr < USER_DS && system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + + return false; +} + /* * The kernel tried to access some page that wasn't present. */ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, unsigned int esr, struct pt_regs *regs) { + const char *msg; + /* * Are we prepared to handle this kernel fault? * We are almost certainly not prepared to handle instruction faults. @@ -178,9 +199,20 @@ static void __do_kernel_fault(struct mm_struct *mm, unsigned long addr, * No handler, we'll have to terminate things with extreme prejudice. */ bust_spinlocks(1); - pr_alert("Unable to handle kernel %s at virtual address %08lx\n", - (addr < PAGE_SIZE) ? "NULL pointer dereference" : - "paging request", addr); + + if (is_permission_fault(esr, regs, addr)) { + if (esr & ESR_ELx_WNR) + msg = "write to read-only memory"; + else + msg = "read from unreadable memory"; + } else if (addr < PAGE_SIZE) { + msg = "NULL pointer dereference"; + } else { + msg = "paging request"; + } + + pr_alert("Unable to handle kernel %s at virtual address %08lx\n", msg, + addr); show_pte(mm, addr); die("Oops", regs, esr); @@ -270,21 +302,6 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) -{ - unsigned int ec = ESR_ELx_EC(esr); - unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - - if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) - return false; - - if (system_uses_ttbr0_pan()) - return fsc_type == ESR_ELx_FSC_FAULT && - (regs->pstate & PSR_PAN_BIT); - else - return fsc_type == ESR_ELx_FSC_PERM; -} - static bool is_el0_instruction_abort(unsigned int esr) { return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW; @@ -322,7 +339,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (addr < USER_DS && is_permission_fault(esr, regs)) { + if (addr < USER_DS && is_permission_fault(esr, regs, addr)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); -- cgit v1.2.3 From 92f66f84d9695d07adf9bc987bbcce4bf9b8e87c Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 25 Apr 2017 15:42:31 +0100 Subject: arm64: Fix the DMA mmap and get_sgtable API with DMA_ATTR_FORCE_CONTIGUOUS While honouring the DMA_ATTR_FORCE_CONTIGUOUS on arm64 (commit 44176bb38fa4: "arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to IOMMU"), the existing uses of dma_mmap_attrs() and dma_get_sgtable() have been broken by passing a physically contiguous 
vm_struct with an invalid pages pointer through the common iommu API. Since the coherent allocation with DMA_ATTR_FORCE_CONTIGUOUS uses CMA, this patch simply reuses the existing swiotlb logic for mmap and get_sgtable. Note that the current implementation of get_sgtable (both swiotlb and iommu) is broken if dma_declare_coherent_memory() is used since such memory does not have a corresponding struct page. To be addressed in a subsequent patch. Fixes: 44176bb38fa4 ("arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to IOMMU") Reported-by: Andrzej Hajda Cc: Geert Uytterhoeven Acked-by: Robin Murphy Tested-by: Andrzej Hajda Reviewed-by: Andrzej Hajda Signed-off-by: Catalin Marinas --- arch/arm64/mm/dma-mapping.c | 65 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 16 deletions(-) (limited to 'arch/arm64/mm') diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index f7b54019ef55..c9e53dec3695 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -308,24 +308,15 @@ static void __swiotlb_sync_sg_for_device(struct device *dev, sg->length, dir); } -static int __swiotlb_mmap(struct device *dev, - struct vm_area_struct *vma, - void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) +static int __swiotlb_mmap_pfn(struct vm_area_struct *vma, + unsigned long pfn, size_t size) { int ret = -ENXIO; unsigned long nr_vma_pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; - unsigned long pfn = dma_to_phys(dev, dma_addr) >> PAGE_SHIFT; unsigned long off = vma->vm_pgoff; - vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot, - is_device_dma_coherent(dev)); - - if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) - return ret; - if (off < nr_pages && nr_vma_pages <= (nr_pages - off)) { ret = remap_pfn_range(vma, vma->vm_start, pfn + off, @@ -336,19 +327,43 @@ static int __swiotlb_mmap(struct device *dev, return ret; } -static int __swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, - void *cpu_addr, dma_addr_t handle, size_t size, - unsigned long attrs) +static int __swiotlb_mmap(struct device *dev, + struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ + int ret; + unsigned long pfn = dma_to_phys(dev, dma_addr) >> PAGE_SHIFT; + + vma->vm_page_prot = __get_dma_pgprot(attrs, vma->vm_page_prot, + is_device_dma_coherent(dev)); + + if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) + return ret; + + return __swiotlb_mmap_pfn(vma, pfn, size); +} + +static int __swiotlb_get_sgtable_page(struct sg_table *sgt, + struct page *page, size_t size) { int ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (!ret) - sg_set_page(sgt->sgl, phys_to_page(dma_to_phys(dev, handle)), - PAGE_ALIGN(size), 0); + sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0); return ret; } +static int __swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, + void *cpu_addr, dma_addr_t handle, size_t size, + unsigned long attrs) +{ + struct page *page = phys_to_page(dma_to_phys(dev, handle)); + + return __swiotlb_get_sgtable_page(sgt, page, size); +} + static int __swiotlb_dma_supported(struct device *hwdev, u64 mask) { if (swiotlb) @@ -703,6 +718,15 @@ static int __iommu_mmap_attrs(struct device *dev, struct vm_area_struct *vma, if (dma_mmap_from_coherent(dev, vma, cpu_addr, size, &ret)) return ret; + if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + /* + * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped, + * 
hence in the vmalloc space. + */ + unsigned long pfn = vmalloc_to_pfn(cpu_addr); + return __swiotlb_mmap_pfn(vma, pfn, size); + } + area = find_vm_area(cpu_addr); if (WARN_ON(!area || !area->pages)) return -ENXIO; @@ -717,6 +741,15 @@ static int __iommu_get_sgtable(struct device *dev, struct sg_table *sgt, unsigned int count = PAGE_ALIGN(size) >> PAGE_SHIFT; struct vm_struct *area = find_vm_area(cpu_addr); + if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) { + /* + * DMA_ATTR_FORCE_CONTIGUOUS allocations are always remapped, + * hence in the vmalloc space. + */ + struct page *page = vmalloc_to_page(cpu_addr); + return __swiotlb_get_sgtable_page(sgt, page, size); + } + if (WARN_ON(!area || !area->pages)) return -ENXIO; -- cgit v1.2.3
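A usage sketch of the attribute this fix covers (dev, size and the mmap
handler are hypothetical; error handling omitted):

	unsigned long attrs = DMA_ATTR_FORCE_CONTIGUOUS;
	dma_addr_t dma_handle;
	void *cpu_addr = dma_alloc_attrs(dev, size, &dma_handle, GFP_KERNEL, attrs);

	/* In the driver's mmap() handler; with this patch the request is
	 * served via the swiotlb/CMA helpers even behind an IOMMU. */
	ret = dma_mmap_attrs(dev, vma, cpu_addr, dma_handle, size, attrs);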