From 6c284228eb356a1ec62a704b4d2329711831eaed Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 3 Jun 2019 08:20:28 +0000
Subject: powerpc: Fix kexec failure on book3s/32

In the old days, _PAGE_EXEC didn't exist on 6xx aka book3s/32.
Therefore, although __mapin_ram_chunk() was already mapping kernel
text with PAGE_KERNEL_TEXT and the rest with PAGE_KERNEL, the entire
memory was executable. Part of the memory (the first 512 kbytes) was
mapped with BATs instead of page tables, but it was also entirely
mapped as executable.

In commit 385e89d5b20f ("powerpc/mm: add exec protection on
powerpc 603"), we started adding exec protection to some 6xx, namely
the 603, for pages mapped via pagetables.

Then, in commit 63b2bc619565 ("powerpc/mm/32s: Use BATs for
STRICT_KERNEL_RWX"), the exec protection was extended to BAT mapped
memory, so that really only the kernel text could be executed.

The problem here is that kexec is based on copying some code into the
upper part of memory and then executing it from there in order to
install a fresh new kernel at its definitive location.

However, the code is position independent and the first part of it is
just there to deactivate the MMU and jump to the second part. So it
is possible to run this first part in place instead of running the
copy. Once the MMU is off, there is no protection anymore and the
second part of the code will just run as before.

Reported-by: Aaro Koskinen
Fixes: 63b2bc619565 ("powerpc/mm/32s: Use BATs for STRICT_KERNEL_RWX")
Cc: stable@vger.kernel.org # v5.1+
Signed-off-by: Christophe Leroy
Tested-by: Aaro Koskinen
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/kexec.h       | 3 +++
 arch/powerpc/kernel/machine_kexec_32.c | 4 +++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h
index 4a585cba1787..c68476818753 100644
--- a/arch/powerpc/include/asm/kexec.h
+++ b/arch/powerpc/include/asm/kexec.h
@@ -94,6 +94,9 @@ static inline bool kdump_in_progress(void)
 	return crashing_cpu >= 0;
 }
 
+void relocate_new_kernel(unsigned long indirection_page, unsigned long reboot_code_buffer,
+			 unsigned long start_address) __noreturn;
+
 #ifdef CONFIG_KEXEC_FILE
 
 extern const struct kexec_file_ops kexec_elf64_ops;
diff --git a/arch/powerpc/kernel/machine_kexec_32.c b/arch/powerpc/kernel/machine_kexec_32.c
index affe5dcce7f4..2b160d68db49 100644
--- a/arch/powerpc/kernel/machine_kexec_32.c
+++ b/arch/powerpc/kernel/machine_kexec_32.c
@@ -30,7 +30,6 @@ typedef void (*relocate_new_kernel_t)(
  */
 void default_machine_kexec(struct kimage *image)
 {
-	extern const unsigned char relocate_new_kernel[];
 	extern const unsigned int relocate_new_kernel_size;
 	unsigned long page_list;
 	unsigned long reboot_code_buffer, reboot_code_buffer_phys;
@@ -58,6 +57,9 @@ void default_machine_kexec(struct kimage *image)
 			reboot_code_buffer + KEXEC_CONTROL_PAGE_SIZE);
 
 	printk(KERN_INFO "Bye!\n");
 
+	if (!IS_ENABLED(CONFIG_FSL_BOOKE) && !IS_ENABLED(CONFIG_44x))
+		relocate_new_kernel(page_list, reboot_code_buffer_phys, image->start);
+
 	/* now call it */
 	rnk = (relocate_new_kernel_t) reboot_code_buffer;
 	(*rnk)(page_list, reboot_code_buffer_phys, image->start);
--
cgit v1.2.3
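For clarity, the tail of default_machine_kexec() after this change behaves as
sketched below. This is a simplified restatement of the hunks above with
explanatory comments added, not the literal upstream function; declarations and
the preceding setup code are omitted.

	printk(KERN_INFO "Bye!\n");

	/*
	 * On platforms other than FSL_BOOKE and 44x, the copy placed in
	 * reboot_code_buffer may now live in memory mapped without execute
	 * permission.  The relocation code is position independent and its
	 * first instructions only turn the MMU off, so call it at its linked
	 * address in (executable) kernel text instead; once the MMU is off,
	 * execute protection no longer applies and the rest runs as before.
	 */
	if (!IS_ENABLED(CONFIG_FSL_BOOKE) && !IS_ENABLED(CONFIG_44x))
		relocate_new_kernel(page_list, reboot_code_buffer_phys, image->start);

	/* Only reached on FSL_BOOKE/44x: jump to the copied code as before. */
	rnk = (relocate_new_kernel_t) reboot_code_buffer;
	(*rnk)(page_list, reboot_code_buffer_phys, image->start);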
From 33258a1db165cf43a9e6382587ad06e9b7f8187c Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Fri, 7 Jun 2019 13:56:35 +1000
Subject: powerpc/64s: Fix THP PMD collapse serialisation

Commit 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion
in pte helpers") changed the actual bitwise tests in pte_access_permitted
by using pte_write() and pte_present() helpers rather than raw bitwise
testing _PAGE_WRITE and _PAGE_PRESENT bits.

The pte_present() change now returns true for PTEs which are
!_PAGE_PRESENT and _PAGE_INVALID, which is the combination used by
pmdp_invalidate() to synchronize access from lock-free lookups.
pte_access_permitted() is used by pmd_access_permitted(), so allowing
GUP lock free access to proceed with such PTEs breaks this
synchronisation.

This bug has been observed on a host using the hash page table MMU,
with random crashes and corruption in guests, usually together with
bad PMD messages in the host.

Fix this by adding an explicit check in pmd_access_permitted(), and
documenting the condition explicitly.

The pte_write() change should be okay, and would prevent GUP from
falling back to the slow path when encountering savedwrite PTEs, which
matches what x86 (that does not implement savedwrite) does.

Fixes: 1b2443a547f9 ("powerpc/book3s64: Avoid multiple endian conversion in pte helpers")
Cc: stable@vger.kernel.org # v4.20+
Signed-off-by: Nicholas Piggin
Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/book3s/64/pgtable.h | 30 ++++++++++++++++++++++++++++
 arch/powerpc/mm/book3s64/pgtable.c           |  3 +++
 2 files changed, 33 insertions(+)

diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 7dede2e34b70..ccf00a8b98c6 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -876,6 +876,23 @@ static inline int pmd_present(pmd_t pmd)
 	return false;
 }
 
+static inline int pmd_is_serializing(pmd_t pmd)
+{
+	/*
+	 * If the pmd is undergoing a split, the _PAGE_PRESENT bit is clear
+	 * and _PAGE_INVALID is set (see pmd_present, pmdp_invalidate).
+	 *
+	 * This condition may also occur when flushing a pmd while flushing
+	 * it (see ptep_modify_prot_start), so callers must ensure this
+	 * case is fine as well.
+	 */
+	if ((pmd_raw(pmd) & cpu_to_be64(_PAGE_PRESENT | _PAGE_INVALID)) ==
+	    cpu_to_be64(_PAGE_INVALID))
+		return true;
+
+	return false;
+}
+
 static inline int pmd_bad(pmd_t pmd)
 {
 	if (radix_enabled())
@@ -1092,6 +1109,19 @@ static inline int pmd_protnone(pmd_t pmd)
 #define pmd_access_permitted pmd_access_permitted
 static inline bool pmd_access_permitted(pmd_t pmd, bool write)
 {
+	/*
+	 * pmdp_invalidate sets this combination (which is not caught by
+	 * !pte_present() check in pte_access_permitted), to prevent
+	 * lock-free lookups, as part of the serialize_against_pte_lookup()
+	 * synchronisation.
+	 *
+	 * This also catches the case where the PTE's hardware PRESENT bit is
+	 * cleared while TLB is flushed, which is suboptimal but should not
+	 * be frequent.
+	 */
+	if (pmd_is_serializing(pmd))
+		return false;
+
 	return pte_access_permitted(pmd_pte(pmd), write);
 }
 
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 16bda049187a..ff98b663c83e 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -116,6 +116,9 @@ pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 	/*
 	 * This ensures that generic code that rely on IRQ disabling
 	 * to prevent a parallel THP split work as expected.
+	 *
+	 * Marking the entry with _PAGE_INVALID && ~_PAGE_PRESENT requires
+	 * a special case check in pmd_access_permitted.
 	 */
 	serialize_against_pte_lookup(vma->vm_mm);
 	return __pmd(old_pmd);
--
cgit v1.2.3
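To see why the explicit check is needed, the state involved can be modelled in
ordinary C. The bit positions and helper bodies below are illustrative
stand-ins, not the real book3s/64 definitions (which live in
arch/powerpc/include/asm/book3s/64/pgtable.h and also test read/privilege
bits); the point is only that an entry marked by pmdp_invalidate() still looks
"present", so a bare !pte_present() test no longer filters it out.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in bit positions, not the real book3s/64 values. */
#define _PAGE_PRESENT	(1ULL << 63)
#define _PAGE_INVALID	(1ULL << 62)
#define _PAGE_WRITE	(1ULL << 1)

/* Model of pmd_present(): an invalidated pmd is still "present" to generic
 * code, because either _PAGE_PRESENT or _PAGE_INVALID counts. */
static bool model_pmd_present(uint64_t pmd)
{
	return pmd & (_PAGE_PRESENT | _PAGE_INVALID);
}

/* The check this patch adds: _PAGE_INVALID set with _PAGE_PRESENT clear means
 * pmdp_invalidate() is serialising against lock-free lookups. */
static bool model_pmd_is_serializing(uint64_t pmd)
{
	return (pmd & (_PAGE_PRESENT | _PAGE_INVALID)) == _PAGE_INVALID;
}

/* Simplified pmd_access_permitted(): the real code checks more bits. */
static bool model_pmd_access_permitted(uint64_t pmd, bool write)
{
	if (model_pmd_is_serializing(pmd))
		return false;		/* force GUP fast onto the slow path */
	if (!model_pmd_present(pmd))
		return false;
	return !write || (pmd & _PAGE_WRITE);
}

int main(void)
{
	uint64_t live = _PAGE_PRESENT | _PAGE_WRITE;
	uint64_t invalidated = (live & ~_PAGE_PRESENT) | _PAGE_INVALID;

	/* The invalidated entry is still "present"; without the explicit
	 * serialising check, a lock-free lookup would wrongly proceed. */
	printf("live:        present=%d permitted=%d\n",
	       (int)model_pmd_present(live),
	       (int)model_pmd_access_permitted(live, true));
	printf("invalidated: present=%d permitted=%d\n",
	       (int)model_pmd_present(invalidated),
	       (int)model_pmd_access_permitted(invalidated, true));
	return 0;
}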
From a00196a272161338d4b1d66ec69e3d57c6b280e0 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin
Date: Fri, 7 Jun 2019 13:56:36 +1000
Subject: powerpc/64s: __find_linux_pte() synchronization vs pmdp_invalidate()

The change to pmdp_invalidate() to mark the pmd with _PAGE_INVALID
broke the synchronisation against lock-free lookups:
__find_linux_pte()'s pmd_none() check no longer returns true for such
cases.

Fix this by adding a check for this condition as well.

Fixes: da7ad366b497 ("powerpc/mm/book3s: Update pmd_present to look at _PAGE_PRESENT bit")
Cc: stable@vger.kernel.org # v4.20+
Suggested-by: Aneesh Kumar K.V
Signed-off-by: Nicholas Piggin
Reviewed-by: Aneesh Kumar K.V
Signed-off-by: Michael Ellerman
---
 arch/powerpc/mm/pgtable.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index db4a6253df92..533fc6fa6726 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -372,13 +372,25 @@ pte_t *__find_linux_pte(pgd_t *pgdir, unsigned long ea,
 	pdshift = PMD_SHIFT;
 	pmdp = pmd_offset(&pud, ea);
 	pmd = READ_ONCE(*pmdp);
+
 	/*
-	 * A hugepage collapse is captured by pmd_none, because
-	 * it mark the pmd none and do a hpte invalidate.
+	 * A hugepage collapse is captured by this condition, see
+	 * pmdp_collapse_flush.
 	 */
 	if (pmd_none(pmd))
 		return NULL;
 
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * A hugepage split is captured by this condition, see
+	 * pmdp_invalidate.
+	 *
+	 * Huge page modification can be caught here too.
+	 */
+	if (pmd_is_serializing(pmd))
+		return NULL;
+#endif
+
 	if (pmd_trans_huge(pmd) || pmd_devmap(pmd)) {
 		if (is_thp)
 			*is_thp = true;
--
cgit v1.2.3
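Taken together with the previous patch, the pmd-level checks that a lock-free
walker now performs look roughly as follows. This is a condensed sketch of the
hunk above with added comments, not the complete function.

	pmd = READ_ONCE(*pmdp);		/* one snapshot; no page table lock held */

	/* Hugepage collapse clears the pmd entirely (pmdp_collapse_flush). */
	if (pmd_none(pmd))
		return NULL;

#ifdef CONFIG_PPC_BOOK3S_64
	/*
	 * Hugepage split (and other invalidations) instead mark the pmd
	 * !_PAGE_PRESENT but _PAGE_INVALID (pmdp_invalidate), which
	 * pmd_none() does not catch, so bail out explicitly and let the
	 * caller retry or fall back once the update has completed.
	 */
	if (pmd_is_serializing(pmd))
		return NULL;
#endif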
From c21f5a9ed85ca3e914ca11f421677ae9ae0d04b0 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Mon, 3 Jun 2019 13:00:51 +0000
Subject: powerpc/32s: fix booting with CONFIG_PPC_EARLY_DEBUG_BOOTX

When booting through OF, setup_disp_bat() does nothing because the
disp_BAT values are not set.

By chance, it used to work because the BOOTX buffer is mapped 1:1 at
address 0x81000000 by the bootloader, and btext_setup_display() sets
the virt addr the same as the phys addr.

But since commit 215b823707ce ("powerpc/32s: set up an early static
hash table for KASAN."), a temporary page table overrides the
bootloader mapping.

This 0x81000000 is also problematic with the newly implemented Kernel
Userspace Access Protection (KUAP) because it is within user address
space.

This patch fixes those issues by properly setting disp_BAT through a
call to btext_prepare_BAT(), allowing setup_disp_bat() to set up BAT3
for early bootx screen buffer access.

Reported-by: Mathieu Malaterre
Fixes: 215b823707ce ("powerpc/32s: set up an early static hash table for KASAN.")
Signed-off-by: Christophe Leroy
Tested-by: Mathieu Malaterre
Signed-off-by: Michael Ellerman
---
 arch/powerpc/include/asm/btext.h       | 4 ++++
 arch/powerpc/kernel/prom_init.c        | 1 +
 arch/powerpc/kernel/prom_init_check.sh | 2 +-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/btext.h b/arch/powerpc/include/asm/btext.h
index 3ffad030393c..461b0f193864 100644
--- a/arch/powerpc/include/asm/btext.h
+++ b/arch/powerpc/include/asm/btext.h
@@ -13,7 +13,11 @@ extern void btext_update_display(unsigned long phys, int width, int height,
 				 int depth, int pitch);
 extern void btext_setup_display(int width, int height, int depth, int pitch,
 				unsigned long address);
+#ifdef CONFIG_PPC32
 extern void btext_prepare_BAT(void);
+#else
+static inline void btext_prepare_BAT(void) { }
+#endif
 extern void btext_map(void);
 extern void btext_unmap(void);
 
diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c
index 00682b8df330..61795c39de21 100644
--- a/arch/powerpc/kernel/prom_init.c
+++ b/arch/powerpc/kernel/prom_init.c
@@ -2340,6 +2340,7 @@ static void __init prom_check_displays(void)
 		prom_printf("W=%d H=%d LB=%d addr=0x%x\n",
 			    width, height, pitch, addr);
 		btext_setup_display(width, height, 8, pitch, addr);
+		btext_prepare_BAT();
 	}
 #endif /* CONFIG_PPC_EARLY_DEBUG_BOOTX */
 }
diff --git a/arch/powerpc/kernel/prom_init_check.sh b/arch/powerpc/kernel/prom_init_check.sh
index 4cac45cb5de5..acf63ad8f4ce 100644
--- a/arch/powerpc/kernel/prom_init_check.sh
+++ b/arch/powerpc/kernel/prom_init_check.sh
@@ -27,7 +27,7 @@ fi
 
 WHITELIST="add_reloc_offset __bss_start __bss_stop copy_and_flush
 _end enter_prom $MEM_FUNCS reloc_offset __secondary_hold
 __secondary_hold_acknowledge __secondary_hold_spinloop __start
-logo_linux_clut224
+logo_linux_clut224 btext_prepare_BAT
 reloc_got2 kernstart_addr memstart_addr linux_banner _stext
 __prom_init_toc_start __prom_init_toc_end btext_setup_display TOC."
--
cgit v1.2.3