Diffstat (limited to 'arch')
633 files changed, 10999 insertions, 7953 deletions
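A recurring cleanup in the hunks below is the conversion of printk format specifiers from the deprecated %pf/%pF (function-pointer) forms to %ps/%pS (plain symbol lookup), for example in arch/alpha/kernel/pci_iommu.c, arch/arm/mach-imx/pm-imx6.c, and arch/arm/mm/alignment.c. As a hedged illustration only, not part of the patch, a minimal kernel module using the new specifier might look like this; the module name and messages are invented for the example:

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>

/*
 * Hypothetical demo module: pr_info() with %ps prints the symbol name
 * for a plain text address, which is the form the conversions below
 * switch to now that %pf is being retired.
 */
static int __init ps_demo_init(void)
{
	pr_info("ps_demo: loaded, init called from %ps\n",
		__builtin_return_address(0));
	return 0;
}

static void __exit ps_demo_exit(void)
{
	pr_info("ps_demo: unloaded\n");
}

module_init(ps_demo_init);
module_exit(ps_demo_exit);
MODULE_LICENSE("GPL");

The %pS variant additionally prints the offset into the symbol; the hunks below use %ps where only the caller's name is wanted.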
diff --git a/arch/Kconfig b/arch/Kconfig index 33687dddd86a..5e43fcbad4ca 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -249,6 +249,10 @@ config ARCH_HAS_FORTIFY_SOURCE config ARCH_HAS_SET_MEMORY bool +# Select if arch has all set_direct_map_invalid/default() functions +config ARCH_HAS_SET_DIRECT_MAP + bool + # Select if arch init_task must go in the __init_task_data section config ARCH_TASK_STRUCT_ON_STACK bool @@ -383,7 +387,13 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE config HAVE_RCU_TABLE_FREE bool -config HAVE_RCU_TABLE_INVALIDATE +config HAVE_RCU_TABLE_NO_INVALIDATE + bool + +config HAVE_MMU_GATHER_PAGE_SIZE + bool + +config HAVE_MMU_GATHER_NO_GATHER bool config ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -901,6 +911,15 @@ config HAVE_ARCH_PREL32_RELOCATIONS config ARCH_USE_MEMREMAP_PROT bool +config LOCK_EVENT_COUNTS + bool "Locking event counts collection" + depends on DEBUG_FS + ---help--- + Enable light-weight counting of various locking related events + in the system with minimal performance impact. This reduces + the chance of application behavior change because of timing + differences. The counts are reported via debugfs. + source "kernel/gcov/Kconfig" source "scripts/gcc-plugins/Kconfig" diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 584a6e114853..f7b19b813a70 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -36,6 +36,7 @@ config ALPHA select ODD_RT_SIGACTION select OLD_SIGSUSPEND select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67 + select MMU_GATHER_NO_RANGE help The Alpha is a 64-bit general-purpose processor designed and marketed by the Digital Equipment Corporation of blessed memory, @@ -49,13 +50,6 @@ config MMU bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool default n diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 70b783333965..89e87bbc987f 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -9,6 +9,7 @@ generic-y += irq_work.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += preempt.h generic-y += sections.h generic-y += trace_clock.h diff --git a/arch/alpha/include/asm/io.h b/arch/alpha/include/asm/io.h index 4c533fc94d62..ccf9d65166bb 100644 --- a/arch/alpha/include/asm/io.h +++ b/arch/alpha/include/asm/io.h @@ -513,8 +513,6 @@ extern inline void writeq(u64 b, volatile void __iomem *addr) #define writel_relaxed(b, addr) __raw_writel(b, addr) #define writeq_relaxed(b, addr) __raw_writeq(b, addr) -#define mmiowb() - /* * String version of IO memory access ops: */ diff --git a/arch/alpha/include/asm/rwsem.h b/arch/alpha/include/asm/rwsem.h deleted file mode 100644 index cf8fc8f9a2ed..000000000000 --- a/arch/alpha/include/asm/rwsem.h +++ /dev/null @@ -1,211 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ALPHA_RWSEM_H -#define _ALPHA_RWSEM_H - -/* - * Written by Ivan Kokshaysky <ink@jurassic.park.msu.ru>, 2001. 
- * Based on asm-alpha/semaphore.h and asm-i386/rwsem.h - */ - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ - -#include <linux/compiler.h> - -#define RWSEM_UNLOCKED_VALUE 0x0000000000000000L -#define RWSEM_ACTIVE_BIAS 0x0000000000000001L -#define RWSEM_ACTIVE_MASK 0x00000000ffffffffL -#define RWSEM_WAITING_BIAS (-0x0000000100000000L) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -static inline int ___down_read(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter += RWSEM_ACTIVE_READ_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory"); -#endif - return (oldcount < 0); -} - -static inline void __down_read(struct rw_semaphore *sem) -{ - if (unlikely(___down_read(sem))) - rwsem_down_read_failed(sem); -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (unlikely(___down_read(sem))) - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - - return 0; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline int __down_read_trylock(struct rw_semaphore *sem) -{ - long old, new, res; - - res = atomic_long_read(&sem->count); - do { - new = res + RWSEM_ACTIVE_READ_BIAS; - if (new <= 0) - break; - old = res; - res = atomic_long_cmpxchg(&sem->count, old, new); - } while (res != old); - return res >= 0 ? 1 : 0; -} - -static inline long ___down_write(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter += RWSEM_ACTIVE_WRITE_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); -#endif - return oldcount; -} - -static inline void __down_write(struct rw_semaphore *sem) -{ - if (unlikely(___down_write(sem))) - rwsem_down_write_failed(sem); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (unlikely(___down_write(sem))) { - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - } - - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline int __down_write_trylock(struct rw_semaphore *sem) -{ - long ret = atomic_long_cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); - if (ret == RWSEM_UNLOCKED_VALUE) - return 1; - return 0; -} - -static inline void __up_read(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter -= RWSEM_ACTIVE_READ_BIAS; -#else - long temp; - __asm__ __volatile__( - " mb\n" - "1: ldq_l %0,%1\n" - " subq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_READ_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(oldcount < 0)) - if ((int)oldcount - RWSEM_ACTIVE_READ_BIAS == 0) - rwsem_wake(sem); -} - -static inline void 
__up_write(struct rw_semaphore *sem) -{ - long count; -#ifndef CONFIG_SMP - sem->count.counter -= RWSEM_ACTIVE_WRITE_BIAS; - count = sem->count.counter; -#else - long temp; - __asm__ __volatile__( - " mb\n" - "1: ldq_l %0,%1\n" - " subq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " subq %0,%3,%0\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (count), "=m" (sem->count), "=&r" (temp) - :"Ir" (RWSEM_ACTIVE_WRITE_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(count)) - if ((int)count == 0) - rwsem_wake(sem); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - long oldcount; -#ifndef CONFIG_SMP - oldcount = sem->count.counter; - sem->count.counter -= RWSEM_WAITING_BIAS; -#else - long temp; - __asm__ __volatile__( - "1: ldq_l %0,%1\n" - " addq %0,%3,%2\n" - " stq_c %2,%1\n" - " beq %2,2f\n" - " mb\n" - ".subsection 2\n" - "2: br 1b\n" - ".previous" - :"=&r" (oldcount), "=m" (sem->count), "=&r" (temp) - :"Ir" (-RWSEM_WAITING_BIAS), "m" (sem->count) : "memory"); -#endif - if (unlikely(oldcount < 0)) - rwsem_downgrade_wake(sem); -} - -#endif /* __KERNEL__ */ -#endif /* _ALPHA_RWSEM_H */ diff --git a/arch/alpha/include/asm/tlb.h b/arch/alpha/include/asm/tlb.h index 8f5042b61875..4f79e331af5e 100644 --- a/arch/alpha/include/asm/tlb.h +++ b/arch/alpha/include/asm/tlb.h @@ -2,12 +2,6 @@ #ifndef _ALPHA_TLB_H #define _ALPHA_TLB_H -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c index 3034d6d936d2..242108439f42 100644 --- a/arch/alpha/kernel/pci_iommu.c +++ b/arch/alpha/kernel/pci_iommu.c @@ -249,7 +249,7 @@ static int pci_dac_dma_supported(struct pci_dev *dev, u64 mask) ok = 0; /* If both conditions above are met, we are fine. */ - DBGA("pci_dac_dma_supported %s from %pf\n", + DBGA("pci_dac_dma_supported %s from %ps\n", ok ? "yes" : "no", __builtin_return_address(0)); return ok; @@ -281,7 +281,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, && paddr + size <= __direct_map_size) { ret = paddr + __direct_map_base; - DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] -> direct %llx from %ps\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -292,7 +292,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, if (dac_allowed) { ret = paddr + alpha_mv.pci_dac_offset; - DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] -> DAC %llx from %ps\n", cpu_addr, size, ret, __builtin_return_address(0)); return ret; @@ -329,7 +329,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, size_t size, ret = arena->dma_base + dma_ofs * PAGE_SIZE; ret += (unsigned long)cpu_addr & ~PAGE_MASK; - DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %pf\n", + DBGA2("pci_map_single: [%p,%zx] np %ld -> sg %llx from %ps\n", cpu_addr, size, npages, ret, __builtin_return_address(0)); return ret; @@ -396,14 +396,14 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr, && dma_addr < __direct_map_base + __direct_map_size) { /* Nothing to do. 
*/ - DBGA2("pci_unmap_single: direct [%llx,%zx] from %pf\n", + DBGA2("pci_unmap_single: direct [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); return; } if (dma_addr > 0xffffffff) { - DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %pf\n", + DBGA2("pci64_unmap_single: DAC [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); return; } @@ -435,7 +435,7 @@ static void alpha_pci_unmap_page(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&arena->lock, flags); - DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %pf\n", + DBGA2("pci_unmap_single: sg [%llx,%zx] np %ld from %ps\n", dma_addr, size, npages, __builtin_return_address(0)); } @@ -458,7 +458,7 @@ try_again: cpu_addr = (void *)__get_free_pages(gfp | __GFP_ZERO, order); if (! cpu_addr) { printk(KERN_INFO "pci_alloc_consistent: " - "get_free_pages failed from %pf\n", + "get_free_pages failed from %ps\n", __builtin_return_address(0)); /* ??? Really atomic allocation? Otherwise we could play with vmalloc and sg if we can't find contiguous memory. */ @@ -477,7 +477,7 @@ try_again: goto try_again; } - DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %pf\n", + DBGA2("pci_alloc_consistent: %zx -> [%p,%llx] from %ps\n", size, cpu_addr, *dma_addrp, __builtin_return_address(0)); return cpu_addr; @@ -497,7 +497,7 @@ static void alpha_pci_free_coherent(struct device *dev, size_t size, pci_unmap_single(pdev, dma_addr, size, PCI_DMA_BIDIRECTIONAL); free_pages((unsigned long)cpu_addr, get_order(size)); - DBGA2("pci_free_consistent: [%llx,%zx] from %pf\n", + DBGA2("pci_free_consistent: [%llx,%zx] from %ps\n", dma_addr, size, __builtin_return_address(0)); } diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 63ed39cbd3bd..165f268beafc 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -463,3 +463,7 @@ 532 common getppid sys_getppid # all other architectures have common numbers for new syscall, alpha # is the exception. 
+534 common pidfd_send_signal sys_pidfd_send_signal +535 common io_uring_setup sys_io_uring_setup +536 common io_uring_enter sys_io_uring_enter +537 common io_uring_register sys_io_uring_register diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index c781e45d1d99..23e063df5d2c 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -63,9 +63,6 @@ config SCHED_OMIT_FRAME_POINTER config GENERIC_CSUM def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config ARCH_DISCONTIGMEM_ENABLE def_bool n diff --git a/arch/arc/boot/dts/hsdk.dts b/arch/arc/boot/dts/hsdk.dts index 69bc1c9e8e50..7425bb0f2d1b 100644 --- a/arch/arc/boot/dts/hsdk.dts +++ b/arch/arc/boot/dts/hsdk.dts @@ -18,8 +18,8 @@ model = "snps,hsdk"; compatible = "snps,hsdk"; - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; chosen { bootargs = "earlycon=uart8250,mmio32,0xf0005000,115200n8 console=ttyS0,115200n8 debug print-fatal-signals=1"; @@ -105,7 +105,7 @@ #size-cells = <1>; interrupt-parent = <&idu_intc>; - ranges = <0x00000000 0xf0000000 0x10000000>; + ranges = <0x00000000 0x0 0xf0000000 0x10000000>; cgu_rst: reset-controller@8a0 { compatible = "snps,hsdk-reset"; @@ -269,9 +269,10 @@ }; memory@80000000 { - #address-cells = <1>; - #size-cells = <1>; + #address-cells = <2>; + #size-cells = <2>; device_type = "memory"; - reg = <0x80000000 0x40000000>; /* 1 GiB */ + reg = <0x0 0x80000000 0x0 0x40000000>; /* 1 GB lowmem */ + /* 0x1 0x00000000 0x0 0x40000000>; 1 GB highmem */ }; }; diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index decc306a3b52..393d4f5e1450 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/arc/include/asm/tlb.h b/arch/arc/include/asm/tlb.h index a9db5f62aaf3..90cac97643a4 100644 --- a/arch/arc/include/asm/tlb.h +++ b/arch/arc/include/asm/tlb.h @@ -9,38 +9,6 @@ #ifndef _ASM_ARC_TLB_H #define _ASM_ARC_TLB_H -#define tlb_flush(tlb) \ -do { \ - if (tlb->fullmm) \ - flush_tlb_mm((tlb)->mm); \ -} while (0) - -/* - * This pair is called at time of munmap/exit to flush cache and TLB entries - * for mappings being torn down. 
- * 1) cache-flush part -implemented via tlb_start_vma( ) for VIPT aliasing D$ - * 2) tlb-flush part - implemted via tlb_end_vma( ) flushes the TLB range - * - * Note, read http://lkml.org/lkml/2004/1/15/6 - */ -#ifndef CONFIG_ARC_CACHE_VIPT_ALIASING -#define tlb_start_vma(tlb, vma) -#else -#define tlb_start_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while(0) -#endif - -#define tlb_end_vma(tlb, vma) \ -do { \ - if (!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, ptep, address) - #include <linux/pagemap.h> #include <asm-generic/tlb.h> diff --git a/arch/arc/lib/memset-archs.S b/arch/arc/lib/memset-archs.S index f230bb7092fd..b3373f5c88e0 100644 --- a/arch/arc/lib/memset-archs.S +++ b/arch/arc/lib/memset-archs.S @@ -30,10 +30,10 @@ #else -.macro PREALLOC_INSTR +.macro PREALLOC_INSTR reg, off .endm -.macro PREFETCHW_INSTR +.macro PREFETCHW_INSTR reg, off .endm #endif diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 4135abec3fb0..63e6e6504699 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -113,10 +113,24 @@ static void read_decode_cache_bcr_arcv2(int cpu) } READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c) + if (cbcr.c) { ioc_exists = 1; - else + + /* + * As for today we don't support both IOC and ZONE_HIGHMEM enabled + * simultaneously. This happens because as of today IOC aperture covers + * only ZONE_NORMAL (low mem) and any dma transactions outside this + * region won't be HW coherent. + * If we want to use both IOC and ZONE_HIGHMEM we can use + * bounce_buffer to handle dma transactions to HIGHMEM. + * Also it is possible to modify dma_direct cache ops or increase IOC + * aperture size if we are planning to use HIGHMEM without PAE. + */ + if (IS_ENABLED(CONFIG_HIGHMEM) || is_pae40_enabled()) + ioc_enable = 0; + } else { ioc_enable = 0; + } /* HS 2.0 didn't have AUX_VOL */ if (cpuinfo_arc700[cpu].core.family > 0x51) { @@ -1158,19 +1172,6 @@ noinline void __init arc_ioc_setup(void) if (!ioc_enable) return; - /* - * As for today we don't support both IOC and ZONE_HIGHMEM enabled - * simultaneously. This happens because as of today IOC aperture covers - * only ZONE_NORMAL (low mem) and any dma transactions outside this - * region won't be HW coherent. - * If we want to use both IOC and ZONE_HIGHMEM we can use - * bounce_buffer to handle dma transactions to HIGHMEM. - * Also it is possible to modify dma_direct cache ops or increase IOC - * aperture size if we are planning to use HIGHMEM without PAE. 
- */ - if (IS_ENABLED(CONFIG_HIGHMEM)) - panic("IOC and HIGHMEM can't be used simultaneously"); - /* Flush + invalidate + disable L1 dcache */ __dc_disable(); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 850b4805e2d1..dc9855c4a3b4 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -73,7 +73,7 @@ config ARM select HAVE_EFFICIENT_UNALIGNED_ACCESS if (CPU_V6 || CPU_V6K || CPU_V7) && MMU select HAVE_EXIT_THREAD select HAVE_FTRACE_MCOUNT_RECORD if !XIP_KERNEL - select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL + select HAVE_FUNCTION_GRAPH_TRACER if !THUMB2_KERNEL && !CC_IS_CLANG select HAVE_FUNCTION_TRACER if !XIP_KERNEL select HAVE_GCC_PLUGINS select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7) @@ -178,10 +178,6 @@ config TRACE_IRQFLAGS_SUPPORT bool default !CPU_V7M -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 6d6e0330930b..e388af4594a6 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -47,8 +47,8 @@ config DEBUG_WX choice prompt "Choose kernel unwinder" - default UNWINDER_ARM if AEABI && !FUNCTION_GRAPH_TRACER - default UNWINDER_FRAME_POINTER if !AEABI || FUNCTION_GRAPH_TRACER + default UNWINDER_ARM if AEABI + default UNWINDER_FRAME_POINTER if !AEABI help This determines which method will be used for unwinding kernel stack traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack, @@ -65,7 +65,7 @@ config UNWINDER_FRAME_POINTER config UNWINDER_ARM bool "ARM EABI stack unwinder" - depends on AEABI + depends on AEABI && !FUNCTION_GRAPH_TRACER select ARM_UNWIND help This option enables stack unwinding support in the kernel diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 6c7ccb428c07..7135820f76d4 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -1438,7 +1438,21 @@ ENTRY(efi_stub_entry) @ Preserve return value of efi_entry() in r4 mov r4, r0 - bl cache_clean_flush + + @ our cache maintenance code relies on CP15 barrier instructions + @ but since we arrived here with the MMU and caches configured + @ by UEFI, we must check that the CP15BEN bit is set in SCTLR. + @ Note that this bit is RAO/WI on v6 and earlier, so the ISB in + @ the enable path will be executed on v7+ only. + mrc p15, 0, r1, c1, c0, 0 @ read SCTLR + tst r1, #(1 << 5) @ CP15BEN bit set? 
+ bne 0f + orr r1, r1, #(1 << 5) @ CP15 barrier instructions + mcr p15, 0, r1, c1, c0, 0 @ write SCTLR + ARM( .inst 0xf57ff06f @ v7+ isb ) + THUMB( isb ) + +0: bl cache_clean_flush bl cache_off @ Set parameters for booting zImage according to boot protocol diff --git a/arch/arm/crypto/aes-neonbs-glue.c b/arch/arm/crypto/aes-neonbs-glue.c index 07e31941dc67..617c2c99ebfb 100644 --- a/arch/arm/crypto/aes-neonbs-glue.c +++ b/arch/arm/crypto/aes-neonbs-glue.c @@ -278,6 +278,8 @@ static int __xts_crypt(struct skcipher_request *req, int err; err = skcipher_walk_virt(&walk, req, true); + if (err) + return err; crypto_cipher_encrypt_one(ctx->tweak_tfm, walk.iv, walk.iv); diff --git a/arch/arm/crypto/chacha-neon-glue.c b/arch/arm/crypto/chacha-neon-glue.c index 9d6fda81986d..48a89537b828 100644 --- a/arch/arm/crypto/chacha-neon-glue.c +++ b/arch/arm/crypto/chacha-neon-glue.c @@ -21,6 +21,7 @@ #include <crypto/algapi.h> #include <crypto/chacha.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> @@ -93,7 +94,7 @@ static int chacha_neon(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_chacha_crypt(req); return chacha_neon_stream_xor(req, ctx, req->iv); @@ -107,7 +108,7 @@ static int xchacha_neon(struct skcipher_request *req) u32 state[16]; u8 real_iv[16]; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_xchacha_crypt(req); crypto_chacha_init(state, ctx, req->iv); diff --git a/arch/arm/crypto/crc32-ce-glue.c b/arch/arm/crypto/crc32-ce-glue.c index cd9e93b46c2d..e712c2a7d387 100644 --- a/arch/arm/crypto/crc32-ce-glue.c +++ b/arch/arm/crypto/crc32-ce-glue.c @@ -16,6 +16,7 @@ #include <linux/string.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <asm/hwcap.h> #include <asm/neon.h> @@ -113,7 +114,7 @@ static int crc32_pmull_update(struct shash_desc *desc, const u8 *data, u32 *crc = shash_desc_ctx(desc); unsigned int l; - if (may_use_simd()) { + if (crypto_simd_usable()) { if ((u32)data % SCALE_F) { l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F)); @@ -147,7 +148,7 @@ static int crc32c_pmull_update(struct shash_desc *desc, const u8 *data, u32 *crc = shash_desc_ctx(desc); unsigned int l; - if (may_use_simd()) { + if (crypto_simd_usable()) { if ((u32)data % SCALE_F) { l = min_t(u32, length, SCALE_F - ((u32)data % SCALE_F)); diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c index 3d6b800b8396..3b24f2872592 100644 --- a/arch/arm/crypto/crct10dif-ce-glue.c +++ b/arch/arm/crypto/crct10dif-ce-glue.c @@ -15,6 +15,7 @@ #include <linux/string.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <asm/neon.h> #include <asm/simd.h> @@ -36,7 +37,7 @@ static int crct10dif_update(struct shash_desc *desc, const u8 *data, { u16 *crc = shash_desc_ctx(desc); - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); *crc = crc_t10dif_pmull(*crc, data, length); kernel_neon_end(); diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c index b7d30b6cf49c..39d1ccec1aab 100644 --- a/arch/arm/crypto/ghash-ce-glue.c 
+++ b/arch/arm/crypto/ghash-ce-glue.c @@ -14,6 +14,7 @@ #include <asm/unaligned.h> #include <crypto/cryptd.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/gf128mul.h> #include <linux/cpufeature.h> #include <linux/crypto.h> @@ -185,7 +186,6 @@ static int ghash_async_init(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return crypto_shash_init(desc); } @@ -196,7 +196,7 @@ static int ghash_async_update(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -214,7 +214,7 @@ static int ghash_async_final(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -232,7 +232,7 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!may_use_simd() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -242,7 +242,6 @@ static int ghash_async_digest(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return shash_ahash_digest(req, desc); } } @@ -255,7 +254,6 @@ static int ghash_async_import(struct ahash_request *req, const void *in) struct shash_desc *desc = cryptd_shash_desc(cryptd_req); desc->tfm = cryptd_ahash_child(ctx->cryptd_tfm); - desc->flags = req->base.flags; return crypto_shash_import(desc, in); } diff --git a/arch/arm/crypto/nhpoly1305-neon-glue.c b/arch/arm/crypto/nhpoly1305-neon-glue.c index 49aae87cb2bc..ae5aefc44a4d 100644 --- a/arch/arm/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm/crypto/nhpoly1305-neon-glue.c @@ -9,6 +9,7 @@ #include <asm/neon.h> #include <asm/simd.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> @@ -25,7 +26,7 @@ static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_neon_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !may_use_simd()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c index b732522e20f8..4c6c6900853c 100644 --- a/arch/arm/crypto/sha1-ce-glue.c +++ b/arch/arm/crypto/sha1-ce-glue.c @@ -9,6 +9,7 @@ */ #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> #include <linux/cpufeature.h> @@ -33,7 +34,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return sha1_update_arm(desc, data, len); @@ -47,7 +48,7 @@ 
static int sha1_ce_update(struct shash_desc *desc, const u8 *data, static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha1_finup_arm(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c index d15e0ea2c95e..d6c95c213d42 100644 --- a/arch/arm/crypto/sha1_neon_glue.c +++ b/arch/arm/crypto/sha1_neon_glue.c @@ -19,6 +19,7 @@ */ #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> @@ -39,7 +40,7 @@ static int sha1_neon_update(struct shash_desc *desc, const u8 *data, { struct sha1_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return sha1_update_arm(desc, data, len); @@ -54,7 +55,7 @@ static int sha1_neon_update(struct shash_desc *desc, const u8 *data, static int sha1_neon_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha1_finup_arm(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c index 1211a5c129fc..a47a9d4b663e 100644 --- a/arch/arm/crypto/sha2-ce-glue.c +++ b/arch/arm/crypto/sha2-ce-glue.c @@ -9,6 +9,7 @@ */ #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> #include <linux/cpufeature.h> @@ -34,7 +35,7 @@ static int sha2_ce_update(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_arm_update(desc, data, len); @@ -49,7 +50,7 @@ static int sha2_ce_update(struct shash_desc *desc, const u8 *data, static int sha2_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha256_arm_finup(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c index 1d82c6cd31a4..f3f6b1624fc3 100644 --- a/arch/arm/crypto/sha256_neon_glue.c +++ b/arch/arm/crypto/sha256_neon_glue.c @@ -15,6 +15,7 @@ */ #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/cryptohash.h> #include <linux/types.h> #include <linux/string.h> @@ -34,7 +35,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_arm_update(desc, data, len); @@ -49,7 +50,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, static int sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha256_arm_finup(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c index 8a5642b41fd6..d33ab59c26c0 100644 --- a/arch/arm/crypto/sha512-neon-glue.c +++ b/arch/arm/crypto/sha512-neon-glue.c @@ -9,6 +9,7 @@ */ #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha512_base.h> 
#include <linux/crypto.h> @@ -30,7 +31,7 @@ static int sha512_neon_update(struct shash_desc *desc, const u8 *data, { struct sha512_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd() || + if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return sha512_arm_update(desc, data, len); @@ -45,7 +46,7 @@ static int sha512_neon_update(struct shash_desc *desc, const u8 *data, static int sha512_neon_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha512_arm_finup(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index a8a4eb7f6dae..41deac2451af 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -9,10 +9,10 @@ generic-y += kdebug.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += parport.h generic-y += preempt.h -generic-y += rwsem.h generic-y += seccomp.h generic-y += segment.h generic-y += serial.h diff --git a/arch/arm/include/asm/arch_timer.h b/arch/arm/include/asm/arch_timer.h index 0a8d7bba2cb0..4b66ecd6be99 100644 --- a/arch/arm/include/asm/arch_timer.h +++ b/arch/arm/include/asm/arch_timer.h @@ -11,6 +11,10 @@ #include <clocksource/arm_arch_timer.h> #ifdef CONFIG_ARM_ARCH_TIMER +/* 32bit ARM doesn't know anything about timer errata... */ +#define has_erratum_handler(h) (false) +#define erratum_handler(h) (arch_timer_##h) + int arch_timer_arch_init(void); /* @@ -79,7 +83,7 @@ static inline u32 arch_timer_get_cntfrq(void) return val; } -static inline u64 arch_counter_get_cntpct(void) +static inline u64 __arch_counter_get_cntpct(void) { u64 cval; @@ -88,7 +92,12 @@ static inline u64 arch_counter_get_cntpct(void) return cval; } -static inline u64 arch_counter_get_cntvct(void) +static inline u64 __arch_counter_get_cntpct_stable(void) +{ + return __arch_counter_get_cntpct(); +} + +static inline u64 __arch_counter_get_cntvct(void) { u64 cval; @@ -97,6 +106,11 @@ static inline u64 arch_counter_get_cntvct(void) return cval; } +static inline u64 __arch_counter_get_cntvct_stable(void) +{ + return __arch_counter_get_cntvct(); +} + static inline u32 arch_timer_get_cntkctl(void) { u32 cntkctl; diff --git a/arch/arm/include/asm/cp15.h b/arch/arm/include/asm/cp15.h index 07e27f212dc7..d2453e2d3f1f 100644 --- a/arch/arm/include/asm/cp15.h +++ b/arch/arm/include/asm/cp15.h @@ -68,6 +68,8 @@ #define BPIALL __ACCESS_CP15(c7, 0, c5, 6) #define ICIALLU __ACCESS_CP15(c7, 0, c5, 0) +#define CNTVCT __ACCESS_CP15_64(1, c14) + extern unsigned long cr_alignment; /* defined in entry-armv.S */ static inline unsigned long get_cr(void) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 6b51826ab3d1..7e22c81398c4 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -281,8 +281,6 @@ extern void _memcpy_fromio(void *, const volatile void __iomem *, size_t); extern void _memcpy_toio(volatile void __iomem *, const void *, size_t); extern void _memset_io(volatile void __iomem *, int, size_t); -#define mmiowb() - /* * Memory access primitives * ------------------------ diff --git a/arch/arm/include/asm/stage2_pgtable.h b/arch/arm/include/asm/stage2_pgtable.h index 9e11dce55e06..9587517649bd 100644 --- a/arch/arm/include/asm/stage2_pgtable.h +++ b/arch/arm/include/asm/stage2_pgtable.h @@ -32,14 +32,14 @@ #define stage2_pgd_present(kvm, pgd) pgd_present(pgd) #define 
stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) #define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) -#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) +#define stage2_pud_free(kvm, pud) do { } while (0) #define stage2_pud_none(kvm, pud) pud_none(pud) #define stage2_pud_clear(kvm, pud) pud_clear(pud) #define stage2_pud_present(kvm, pud) pud_present(pud) #define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) #define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) -#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) +#define stage2_pmd_free(kvm, pmd) free_page((unsigned long)pmd) #define stage2_pud_huge(kvm, pud) pud_huge(pud) diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index f854148c8d7c..bc6d04a09899 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -33,271 +33,42 @@ #include <asm/pgalloc.h> #include <asm/tlbflush.h> -#define MMU_GATHER_BUNDLE 8 - -#ifdef CONFIG_HAVE_RCU_TABLE_FREE static inline void __tlb_remove_table(void *_table) { free_page_and_swap_cache((struct page *)_table); } -struct mmu_table_batch { - struct rcu_head rcu; - unsigned int nr; - void *tables[0]; -}; - -#define MAX_TABLE_BATCH \ - ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) - -extern void tlb_table_flush(struct mmu_gather *tlb); -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -#define tlb_remove_entry(tlb, entry) tlb_remove_table(tlb, entry) -#else -#define tlb_remove_entry(tlb, entry) tlb_remove_page(tlb, entry) -#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ - -/* - * TLB handling. This allows us to remove pages from the page - * tables, and efficiently handle the TLB issues. - */ -struct mmu_gather { - struct mm_struct *mm; -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - struct mmu_table_batch *batch; - unsigned int need_flush; -#endif - unsigned int fullmm; - struct vm_area_struct *vma; - unsigned long start, end; - unsigned long range_start; - unsigned long range_end; - unsigned int nr; - unsigned int max; - struct page **pages; - struct page *local[MMU_GATHER_BUNDLE]; -}; - -DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); - -/* - * This is unnecessarily complex. There's three ways the TLB shootdown - * code is used: - * 1. Unmapping a range of vmas. See zap_page_range(), unmap_region(). - * tlb->fullmm = 0, and tlb_start_vma/tlb_end_vma will be called. - * tlb->vma will be non-NULL. - * 2. Unmapping all vmas. See exit_mmap(). - * tlb->fullmm = 1, and tlb_start_vma/tlb_end_vma will be called. - * tlb->vma will be non-NULL. Additionally, page tables will be freed. - * 3. Unmapping argument pages. See shift_arg_pages(). - * tlb->fullmm = 0, but tlb_start_vma/tlb_end_vma will not be called. - * tlb->vma will be NULL. 
- */ -static inline void tlb_flush(struct mmu_gather *tlb) -{ - if (tlb->fullmm || !tlb->vma) - flush_tlb_mm(tlb->mm); - else if (tlb->range_end > 0) { - flush_tlb_range(tlb->vma, tlb->range_start, tlb->range_end); - tlb->range_start = TASK_SIZE; - tlb->range_end = 0; - } -} - -static inline void tlb_add_flush(struct mmu_gather *tlb, unsigned long addr) -{ - if (!tlb->fullmm) { - if (addr < tlb->range_start) - tlb->range_start = addr; - if (addr + PAGE_SIZE > tlb->range_end) - tlb->range_end = addr + PAGE_SIZE; - } -} - -static inline void __tlb_alloc_page(struct mmu_gather *tlb) -{ - unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); - - if (addr) { - tlb->pages = (void *)addr; - tlb->max = PAGE_SIZE / sizeof(struct page *); - } -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - tlb_flush(tlb); -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb_table_flush(tlb); -#endif -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - free_pages_and_swap_cache(tlb->pages, tlb->nr); - tlb->nr = 0; - if (tlb->pages == tlb->local) - __tlb_alloc_page(tlb); -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->fullmm = !(start | (end+1)); - tlb->start = start; - tlb->end = end; - tlb->vma = NULL; - tlb->max = ARRAY_SIZE(tlb->local); - tlb->pages = tlb->local; - tlb->nr = 0; - __tlb_alloc_page(tlb); +#include <asm-generic/tlb.h> -#ifdef CONFIG_HAVE_RCU_TABLE_FREE - tlb->batch = NULL; +#ifndef CONFIG_HAVE_RCU_TABLE_FREE +#define tlb_remove_table(tlb, entry) tlb_remove_page(tlb, entry) #endif -} - -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->range_start = start; - tlb->range_end = end; - } - - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - - if (tlb->pages != tlb->local) - free_pages((unsigned long)tlb->pages, 0); -} - -/* - * Memorize the range for the TLB flush. - */ -static inline void -tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr) -{ - tlb_add_flush(tlb, addr); -} - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) -/* - * In the case of tlb vma handling, we can optimise these away in the - * case where we're doing a full MM flush. When we're doing a munmap, - * the vmas are adjusted to only cover the region to be torn down. 
- */ -static inline void -tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) { - flush_cache_range(vma, vma->vm_start, vma->vm_end); - tlb->vma = vma; - tlb->range_start = TASK_SIZE; - tlb->range_end = 0; - } -} static inline void -tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) - tlb_flush(tlb); -} - -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->pages[tlb->nr++] = page; - VM_WARN_ON(tlb->nr > tlb->max); - if (tlb->nr == tlb->max) - return true; - return false; -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - if (__tlb_remove_page(tlb, page)) - tlb_flush_mmu(tlb); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, - unsigned long addr) +__pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, unsigned long addr) { pgtable_page_dtor(pte); -#ifdef CONFIG_ARM_LPAE - tlb_add_flush(tlb, addr); -#else +#ifndef CONFIG_ARM_LPAE /* * With the classic ARM MMU, a pte page has two corresponding pmd * entries, each covering 1MB. */ - addr &= PMD_MASK; - tlb_add_flush(tlb, addr + SZ_1M - PAGE_SIZE); - tlb_add_flush(tlb, addr + SZ_1M); + addr = (addr & PMD_MASK) + SZ_1M; + __tlb_adjust_range(tlb, addr - PAGE_SIZE, 2 * PAGE_SIZE); #endif - tlb_remove_entry(tlb, pte); -} - -static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, - unsigned long addr) -{ -#ifdef CONFIG_ARM_LPAE - tlb_add_flush(tlb, addr); - tlb_remove_entry(tlb, virt_to_page(pmdp)); -#endif + tlb_remove_table(tlb, pte); } static inline void -tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) -{ - tlb_add_flush(tlb, addr); -} - -#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) -#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) -#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) - -#define tlb_migrate_finish(mm) do { } while (0) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) +__pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { -} - -static inline void tlb_flush_remove_tables(struct mm_struct *mm) -{ -} +#ifdef CONFIG_ARM_LPAE + struct page *page = virt_to_page(pmdp); -static inline void tlb_flush_remove_tables_local(void *arg) -{ + tlb_remove_table(tlb, page); +#endif } #endif /* CONFIG_MMU */ diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index c08d2d890f7b..b38bbd011b35 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -133,9 +133,9 @@ __secondary_data: */ .text __after_proc_init: -#ifdef CONFIG_ARM_MPU M_CLASS(movw r12, #:lower16:BASEADDR_V7M_SCB) M_CLASS(movt r12, #:upper16:BASEADDR_V7M_SCB) +#ifdef CONFIG_ARM_MPU M_CLASS(ldr r3, [r12, 0x50]) AR_CLASS(mrc p15, 0, r3, c0, c1, 4) @ Read ID_MMFR0 and r3, r3, #(MMFR0_PMSA) @ PMSA field diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 76bb8de6bf6b..be5edfdde558 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -549,8 +549,7 @@ static void handle_signal(struct ksignal 
*ksig, struct pt_regs *regs) int ret; /* - * Increment event counter and perform fixup for the pre-signal - * frame. + * Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index a56e7c856ab5..86870f40f9a0 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -115,8 +115,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, * running on another CPU? For now, ignore it as we * can't guarantee we won't explode. */ - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; return; #else frame.fp = thread_saved_fp(tsk); @@ -134,8 +132,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, } walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) @@ -153,8 +149,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) frame.pc = regs->ARM_pc; walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) diff --git a/arch/arm/mach-ep93xx/edb93xx.c b/arch/arm/mach-ep93xx/edb93xx.c index 8e89ec8b6f0f..34e18e9556d9 100644 --- a/arch/arm/mach-ep93xx/edb93xx.c +++ b/arch/arm/mach-ep93xx/edb93xx.c @@ -29,6 +29,7 @@ #include <linux/platform_device.h> #include <linux/i2c.h> #include <linux/spi/spi.h> +#include <linux/gpio/machine.h> #include <sound/cs4271.h> @@ -105,13 +106,16 @@ static struct spi_board_info edb93xx_spi_board_info[] __initdata = { }, }; -static int edb93xx_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_EGPIO6, +static struct gpiod_lookup_table edb93xx_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP("A", 6, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info edb93xx_spi_info __initdata = { - .chipselect = edb93xx_spi_chipselects, - .num_chipselect = ARRAY_SIZE(edb93xx_spi_chipselects), + /* Intentionally left blank */ }; static void __init edb93xx_register_spi(void) @@ -123,6 +127,7 @@ static void __init edb93xx_register_spi(void) else if (machine_is_edb9315a()) edb93xx_cs4271_data.gpio_nreset = EP93XX_GPIO_LINE_EGPIO14; + gpiod_add_lookup_table(&edb93xx_spi_cs_gpio_table); ep93xx_register_spi(&edb93xx_spi_info, edb93xx_spi_board_info, ARRAY_SIZE(edb93xx_spi_board_info)); } diff --git a/arch/arm/mach-ep93xx/simone.c b/arch/arm/mach-ep93xx/simone.c index 80ccb984d521..f0f38c0dba52 100644 --- a/arch/arm/mach-ep93xx/simone.c +++ b/arch/arm/mach-ep93xx/simone.c @@ -77,13 +77,15 @@ static struct spi_board_info simone_spi_devices[] __initdata = { * low between multi-message command blocks. From v1.4, it uses a GPIO instead. * v1.3 parts will still work, since the signal on SFRMOUT is automatic. 
*/ -static int simone_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_EGPIO1, +static struct gpiod_lookup_table simone_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP("A", 1, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info simone_spi_info __initdata = { - .chipselect = simone_spi_chipselects, - .num_chipselect = ARRAY_SIZE(simone_spi_chipselects), .use_dma = 1, }; @@ -113,6 +115,7 @@ static void __init simone_init_machine(void) ep93xx_register_i2c(simone_i2c_board_info, ARRAY_SIZE(simone_i2c_board_info)); gpiod_add_lookup_table(&simone_mmc_spi_gpio_table); + gpiod_add_lookup_table(&simone_spi_cs_gpio_table); ep93xx_register_spi(&simone_spi_info, simone_spi_devices, ARRAY_SIZE(simone_spi_devices)); simone_register_audio(); diff --git a/arch/arm/mach-ep93xx/ts72xx.c b/arch/arm/mach-ep93xx/ts72xx.c index 85b74ac943f0..a3a20c83c6b8 100644 --- a/arch/arm/mach-ep93xx/ts72xx.c +++ b/arch/arm/mach-ep93xx/ts72xx.c @@ -22,6 +22,7 @@ #include <linux/spi/mmc_spi.h> #include <linux/mmc/host.h> #include <linux/platform_data/spi-ep93xx.h> +#include <linux/gpio/machine.h> #include <mach/gpio-ep93xx.h> #include <mach/hardware.h> @@ -269,13 +270,15 @@ static struct spi_board_info bk3_spi_board_info[] __initdata = { * The all work is performed automatically by !SPI_FRAME (SFRM1) and * goes through CPLD */ -static int bk3_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_F(3), +static struct gpiod_lookup_table bk3_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP("F", 3, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info bk3_spi_master __initdata = { - .chipselect = bk3_spi_chipselects, - .num_chipselect = ARRAY_SIZE(bk3_spi_chipselects), .use_dma = 1, }; @@ -316,13 +319,17 @@ static struct spi_board_info ts72xx_spi_devices[] __initdata = { }, }; -static int ts72xx_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_F(2), /* DIO_17 */ +static struct gpiod_lookup_table ts72xx_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + /* DIO_17 */ + GPIO_LOOKUP("F", 2, "cs", GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info ts72xx_spi_info __initdata = { - .chipselect = ts72xx_spi_chipselects, - .num_chipselect = ARRAY_SIZE(ts72xx_spi_chipselects), + /* Intentionally left blank */ }; static void __init ts72xx_init_machine(void) @@ -339,6 +346,7 @@ static void __init ts72xx_init_machine(void) if (board_is_ts7300()) platform_device_register(&ts73xx_fpga_device); #endif + gpiod_add_lookup_table(&ts72xx_spi_cs_gpio_table); ep93xx_register_spi(&ts72xx_spi_info, ts72xx_spi_devices, ARRAY_SIZE(ts72xx_spi_devices)); } @@ -398,6 +406,7 @@ static void __init bk3_init_machine(void) ep93xx_register_eth(&ts72xx_eth_data, 1); + gpiod_add_lookup_table(&bk3_spi_cs_gpio_table); ep93xx_register_spi(&bk3_spi_master, bk3_spi_board_info, ARRAY_SIZE(bk3_spi_board_info)); diff --git a/arch/arm/mach-ep93xx/vision_ep9307.c b/arch/arm/mach-ep93xx/vision_ep9307.c index 767ee64628dc..f95a644769e4 100644 --- a/arch/arm/mach-ep93xx/vision_ep9307.c +++ b/arch/arm/mach-ep93xx/vision_ep9307.c @@ -245,15 +245,17 @@ static struct spi_board_info vision_spi_board_info[] __initdata = { }, }; -static int vision_spi_chipselects[] __initdata = { - EP93XX_GPIO_LINE_EGPIO6, - EP93XX_GPIO_LINE_EGPIO7, - EP93XX_GPIO_LINE_G(2), +static struct gpiod_lookup_table vision_spi_cs_gpio_table = { + .dev_id = "ep93xx-spi.0", + .table = { + GPIO_LOOKUP_IDX("A", 6, "cs", 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("A", 7, "cs", 1, GPIO_ACTIVE_LOW), + 
GPIO_LOOKUP_IDX("G", 2, "cs", 2, GPIO_ACTIVE_LOW), + { }, + }, }; static struct ep93xx_spi_info vision_spi_master __initdata = { - .chipselect = vision_spi_chipselects, - .num_chipselect = ARRAY_SIZE(vision_spi_chipselects), .use_dma = 1, }; @@ -295,6 +297,7 @@ static void __init vision_init_machine(void) ep93xx_register_i2c(vision_i2c_info, ARRAY_SIZE(vision_i2c_info)); gpiod_add_lookup_table(&vision_spi_mmc_gpio_table); + gpiod_add_lookup_table(&vision_spi_cs_gpio_table); ep93xx_register_spi(&vision_spi_master, vision_spi_board_info, ARRAY_SIZE(vision_spi_board_info)); vision_register_i2s(); diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 87f45b926c78..e67e0b2d4ce0 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -631,7 +631,7 @@ static void imx6_pm_stby_poweroff(void) static int imx6_pm_stby_poweroff_probe(void) { if (pm_power_off) { - pr_warn("%s: pm_power_off already claimed %p %pf!\n", + pr_warn("%s: pm_power_off already claimed %p %ps!\n", __func__, pm_power_off, pm_power_off); return -EBUSY; } diff --git a/arch/arm/mm/alignment.c b/arch/arm/mm/alignment.c index b54f8f8def36..e376883ab35b 100644 --- a/arch/arm/mm/alignment.c +++ b/arch/arm/mm/alignment.c @@ -133,7 +133,7 @@ static const char *usermode_action[] = { static int alignment_proc_show(struct seq_file *m, void *v) { seq_printf(m, "User:\t\t%lu\n", ai_user); - seq_printf(m, "System:\t\t%lu (%pF)\n", ai_sys, ai_sys_last_pc); + seq_printf(m, "System:\t\t%lu (%pS)\n", ai_sys, ai_sys_last_pc); seq_printf(m, "Skipped:\t%lu\n", ai_skipped); seq_printf(m, "Half:\t\t%lu\n", ai_half); seq_printf(m, "Word:\t\t%lu\n", ai_word); diff --git a/arch/arm/nwfpe/fpmodule.c b/arch/arm/nwfpe/fpmodule.c index 1365e8650843..ee34c76e6624 100644 --- a/arch/arm/nwfpe/fpmodule.c +++ b/arch/arm/nwfpe/fpmodule.c @@ -147,7 +147,7 @@ void float_raise(signed char flags) #ifdef CONFIG_DEBUG_USER if (flags & debug) printk(KERN_DEBUG - "NWFPE: %s[%d] takes exception %08x at %pf from %08lx\n", + "NWFPE: %s[%d] takes exception %08x at %ps from %08lx\n", current->comm, current->pid, flags, __builtin_return_address(0), GET_USERREG()->ARM_pc); #endif diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 9016f4081bb9..0393917eaa57 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -437,3 +437,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/arm/vdso/vgettimeofday.c b/arch/arm/vdso/vgettimeofday.c index a9dd619c6c29..7bdbf5d5c47d 100644 --- a/arch/arm/vdso/vgettimeofday.c +++ b/arch/arm/vdso/vgettimeofday.c @@ -18,9 +18,9 @@ #include <linux/compiler.h> #include <linux/hrtimer.h> #include <linux/time.h> -#include <asm/arch_timer.h> #include <asm/barrier.h> #include <asm/bug.h> +#include <asm/cp15.h> #include <asm/page.h> #include <asm/unistd.h> #include <asm/vdso_datapage.h> @@ -123,7 +123,8 @@ static notrace u64 get_ns(struct vdso_data *vdata) u64 cycle_now; u64 nsec; - cycle_now = arch_counter_get_cntvct(); + isb(); + cycle_now = read_sysreg(CNTVCT); cycle_delta = (cycle_now - vdata->cs_cycle_last) & vdata->cs_mask; diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7e34b9eba5de..df350f4e1e7a 100644 --- a/arch/arm64/Kconfig +++ 
b/arch/arm64/Kconfig @@ -90,6 +90,7 @@ config ARM64 select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_MULTI_HANDLER @@ -148,8 +149,8 @@ config ARM64 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_RCU_TABLE_FREE - select HAVE_RCU_TABLE_INVALIDATE select HAVE_RSEQ select HAVE_STACKPROTECTOR select HAVE_SYSCALL_TRACEPOINTS @@ -237,9 +238,6 @@ config LOCKDEP_SUPPORT config TRACE_IRQFLAGS_SUPPORT def_bool y -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_BUG def_bool y depends on BUG @@ -297,7 +295,7 @@ menu "Kernel Features" menu "ARM errata workarounds via the alternatives framework" config ARM64_WORKAROUND_CLEAN_CACHE - def_bool n + bool config ARM64_ERRATUM_826319 bool "Cortex-A53: 826319: System might deadlock if a write cannot complete until read data is accepted" @@ -464,26 +462,28 @@ config ARM64_ERRATUM_1024718 bool "Cortex-A55: 1024718: Update of DBM/AP bits without break before make might result in incorrect update" default y help - This option adds work around for Arm Cortex-A55 Erratum 1024718. + This option adds a workaround for ARM Cortex-A55 Erratum 1024718. Affected Cortex-A55 cores (r0p0, r0p1, r1p0) could cause incorrect update of the hardware dirty bit when the DBM/AP bits are updated - without a break-before-make. The work around is to disable the usage + without a break-before-make. The workaround is to disable the usage of hardware DBM locally on the affected cores. CPUs not affected by - erratum will continue to use the feature. + this erratum will continue to use the feature. If unsure, say Y. config ARM64_ERRATUM_1188873 - bool "Cortex-A76: MRC read following MRRC read of specific Generic Timer in AArch32 might give incorrect result" + bool "Cortex-A76/Neoverse-N1: MRC read following MRRC read of specific Generic Timer in AArch32 might give incorrect result" default y + depends on COMPAT select ARM_ARCH_TIMER_OOL_WORKAROUND help - This option adds work arounds for ARM Cortex-A76 erratum 1188873 + This option adds a workaround for ARM Cortex-A76/Neoverse-N1 + erratum 1188873. - Affected Cortex-A76 cores (r0p0, r1p0, r2p0) could cause - register corruption when accessing the timer registers from - AArch32 userspace. + Affected Cortex-A76/Neoverse-N1 cores (r0p0, r1p0, r2p0) could + cause register corruption when accessing the timer registers + from AArch32 userspace. If unsure, say Y. @@ -491,7 +491,7 @@ config ARM64_ERRATUM_1165522 bool "Cortex-A76: Speculative AT instruction using out-of-context translation regime could cause subsequent request to generate an incorrect translation" default y help - This option adds work arounds for ARM Cortex-A76 erratum 1165522 + This option adds a workaround for ARM Cortex-A76 erratum 1165522. Affected Cortex-A76 cores (r0p0, r1p0, r2p0) could end-up with corrupted TLBs by speculating an AT instruction during a guest @@ -504,7 +504,7 @@ config ARM64_ERRATUM_1286807 default y select ARM64_WORKAROUND_REPEAT_TLBI help - This option adds workaround for ARM Cortex-A76 erratum 1286807 + This option adds a workaround for ARM Cortex-A76 erratum 1286807. 
On the affected Cortex-A76 cores (r0p0 to r3p0), if a virtual address for a cacheable mapping of a location is being @@ -521,10 +521,10 @@ config CAVIUM_ERRATUM_22375 bool "Cavium erratum 22375, 24313" default y help - Enable workaround for erratum 22375, 24313. + Enable workaround for errata 22375 and 24313. This implements two gicv3-its errata workarounds for ThunderX. Both - with small impact affecting only ITS table allocation. + with a small impact affecting only ITS table allocation. erratum 22375: only alloc 8MB table size erratum 24313: ignore memory access type @@ -588,9 +588,6 @@ config QCOM_FALKOR_ERRATUM_1003 config ARM64_WORKAROUND_REPEAT_TLBI bool - help - Enable the repeat TLBI workaround for Falkor erratum 1009 and - Cortex-A76 erratum 1286807. config QCOM_FALKOR_ERRATUM_1009 bool "Falkor E1009: Prematurely complete a DSB after a TLBI" @@ -626,7 +623,7 @@ config HISILICON_ERRATUM_161600802 bool "Hip07 161600802: Erroneous redistributor VLPI base" default y help - The HiSilicon Hip07 SoC usees the wrong redistributor base + The HiSilicon Hip07 SoC uses the wrong redistributor base when issued ITS commands such as VMOVP and VMAPP, and requires a 128kB offset to be applied to the target address in this commands. @@ -646,7 +643,7 @@ config FUJITSU_ERRATUM_010001 bool "Fujitsu-A64FX erratum E#010001: Undefined fault may occur wrongly" default y help - This option adds workaround for Fujitsu-A64FX erratum E#010001. + This option adds a workaround for Fujitsu-A64FX erratum E#010001. On some variants of the Fujitsu-A64FX cores ver(1.0, 1.1), memory accesses may cause undefined fault (Data abort, DFSC=0b111111). This fault occurs under a specific hardware condition when a @@ -657,7 +654,7 @@ config FUJITSU_ERRATUM_010001 case-4 TTBR1_EL2 with TCR_EL2.NFD1 == 1. The workaround is to ensure these bits are clear in TCR_ELx. - The workaround only affect the Fujitsu-A64FX. + The workaround only affects the Fujitsu-A64FX. If unsure, say Y. @@ -889,6 +886,9 @@ config ARCH_WANT_HUGE_PMD_SHARE config ARCH_HAS_CACHE_LINE_SIZE def_bool y +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + def_bool y if PGTABLE_LEVELS > 2 + config SECCOMP bool "Enable seccomp to safely compute untrusted bytecode" ---help--- @@ -1078,9 +1078,65 @@ config RODATA_FULL_DEFAULT_ENABLED This requires the linear region to be mapped down to pages, which may adversely affect performance in some cases. +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + +menuconfig COMPAT + bool "Kernel support for 32-bit EL0" + depends on ARM64_4K_PAGES || EXPERT + select COMPAT_BINFMT_ELF if BINFMT_ELF + select HAVE_UID16 + select OLD_SIGSUSPEND3 + select COMPAT_OLD_SIGACTION + help + This option enables support for a 32-bit EL0 running under a 64-bit + kernel at EL1. AArch32-specific components such as system calls, + the user helper functions, VFP support and the ptrace interface are + handled appropriately by the kernel. + + If you use a page size other than 4KB (i.e, 16KB or 64KB), please be aware + that you will only be able to execute AArch32 binaries that were compiled + with page size aligned segments. + + If you want to execute 32-bit userspace applications, say Y. 
+ +if COMPAT + +config KUSER_HELPERS + bool "Enable kuser helpers page for 32-bit applications" + default y + help + Warning: disabling this option may break 32-bit user programs. + + Provide kuser helpers to compat tasks. The kernel provides + helper code to userspace in read-only form at a fixed location + to allow userspace to be independent of the CPU type fitted to + the system. This permits binaries to be run on ARMv4 through + to ARMv8 without modification. + + See Documentation/arm/kernel_user_helpers.txt for details. + + However, the fixed address nature of these helpers can be used + by ROP (return oriented programming) authors when creating + exploits. + + If all of the binaries and libraries which run on your platform + are built specifically for your platform, and make no use of + these helpers, then you can turn this option off to hinder + such exploits. However, in that case, if a binary or library + relying on those helpers is run, it will not function correctly. + + Say N here only if you are absolutely certain that you do not + need these helpers; otherwise, the safe option is to say Y. + + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" - depends on COMPAT depends on SYSCTL help Legacy software support may require certain instructions @@ -1146,13 +1202,7 @@ config SETEND_EMULATION If unsure, say Y endif -config ARM64_SW_TTBR0_PAN - bool "Emulate Privileged Access Never using TTBR0_EL1 switching" - help - Enabling this option prevents the kernel from accessing - user-space memory directly by pointing TTBR0_EL1 to a reserved - zeroed area and reserved ASID. The user access routines - restore the valid TTBR0_EL1 temporarily. +endif menu "ARMv8.1 architectural features" @@ -1318,6 +1368,9 @@ config ARM64_SVE To enable use of this extension on CPUs that implement it, say Y. + On CPUs that support the SVE2 extensions, this option will enable + those too. + Note that for architectural reasons, firmware _must_ implement SVE support when running on SVE capable hardware. The required support is present in: @@ -1351,7 +1404,7 @@ config ARM64_PSEUDO_NMI help Adds support for mimicking Non-Maskable Interrupts through the use of GIC interrupt priority. This support requires version 3 or later of - Arm GIC. + ARM GIC. This high priority configuration for interrupts needs to be explicitly enabled by setting the kernel parameter @@ -1475,25 +1528,6 @@ config DMI endmenu -config COMPAT - bool "Kernel support for 32-bit EL0" - depends on ARM64_4K_PAGES || EXPERT - select COMPAT_BINFMT_ELF if BINFMT_ELF - select HAVE_UID16 - select OLD_SIGSUSPEND3 - select COMPAT_OLD_SIGACTION - help - This option enables support for a 32-bit EL0 running under a 64-bit - kernel at EL1. AArch32-specific components such as system calls, - the user helper functions, VFP support and the ptrace interface are - handled appropriately by the kernel. - - If you use a page size other than 4KB (i.e, 16KB or 64KB), please be aware - that you will only be able to execute AArch32 binaries that were compiled - with page size aligned segments. - - If you want to execute 32-bit userspace applications, say Y.
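The KUSER_HELPERS entry added in this hunk is the compat counterpart of the 32-bit ARM kuser helper page: a handful of entry points exported at architecturally fixed addresses near the top of the 32-bit address space. Below is a minimal sketch of how an AArch32 binary consumes one of them; the 0xffff0fe0 address of __kuser_get_tls is the ABI documented in Documentation/arm/kernel_user_helpers.txt, but the surrounding code is illustrative only:

    /* Build as an AArch32 binary; the helper page exists only for
     * compat tasks, which is why disabling KUSER_HELPERS can break
     * 32-bit programs. */
    typedef void *(*kuser_get_tls_fn)(void);

    static void *read_tls_via_kuser(void)
    {
            /* __kuser_get_tls: fixed address, returns the TLS pointer. */
            kuser_get_tls_fn helper = (kuser_get_tls_fn)0xffff0fe0;
            return helper();
    }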
- config SYSVIPC_COMPAT def_bool y depends on COMPAT && SYSVIPC diff --git a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi index cd7c76e58b09..a2cec6218211 100644 --- a/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi +++ b/arch/arm64/boot/dts/altera/socfpga_stratix10.dtsi @@ -534,11 +534,12 @@ }; eccmgr { - compatible = "altr,socfpga-a10-ecc-manager"; + compatible = "altr,socfpga-s10-ecc-manager", + "altr,socfpga-a10-ecc-manager"; altr,sysmgr-syscon = <&sysmgr>; #address-cells = <1>; #size-cells = <1>; - interrupts = <0 15 4>, <0 95 4>; + interrupts = <0 15 4>; interrupt-controller; #interrupt-cells = <2>; ranges; @@ -546,31 +547,31 @@ sdramedac { compatible = "altr,sdram-edac-s10"; altr,sdr-syscon = <&sdr>; - interrupts = <16 4>, <48 4>; + interrupts = <16 4>; }; usb0-ecc@ff8c4000 { - compatible = "altr,socfpga-usb-ecc"; + compatible = "altr,socfpga-s10-usb-ecc", + "altr,socfpga-usb-ecc"; reg = <0xff8c4000 0x100>; altr,ecc-parent = <&usb0>; - interrupts = <2 4>, - <34 4>; + interrupts = <2 4>; }; emac0-rx-ecc@ff8c0000 { - compatible = "altr,socfpga-eth-mac-ecc"; + compatible = "altr,socfpga-s10-eth-mac-ecc", + "altr,socfpga-eth-mac-ecc"; reg = <0xff8c0000 0x100>; altr,ecc-parent = <&gmac0>; - interrupts = <4 4>, - <36 4>; + interrupts = <4 4>; }; emac0-tx-ecc@ff8c0400 { - compatible = "altr,socfpga-eth-mac-ecc"; + compatible = "altr,socfpga-s10-eth-mac-ecc", + "altr,socfpga-eth-mac-ecc"; reg = <0xff8c0400 0x100>; altr,ecc-parent = <&gmac0>; - interrupts = <5 4>, - <37 4>; + interrupts = <5 4>; }; }; diff --git a/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h b/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h index 1b4cb0c55744..385c455a7c98 100644 --- a/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h +++ b/arch/arm64/boot/dts/mediatek/mt2712-pinfunc.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2018 MediaTek Inc. 
* Author: Zhiyong Tao <zhiyong.tao@mediatek.com> diff --git a/arch/arm64/crypto/aes-ce-ccm-glue.c b/arch/arm64/crypto/aes-ce-ccm-glue.c index 5fc6f51908fd..cb89c80800b5 100644 --- a/arch/arm64/crypto/aes-ce-ccm-glue.c +++ b/arch/arm64/crypto/aes-ce-ccm-glue.c @@ -14,6 +14,7 @@ #include <crypto/aes.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/module.h> @@ -109,7 +110,7 @@ static int ccm_init_mac(struct aead_request *req, u8 maciv[], u32 msglen) static void ccm_update_mac(struct crypto_aes_ctx *key, u8 mac[], u8 const in[], u32 abytes, u32 *macp) { - if (may_use_simd()) { + if (crypto_simd_usable()) { kernel_neon_begin(); ce_aes_ccm_auth_data(mac, in, abytes, macp, key->key_enc, num_rounds(key)); @@ -255,7 +256,7 @@ static int ccm_encrypt(struct aead_request *req) err = skcipher_walk_aead_encrypt(&walk, req, false); - if (may_use_simd()) { + if (crypto_simd_usable()) { while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; @@ -313,7 +314,7 @@ static int ccm_decrypt(struct aead_request *req) err = skcipher_walk_aead_decrypt(&walk, req, false); - if (may_use_simd()) { + if (crypto_simd_usable()) { while (walk.nbytes) { u32 tail = walk.nbytes % AES_BLOCK_SIZE; @@ -372,7 +373,7 @@ static struct aead_alg ccm_aes_alg = { static int __init aes_mod_init(void) { - if (!(elf_hwcap & HWCAP_AES)) + if (!cpu_have_named_feature(AES)) return -ENODEV; return crypto_register_aead(&ccm_aes_alg); } diff --git a/arch/arm64/crypto/aes-ce-glue.c b/arch/arm64/crypto/aes-ce-glue.c index e6b3227bbf57..3213843fcb46 100644 --- a/arch/arm64/crypto/aes-ce-glue.c +++ b/arch/arm64/crypto/aes-ce-glue.c @@ -12,6 +12,7 @@ #include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/aes.h> +#include <crypto/internal/simd.h> #include <linux/cpufeature.h> #include <linux/crypto.h> #include <linux/module.h> @@ -52,7 +53,7 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx)); return; } @@ -66,7 +67,7 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) { struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx)); return; } diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 1e676625ef33..f0ceb545bd1e 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -405,7 +405,7 @@ static int ctr_encrypt_sync(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct crypto_aes_ctx *ctx = crypto_skcipher_ctx(tfm); - if (!may_use_simd()) + if (!crypto_simd_usable()) return aes_ctr_encrypt_fallback(ctx, req); return ctr_encrypt(req); @@ -642,7 +642,7 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks, { int rounds = 6 + ctx->key_length / 4; - if (may_use_simd()) { + if (crypto_simd_usable()) { kernel_neon_begin(); aes_mac_update(in, ctx->key_enc, rounds, blocks, dg, enc_before, enc_after); @@ -707,7 +707,7 @@ static int cbcmac_final(struct shash_desc *desc, u8 *out) struct mac_tfm_ctx *tctx = crypto_shash_ctx(desc->tfm); struct mac_desc_ctx *ctx = shash_desc_ctx(desc); - mac_do_update(&tctx->key, NULL, 0, ctx->dg, 1, 0); + mac_do_update(&tctx->key, NULL, 0, ctx->dg, 
(ctx->len != 0), 0); memcpy(out, ctx->dg, AES_BLOCK_SIZE); diff --git a/arch/arm64/crypto/aes-neonbs-glue.c b/arch/arm64/crypto/aes-neonbs-glue.c index e7a95a566462..02b65d9eb947 100644 --- a/arch/arm64/crypto/aes-neonbs-glue.c +++ b/arch/arm64/crypto/aes-neonbs-glue.c @@ -288,7 +288,7 @@ static int ctr_encrypt_sync(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct aesbs_ctr_ctx *ctx = crypto_skcipher_ctx(tfm); - if (!may_use_simd()) + if (!crypto_simd_usable()) return aes_ctr_encrypt_fallback(&ctx->fallback, req); return ctr_encrypt(req); @@ -304,6 +304,8 @@ static int __xts_crypt(struct skcipher_request *req, int err; err = skcipher_walk_virt(&walk, req, false); + if (err) + return err; kernel_neon_begin(); neon_aes_ecb_encrypt(walk.iv, walk.iv, ctx->twkey, ctx->key.rounds, 1); @@ -440,7 +442,7 @@ static int __init aes_init(void) int err; int i; - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; err = crypto_register_skciphers(aes_algs, ARRAY_SIZE(aes_algs)); diff --git a/arch/arm64/crypto/chacha-neon-glue.c b/arch/arm64/crypto/chacha-neon-glue.c index bece1d85bd81..82029cda2e77 100644 --- a/arch/arm64/crypto/chacha-neon-glue.c +++ b/arch/arm64/crypto/chacha-neon-glue.c @@ -21,6 +21,7 @@ #include <crypto/algapi.h> #include <crypto/chacha.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> @@ -90,7 +91,7 @@ static int chacha_neon(struct skcipher_request *req) struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm); - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_chacha_crypt(req); return chacha_neon_stream_xor(req, ctx, req->iv); @@ -104,7 +105,7 @@ static int xchacha_neon(struct skcipher_request *req) u32 state[16]; u8 real_iv[16]; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !may_use_simd()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_xchacha_crypt(req); crypto_chacha_init(state, ctx, req->iv); @@ -173,7 +174,7 @@ static struct skcipher_alg algs[] = { static int __init chacha_simd_mod_init(void) { - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; return crypto_register_skciphers(algs, ARRAY_SIZE(algs)); diff --git a/arch/arm64/crypto/crct10dif-ce-glue.c b/arch/arm64/crypto/crct10dif-ce-glue.c index dd325829ee44..2e0a7d2eee24 100644 --- a/arch/arm64/crypto/crct10dif-ce-glue.c +++ b/arch/arm64/crypto/crct10dif-ce-glue.c @@ -16,6 +16,7 @@ #include <linux/string.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <asm/neon.h> #include <asm/simd.h> @@ -38,7 +39,7 @@ static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data, { u16 *crc = shash_desc_ctx(desc); - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); *crc = crc_t10dif_pmull_p8(*crc, data, length); kernel_neon_end(); @@ -54,7 +55,7 @@ static int crct10dif_update_pmull_p64(struct shash_desc *desc, const u8 *data, { u16 *crc = shash_desc_ctx(desc); - if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && may_use_simd()) { + if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) { kernel_neon_begin(); *crc = crc_t10dif_pmull_p64(*crc, data, length); kernel_neon_end(); @@ -101,7 +102,7 @@ static struct shash_alg 
crc_t10dif_alg[] = {{ static int __init crc_t10dif_mod_init(void) { - if (elf_hwcap & HWCAP_PMULL) + if (cpu_have_named_feature(PMULL)) return crypto_register_shashes(crc_t10dif_alg, ARRAY_SIZE(crc_t10dif_alg)); else @@ -111,7 +112,7 @@ static int __init crc_t10dif_mod_init(void) static void __exit crc_t10dif_mod_exit(void) { - if (elf_hwcap & HWCAP_PMULL) + if (cpu_have_named_feature(PMULL)) crypto_unregister_shashes(crc_t10dif_alg, ARRAY_SIZE(crc_t10dif_alg)); else diff --git a/arch/arm64/crypto/ghash-ce-glue.c b/arch/arm64/crypto/ghash-ce-glue.c index 791ad422c427..b39ed99b06fb 100644 --- a/arch/arm64/crypto/ghash-ce-glue.c +++ b/arch/arm64/crypto/ghash-ce-glue.c @@ -17,6 +17,7 @@ #include <crypto/gf128mul.h> #include <crypto/internal/aead.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/cpufeature.h> @@ -89,7 +90,7 @@ static void ghash_do_update(int blocks, u64 dg[], const char *src, struct ghash_key const *k, const char *head)) { - if (likely(may_use_simd())) { + if (likely(crypto_simd_usable())) { kernel_neon_begin(); simd_update(blocks, dg, src, key, head); kernel_neon_end(); @@ -441,7 +442,7 @@ static int gcm_encrypt(struct aead_request *req) err = skcipher_walk_aead_encrypt(&walk, req, false); - if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) { u32 const *rk = NULL; kernel_neon_begin(); @@ -473,9 +474,11 @@ static int gcm_encrypt(struct aead_request *req) put_unaligned_be32(2, iv + GCM_IV_SIZE); while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + const int blocks = + walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; u8 *dst = walk.dst.virt.addr; u8 *src = walk.src.virt.addr; + int remaining = blocks; do { __aes_arm64_encrypt(ctx->aes_key.key_enc, @@ -485,9 +488,9 @@ static int gcm_encrypt(struct aead_request *req) dst += AES_BLOCK_SIZE; src += AES_BLOCK_SIZE; - } while (--blocks > 0); + } while (--remaining > 0); - ghash_do_update(walk.nbytes / AES_BLOCK_SIZE, dg, + ghash_do_update(blocks, dg, walk.dst.virt.addr, &ctx->ghash_key, NULL, pmull_ghash_update_p64); @@ -563,7 +566,7 @@ static int gcm_decrypt(struct aead_request *req) err = skcipher_walk_aead_decrypt(&walk, req, false); - if (likely(may_use_simd() && walk.total >= 2 * AES_BLOCK_SIZE)) { + if (likely(crypto_simd_usable() && walk.total >= 2 * AES_BLOCK_SIZE)) { u32 const *rk = NULL; kernel_neon_begin(); @@ -609,7 +612,7 @@ static int gcm_decrypt(struct aead_request *req) put_unaligned_be32(2, iv + GCM_IV_SIZE); while (walk.nbytes >= (2 * AES_BLOCK_SIZE)) { - int blocks = walk.nbytes / AES_BLOCK_SIZE; + int blocks = walk.nbytes / (2 * AES_BLOCK_SIZE) * 2; u8 *dst = walk.dst.virt.addr; u8 *src = walk.src.virt.addr; @@ -704,10 +707,10 @@ static int __init ghash_ce_mod_init(void) { int ret; - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; - if (elf_hwcap & HWCAP_PMULL) + if (cpu_have_named_feature(PMULL)) ret = crypto_register_shashes(ghash_alg, ARRAY_SIZE(ghash_alg)); else @@ -717,7 +720,7 @@ static int __init ghash_ce_mod_init(void) if (ret) return ret; - if (elf_hwcap & HWCAP_PMULL) { + if (cpu_have_named_feature(PMULL)) { ret = crypto_register_aead(&gcm_aes_alg); if (ret) crypto_unregister_shashes(ghash_alg, @@ -728,7 +731,7 @@ static int __init ghash_ce_mod_init(void) static void __exit ghash_ce_mod_exit(void) { - if (elf_hwcap & HWCAP_PMULL) + if 
(cpu_have_named_feature(PMULL)) crypto_unregister_shashes(ghash_alg, ARRAY_SIZE(ghash_alg)); else crypto_unregister_shash(ghash_alg); diff --git a/arch/arm64/crypto/nhpoly1305-neon-glue.c b/arch/arm64/crypto/nhpoly1305-neon-glue.c index 22cc32ac9448..895d3727c1fb 100644 --- a/arch/arm64/crypto/nhpoly1305-neon-glue.c +++ b/arch/arm64/crypto/nhpoly1305-neon-glue.c @@ -9,6 +9,7 @@ #include <asm/neon.h> #include <asm/simd.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> @@ -25,7 +26,7 @@ static void _nh_neon(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_neon_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !may_use_simd()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { @@ -56,7 +57,7 @@ static struct shash_alg nhpoly1305_alg = { static int __init nhpoly1305_mod_init(void) { - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) return -ENODEV; return crypto_register_shash(&nhpoly1305_alg); diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index 17fac2889f56..eaa7a8258f1c 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -12,6 +12,7 @@ #include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> #include <linux/cpufeature.h> @@ -38,7 +39,7 @@ static int sha1_ce_update(struct shash_desc *desc, const u8 *data, { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha1_update(desc, data, len); sctx->finalize = 0; @@ -56,7 +57,7 @@ static int sha1_ce_finup(struct shash_desc *desc, const u8 *data, struct sha1_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA1_BLOCK_SIZE); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); /* @@ -78,7 +79,7 @@ static int sha1_ce_final(struct shash_desc *desc, u8 *out) { struct sha1_ce_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha1_finup(desc, NULL, 0, out); sctx->finalize = 0; diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 261f5195cab7..a725997e55f2 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -12,6 +12,7 @@ #include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> #include <linux/cpufeature.h> @@ -42,7 +43,7 @@ static int sha256_ce_update(struct shash_desc *desc, const u8 *data, { struct sha256_ce_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); @@ -61,7 +62,7 @@ static int sha256_ce_finup(struct shash_desc *desc, const u8 *data, struct sha256_ce_state *sctx = shash_desc_ctx(desc); bool finalize = !sctx->sst.count && !(len % SHA256_BLOCK_SIZE); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { if (len) sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); @@ -90,7 +91,7 @@ static int sha256_ce_final(struct shash_desc *desc, u8 *out) { struct sha256_ce_state *sctx = shash_desc_ctx(desc); - if 
(!may_use_simd()) { + if (!crypto_simd_usable()) { sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_block_data_order); return sha256_base_finish(desc, out); diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index 4aedeaefd61f..e62298740e31 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -14,6 +14,7 @@ #include <asm/neon.h> #include <asm/simd.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> #include <linux/cryptohash.h> @@ -89,7 +90,7 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); @@ -119,7 +120,7 @@ static int sha256_update_neon(struct shash_desc *desc, const u8 *data, static int sha256_finup_neon(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) { + if (!crypto_simd_usable()) { if (len) sha256_base_do_update(desc, data, len, (sha256_block_fn *)sha256_block_data_order); @@ -173,7 +174,7 @@ static int __init sha256_mod_init(void) if (ret) return ret; - if (elf_hwcap & HWCAP_ASIMD) { + if (cpu_have_named_feature(ASIMD)) { ret = crypto_register_shashes(neon_algs, ARRAY_SIZE(neon_algs)); if (ret) crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); @@ -183,7 +184,7 @@ static int __init sha256_mod_init(void) static void __exit sha256_mod_fini(void) { - if (elf_hwcap & HWCAP_ASIMD) + if (cpu_have_named_feature(ASIMD)) crypto_unregister_shashes(neon_algs, ARRAY_SIZE(neon_algs)); crypto_unregister_shashes(algs, ARRAY_SIZE(algs)); } diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c index a336feac0f59..9a4bbfc45f40 100644 --- a/arch/arm64/crypto/sha3-ce-glue.c +++ b/arch/arm64/crypto/sha3-ce-glue.c @@ -14,6 +14,7 @@ #include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha3.h> #include <linux/cpufeature.h> #include <linux/crypto.h> @@ -32,7 +33,7 @@ static int sha3_update(struct shash_desc *desc, const u8 *data, struct sha3_state *sctx = shash_desc_ctx(desc); unsigned int digest_size = crypto_shash_digestsize(desc->tfm); - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha3_update(desc, data, len); if ((sctx->partial + len) >= sctx->rsiz) { @@ -76,7 +77,7 @@ static int sha3_final(struct shash_desc *desc, u8 *out) __le64 *digest = (__le64 *)out; int i; - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sha3_final(desc, out); sctx->buf[sctx->partial++] = 0x06; diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index f2c5f28c622a..2369540040aa 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -13,6 +13,7 @@ #include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sha.h> #include <crypto/sha512_base.h> #include <linux/cpufeature.h> @@ -31,7 +32,7 @@ asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks); static int sha512_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return sha512_base_do_update(desc, data, len, (sha512_block_fn *)sha512_block_data_order); @@ -46,7 +47,7 @@ static int 
sha512_ce_update(struct shash_desc *desc, const u8 *data, static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) { + if (!crypto_simd_usable()) { if (len) sha512_base_do_update(desc, data, len, (sha512_block_fn *)sha512_block_data_order); @@ -65,7 +66,7 @@ static int sha512_ce_finup(struct shash_desc *desc, const u8 *data, static int sha512_ce_final(struct shash_desc *desc, u8 *out) { - if (!may_use_simd()) { + if (!crypto_simd_usable()) { sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_block_data_order); return sha512_base_finish(desc, out); diff --git a/arch/arm64/crypto/sm3-ce-glue.c b/arch/arm64/crypto/sm3-ce-glue.c index 88938a20d9b2..5d15533799a2 100644 --- a/arch/arm64/crypto/sm3-ce-glue.c +++ b/arch/arm64/crypto/sm3-ce-glue.c @@ -12,6 +12,7 @@ #include <asm/simd.h> #include <asm/unaligned.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/sm3.h> #include <crypto/sm3_base.h> #include <linux/cpufeature.h> @@ -28,7 +29,7 @@ asmlinkage void sm3_ce_transform(struct sm3_state *sst, u8 const *src, static int sm3_ce_update(struct shash_desc *desc, const u8 *data, unsigned int len) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sm3_update(desc, data, len); kernel_neon_begin(); @@ -40,7 +41,7 @@ static int sm3_ce_update(struct shash_desc *desc, const u8 *data, static int sm3_ce_final(struct shash_desc *desc, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sm3_finup(desc, NULL, 0, out); kernel_neon_begin(); @@ -53,7 +54,7 @@ static int sm3_ce_final(struct shash_desc *desc, u8 *out) static int sm3_ce_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out) { - if (!may_use_simd()) + if (!crypto_simd_usable()) return crypto_sm3_finup(desc, data, len, out); kernel_neon_begin(); diff --git a/arch/arm64/crypto/sm4-ce-glue.c b/arch/arm64/crypto/sm4-ce-glue.c index 0c4fc223f225..2754c875d39c 100644 --- a/arch/arm64/crypto/sm4-ce-glue.c +++ b/arch/arm64/crypto/sm4-ce-glue.c @@ -3,6 +3,7 @@ #include <asm/neon.h> #include <asm/simd.h> #include <crypto/sm4.h> +#include <crypto/internal/simd.h> #include <linux/module.h> #include <linux/cpufeature.h> #include <linux/crypto.h> @@ -20,7 +21,7 @@ static void sm4_ce_encrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { crypto_sm4_encrypt(tfm, out, in); } else { kernel_neon_begin(); @@ -33,7 +34,7 @@ static void sm4_ce_decrypt(struct crypto_tfm *tfm, u8 *out, const u8 *in) { const struct crypto_sm4_ctx *ctx = crypto_tfm_ctx(tfm); - if (!may_use_simd()) { + if (!crypto_simd_usable()) { crypto_sm4_decrypt(tfm, out, in); } else { kernel_neon_begin(); diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 1e17ea5c372b..eb0df239a759 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -13,10 +13,10 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += msi.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += rwsem.h generic-y += segment.h generic-y += serial.h generic-y += set_memory.h diff --git a/arch/arm64/include/asm/arch_timer.h b/arch/arm64/include/asm/arch_timer.h index f2a234d6516c..b7bca1ae09e6 100644 --- a/arch/arm64/include/asm/arch_timer.h +++ b/arch/arm64/include/asm/arch_timer.h @@ -31,11 +31,23 @@ #include 
<clocksource/arm_arch_timer.h> #if IS_ENABLED(CONFIG_ARM_ARCH_TIMER_OOL_WORKAROUND) -extern struct static_key_false arch_timer_read_ool_enabled; -#define needs_unstable_timer_counter_workaround() \ - static_branch_unlikely(&arch_timer_read_ool_enabled) +#define has_erratum_handler(h) \ + ({ \ + const struct arch_timer_erratum_workaround *__wa; \ + __wa = __this_cpu_read(timer_unstable_counter_workaround); \ + (__wa && __wa->h); \ + }) + +#define erratum_handler(h) \ + ({ \ + const struct arch_timer_erratum_workaround *__wa; \ + __wa = __this_cpu_read(timer_unstable_counter_workaround); \ + (__wa && __wa->h) ? __wa->h : arch_timer_##h; \ + }) + #else -#define needs_unstable_timer_counter_workaround() false +#define has_erratum_handler(h) false +#define erratum_handler(h) (arch_timer_##h) #endif enum arch_timer_erratum_match_type { @@ -61,23 +73,37 @@ struct arch_timer_erratum_workaround { DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *, timer_unstable_counter_workaround); +/* inline sysreg accessors that make erratum_handler() work */ +static inline notrace u32 arch_timer_read_cntp_tval_el0(void) +{ + return read_sysreg(cntp_tval_el0); +} + +static inline notrace u32 arch_timer_read_cntv_tval_el0(void) +{ + return read_sysreg(cntv_tval_el0); +} + +static inline notrace u64 arch_timer_read_cntpct_el0(void) +{ + return read_sysreg(cntpct_el0); +} + +static inline notrace u64 arch_timer_read_cntvct_el0(void) +{ + return read_sysreg(cntvct_el0); +} + #define arch_timer_reg_read_stable(reg) \ -({ \ - u64 _val; \ - if (needs_unstable_timer_counter_workaround()) { \ - const struct arch_timer_erratum_workaround *wa; \ + ({ \ + u64 _val; \ + \ preempt_disable_notrace(); \ - wa = __this_cpu_read(timer_unstable_counter_workaround); \ - if (wa && wa->read_##reg) \ - _val = wa->read_##reg(); \ - else \ - _val = read_sysreg(reg); \ + _val = erratum_handler(read_ ## reg)(); \ preempt_enable_notrace(); \ - } else { \ - _val = read_sysreg(reg); \ - } \ - _val; \ -}) + \ + _val; \ + }) /* * These register accessors are marked inline so the compiler can @@ -148,18 +174,67 @@ static inline void arch_timer_set_cntkctl(u32 cntkctl) isb(); } -static inline u64 arch_counter_get_cntpct(void) +/* + * Ensure that reads of the counter are treated the same as memory reads + * for the purposes of ordering by subsequent memory barriers. + * + * This insanity brought to you by speculative system register reads, + * out-of-order memory accesses, sequence locks and Thomas Gleixner. 
+ * + * http://lists.infradead.org/pipermail/linux-arm-kernel/2019-February/631195.html + */ +#define arch_counter_enforce_ordering(val) do { \ + u64 tmp, _val = (val); \ + \ + asm volatile( \ + " eor %0, %1, %1\n" \ + " add %0, sp, %0\n" \ + " ldr xzr, [%0]" \ + : "=r" (tmp) : "r" (_val)); \ +} while (0) + +static inline u64 __arch_counter_get_cntpct_stable(void) +{ + u64 cnt; + + isb(); + cnt = arch_timer_reg_read_stable(cntpct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; +} + +static inline u64 __arch_counter_get_cntpct(void) { + u64 cnt; + isb(); - return arch_timer_reg_read_stable(cntpct_el0); + cnt = read_sysreg(cntpct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; } -static inline u64 arch_counter_get_cntvct(void) +static inline u64 __arch_counter_get_cntvct_stable(void) { + u64 cnt; + isb(); - return arch_timer_reg_read_stable(cntvct_el0); + cnt = arch_timer_reg_read_stable(cntvct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; } +static inline u64 __arch_counter_get_cntvct(void) +{ + u64 cnt; + + isb(); + cnt = read_sysreg(cntvct_el0); + arch_counter_enforce_ordering(cnt); + return cnt; +} + +#undef arch_counter_enforce_ordering + static inline int arch_timer_arch_init(void) { return 0; diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index c5308d01e228..039fbd822ec6 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -407,10 +407,14 @@ alternative_endif .ifc \op, cvap sys 3, c7, c12, 1, \kaddr // dc cvap .else + .ifc \op, cvadp + sys 3, c7, c13, 1, \kaddr // dc cvadp + .else dc \op, \kaddr .endif .endif .endif + .endif add \kaddr, \kaddr, \tmp1 cmp \kaddr, \size b.lo 9998b @@ -442,8 +446,8 @@ USER(\label, ic ivau, \tmp2) // invalidate I line PoU * reset_pmuserenr_el0 - reset PMUSERENR_EL0 if PMUv3 present */ .macro reset_pmuserenr_el0, tmpreg - mrs \tmpreg, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx \tmpreg, \tmpreg, #8, #4 + mrs \tmpreg, id_aa64dfr0_el1 + sbfx \tmpreg, \tmpreg, #ID_AA64DFR0_PMUVER_SHIFT, #4 cmp \tmpreg, #1 // Skip if no PMU present b.lt 9000f msr pmuserenr_el0, xzr // Disable PMU access from EL0 diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index f66bb04fdf2d..85b6bedbcc68 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -20,6 +20,8 @@ #ifndef __ASSEMBLY__ +#include <linux/kasan-checks.h> + #define __nops(n) ".rept " #n "\nnop\n.endr\n" #define nops(n) asm volatile(__nops(n)) @@ -72,31 +74,33 @@ static inline unsigned long array_index_mask_nospec(unsigned long idx, #define __smp_store_release(p, v) \ do { \ + typeof(p) __p = (p); \ union { typeof(*p) __val; char __c[1]; } __u = \ - { .__val = (__force typeof(*p)) (v) }; \ + { .__val = (__force typeof(*p)) (v) }; \ compiletime_assert_atomic_type(*p); \ + kasan_check_write(__p, sizeof(*p)); \ switch (sizeof(*p)) { \ case 1: \ asm volatile ("stlrb %w1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u8 *)__u.__c) \ : "memory"); \ break; \ case 2: \ asm volatile ("stlrh %w1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u16 *)__u.__c) \ : "memory"); \ break; \ case 4: \ asm volatile ("stlr %w1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u32 *)__u.__c) \ : "memory"); \ break; \ case 8: \ asm volatile ("stlr %1, %0" \ - : "=Q" (*p) \ + : "=Q" (*__p) \ : "r" (*(__u64 *)__u.__c) \ : "memory"); \ break; \ @@ -106,27 +110,29 @@ do { \ #define __smp_load_acquire(p) \ ({ \ union { typeof(*p) __val; char __c[1]; } 
__u; \ + typeof(p) __p = (p); \ compiletime_assert_atomic_type(*p); \ + kasan_check_read(__p, sizeof(*p)); \ switch (sizeof(*p)) { \ case 1: \ asm volatile ("ldarb %w0, %1" \ : "=r" (*(__u8 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ case 2: \ asm volatile ("ldarh %w0, %1" \ : "=r" (*(__u16 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ case 4: \ asm volatile ("ldar %w0, %1" \ : "=r" (*(__u32 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ case 8: \ asm volatile ("ldar %0, %1" \ : "=r" (*(__u64 *)__u.__c) \ - : "Q" (*p) : "memory"); \ + : "Q" (*__p) : "memory"); \ break; \ } \ __u.__val; \ diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h index 2945fe6cd863..d84294064e6a 100644 --- a/arch/arm64/include/asm/brk-imm.h +++ b/arch/arm64/include/asm/brk-imm.h @@ -11,6 +11,8 @@ /* * #imm16 values used for BRK instruction generation + * 0x004: for installing kprobes + * 0x005: for installing uprobes * Allowed values for kgdb are 0x400 - 0x7ff * 0x100: for triggering a fault on purpose (reserved) * 0x400: for dynamic BRK instruction @@ -18,10 +20,13 @@ * 0x800: kernel-mode BUG() and WARN() traps * 0x9xx: tag-based KASAN trap (allowed values 0x900 - 0x9ff) */ +#define KPROBES_BRK_IMM 0x004 +#define UPROBES_BRK_IMM 0x005 #define FAULT_BRK_IMM 0x100 #define KGDB_DYN_DBG_BRK_IMM 0x400 #define KGDB_COMPILED_DBG_BRK_IMM 0x401 #define BUG_BRK_IMM 0x800 #define KASAN_BRK_IMM 0x900 +#define KASAN_BRK_MASK 0x0ff #endif diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index f6a76e43f39e..defdc67d9ab4 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -61,7 +61,8 @@ #define ARM64_HAS_GENERIC_AUTH_ARCH 40 #define ARM64_HAS_GENERIC_AUTH_IMP_DEF 41 #define ARM64_HAS_IRQ_PRIO_MASKING 42 +#define ARM64_HAS_DCPODP 43 -#define ARM64_NCAPS 43 +#define ARM64_NCAPS 44 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index e505e1fbd2b9..f210bcf096f7 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -14,15 +14,8 @@ #include <asm/hwcap.h> #include <asm/sysreg.h> -/* - * In the arm64 world (as in the ARM world), elf_hwcap is used both internally - * in the kernel and for user space to keep track of which optional features - * are supported by the current system. So let's map feature 'x' to HWCAP_x. - * Note that HWCAP_x constants are bit fields so we need to take the log. 
- */ - -#define MAX_CPU_FEATURES (8 * sizeof(elf_hwcap)) -#define cpu_feature(x) ilog2(HWCAP_ ## x) +#define MAX_CPU_FEATURES 64 +#define cpu_feature(x) KERNEL_HWCAP_ ## x #ifndef __ASSEMBLY__ @@ -399,11 +392,13 @@ extern DECLARE_BITMAP(boot_capabilities, ARM64_NPATCHABLE); for_each_set_bit(cap, cpu_hwcaps, ARM64_NCAPS) bool this_cpu_has_cap(unsigned int cap); +void cpu_set_feature(unsigned int num); +bool cpu_have_feature(unsigned int num); +unsigned long cpu_get_elf_hwcap(void); +unsigned long cpu_get_elf_hwcap2(void); -static inline bool cpu_have_feature(unsigned int num) -{ - return elf_hwcap & (1UL << num); -} +#define cpu_set_named_feature(name) cpu_set_feature(cpu_feature(name)) +#define cpu_have_named_feature(name) cpu_have_feature(cpu_feature(name)) /* System capability check for constant caps */ static inline bool __cpus_have_const_cap(int num) @@ -638,11 +633,7 @@ static inline int arm64_get_ssbd_state(void) #endif } -#ifdef CONFIG_ARM64_SSBD void arm64_set_ssbd_mitigation(bool state); -#else -static inline void arm64_set_ssbd_mitigation(bool state) {} -#endif extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index 5f1437099b99..2602bae334fb 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -89,6 +89,7 @@ #define ARM_CPU_PART_CORTEX_A35 0xD04 #define ARM_CPU_PART_CORTEX_A55 0xD05 #define ARM_CPU_PART_CORTEX_A76 0xD0B +#define ARM_CPU_PART_NEOVERSE_N1 0xD0C #define APM_CPU_PART_POTENZA 0x000 @@ -118,6 +119,7 @@ #define MIDR_CORTEX_A35 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A35) #define MIDR_CORTEX_A55 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A55) #define MIDR_CORTEX_A76 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A76) +#define MIDR_NEOVERSE_N1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_NEOVERSE_N1) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index a44cf5225429..0679f781696d 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -65,12 +65,9 @@ #define CACHE_FLUSH_IS_SAFE 1 /* kprobes BRK opcodes with ESR encoding */ -#define BRK64_ESR_MASK 0xFFFF -#define BRK64_ESR_KPROBES 0x0004 -#define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5)) +#define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (KPROBES_BRK_IMM << 5)) /* uprobes BRK opcodes with ESR encoding */ -#define BRK64_ESR_UPROBES 0x0005 -#define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (BRK64_ESR_UPROBES << 5)) +#define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (UPROBES_BRK_IMM << 5)) /* AArch32 */ #define DBG_ESR_EVT_BKPT 0x4 @@ -94,18 +91,24 @@ struct step_hook { int (*fn)(struct pt_regs *regs, unsigned int esr); }; -void register_step_hook(struct step_hook *hook); -void unregister_step_hook(struct step_hook *hook); +void register_user_step_hook(struct step_hook *hook); +void unregister_user_step_hook(struct step_hook *hook); + +void register_kernel_step_hook(struct step_hook *hook); +void unregister_kernel_step_hook(struct step_hook *hook); struct break_hook { struct list_head node; - u32 esr_val; - u32 esr_mask; int (*fn)(struct pt_regs *regs, unsigned int esr); + u16 imm; + 
u16 mask; /* These bits are ignored when comparing with imm */ }; -void register_break_hook(struct break_hook *hook); -void unregister_break_hook(struct break_hook *hook); +void register_user_break_hook(struct break_hook *hook); +void unregister_user_break_hook(struct break_hook *hook); + +void register_kernel_break_hook(struct break_hook *hook); +void unregister_kernel_break_hook(struct break_hook *hook); u8 debug_monitors_arch(void); diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 6adc1a90e7e6..355d120b78cb 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -214,10 +214,10 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; set_thread_flag(TIF_32BIT); \ }) #define COMPAT_ARCH_DLINFO -extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, - int uses_interp); +extern int aarch32_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); #define compat_arch_setup_additional_pages \ - aarch32_setup_vectors_page + aarch32_setup_additional_pages #endif /* CONFIG_COMPAT */ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 52233f00d53d..0e27fe91d5ea 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -156,9 +156,7 @@ ESR_ELx_WFx_ISS_WFI) /* BRK instruction trap from AArch64 state */ -#define ESR_ELx_VAL_BRK64(imm) \ - ((ESR_ELx_EC_BRK64 << ESR_ELx_EC_SHIFT) | ESR_ELx_IL | \ - ((imm) & 0xffff)) +#define ESR_ELx_BRK64_ISS_COMMENT_MASK 0xffff /* ISS field definitions for System instruction traps */ #define ESR_ELx_SYS64_ISS_RES0_SHIFT 22 @@ -198,9 +196,10 @@ /* * User space cache operations have the following sysreg encoding * in System instructions. - * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 14 }, WRITE (L=0) + * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 13, 14 }, WRITE (L=0) */ #define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVADP 13 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index cccb83ad7fa8..a56efb5626fa 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -23,26 +23,34 @@ #include <asm/errno.h> +#define FUTEX_MAX_LOOPS 128 /* What's the largest number you can think of? 
*/ + #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ do { \ + unsigned int loops = FUTEX_MAX_LOOPS; \ + \ uaccess_enable(); \ asm volatile( \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ -"2: stlxr %w3, %w0, %2\n" \ -" cbnz %w3, 1b\n" \ -" dmb ish\n" \ +"2: stlxr %w0, %w3, %2\n" \ +" cbz %w0, 3f\n" \ +" sub %w4, %w4, %w0\n" \ +" cbnz %w4, 1b\n" \ +" mov %w0, %w7\n" \ "3:\n" \ +" dmb ish\n" \ " .pushsection .fixup,\"ax\"\n" \ " .align 2\n" \ -"4: mov %w0, %w5\n" \ +"4: mov %w0, %w6\n" \ " b 3b\n" \ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ - : "r" (oparg), "Ir" (-EFAULT) \ + : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp), \ + "+r" (loops) \ + : "r" (oparg), "Ir" (-EFAULT), "Ir" (-EAGAIN) \ : "memory"); \ uaccess_disable(); \ } while (0) @@ -57,23 +65,23 @@ arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr) switch (op) { case FUTEX_OP_SET: - __futex_atomic_op("mov %w0, %w4", + __futex_atomic_op("mov %w3, %w5", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_ADD: - __futex_atomic_op("add %w0, %w1, %w4", + __futex_atomic_op("add %w3, %w1, %w5", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_OR: - __futex_atomic_op("orr %w0, %w1, %w4", + __futex_atomic_op("orr %w3, %w1, %w5", ret, oldval, uaddr, tmp, oparg); break; case FUTEX_OP_ANDN: - __futex_atomic_op("and %w0, %w1, %w4", + __futex_atomic_op("and %w3, %w1, %w5", ret, oldval, uaddr, tmp, ~oparg); break; case FUTEX_OP_XOR: - __futex_atomic_op("eor %w0, %w1, %w4", + __futex_atomic_op("eor %w3, %w1, %w5", ret, oldval, uaddr, tmp, oparg); break; default: @@ -93,6 +101,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, u32 oldval, u32 newval) { int ret = 0; + unsigned int loops = FUTEX_MAX_LOOPS; u32 val, tmp; u32 __user *uaddr; @@ -104,24 +113,30 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, asm volatile("// futex_atomic_cmpxchg_inatomic\n" " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" -" sub %w3, %w1, %w4\n" -" cbnz %w3, 3f\n" -"2: stlxr %w3, %w5, %2\n" -" cbnz %w3, 1b\n" -" dmb ish\n" +" sub %w3, %w1, %w5\n" +" cbnz %w3, 4f\n" +"2: stlxr %w3, %w6, %2\n" +" cbz %w3, 3f\n" +" sub %w4, %w4, %w3\n" +" cbnz %w4, 1b\n" +" mov %w0, %w8\n" "3:\n" +" dmb ish\n" +"4:\n" " .pushsection .fixup,\"ax\"\n" -"4: mov %w0, %w6\n" -" b 3b\n" +"5: mov %w0, %w7\n" +" b 4b\n" " .popsection\n" - _ASM_EXTABLE(1b, 4b) - _ASM_EXTABLE(2b, 4b) - : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) - : "r" (oldval), "r" (newval), "Ir" (-EFAULT) + _ASM_EXTABLE(1b, 5b) + _ASM_EXTABLE(2b, 5b) + : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) + : "r" (oldval), "r" (newval), "Ir" (-EFAULT), "Ir" (-EAGAIN) : "memory"); uaccess_disable(); - *uval = val; + if (!ret) + *uval = val; + return ret; } diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h index 400b80b49595..b4bfb6672168 100644 --- a/arch/arm64/include/asm/hwcap.h +++ b/arch/arm64/include/asm/hwcap.h @@ -17,6 +17,7 @@ #define __ASM_HWCAP_H #include <uapi/asm/hwcap.h> +#include <asm/cpufeature.h> #define COMPAT_HWCAP_HALF (1 << 1) #define COMPAT_HWCAP_THUMB (1 << 2) @@ -40,11 +41,67 @@ #define COMPAT_HWCAP2_CRC32 (1 << 4) #ifndef __ASSEMBLY__ +#include <linux/log2.h> + +/* + * For userspace we represent hwcaps as a collection of HWCAP{,2}_x bitfields + * as described in uapi/asm/hwcap.h. 
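Concretely, the scheme this comment describes is simple arithmetic: each user-visible HWCAP{,2}_x constant is a single bit, so const_ilog2() of it yields a small integer, and HWCAP2 numbers are offset by 32 so that both words share one 64-bit kernel-internal space. A hedged sketch of that mapping follows (the bit values match uapi/asm/hwcap.h; set_feature() is an analogue of cpu_set_feature(), not the kernel's implementation):

    #include <stdint.h>

    #define HWCAP_PMULL  (1 << 4)      /* userspace bitfield, first word  */
    #define HWCAP2_SVE2  (1 << 1)      /* userspace bitfield, second word */

    /* Kernel-internal numbering: log2 of the bit, plus 32 for HWCAP2. */
    #define KERNEL_HWCAP_PMULL  4          /* const_ilog2(HWCAP_PMULL)      */
    #define KERNEL_HWCAP_SVE2   (1 + 32)   /* const_ilog2(HWCAP2_SVE2) + 32 */

    static inline uint64_t set_feature(uint64_t caps, unsigned int num)
    {
            return caps | (UINT64_C(1) << num);
    }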
For the kernel we represent hwcaps as + * natural numbers (in a single range of size MAX_CPU_FEATURES) defined here + * with prefix KERNEL_HWCAP_ mapped to their HWCAP{,2}_x counterpart. + * + * Hwcaps should be set and tested within the kernel via the + * cpu_{set,have}_named_feature(feature) where feature is the unique suffix + * of KERNEL_HWCAP_{feature}. + */ +#define __khwcap_feature(x) const_ilog2(HWCAP_ ## x) +#define KERNEL_HWCAP_FP __khwcap_feature(FP) +#define KERNEL_HWCAP_ASIMD __khwcap_feature(ASIMD) +#define KERNEL_HWCAP_EVTSTRM __khwcap_feature(EVTSTRM) +#define KERNEL_HWCAP_AES __khwcap_feature(AES) +#define KERNEL_HWCAP_PMULL __khwcap_feature(PMULL) +#define KERNEL_HWCAP_SHA1 __khwcap_feature(SHA1) +#define KERNEL_HWCAP_SHA2 __khwcap_feature(SHA2) +#define KERNEL_HWCAP_CRC32 __khwcap_feature(CRC32) +#define KERNEL_HWCAP_ATOMICS __khwcap_feature(ATOMICS) +#define KERNEL_HWCAP_FPHP __khwcap_feature(FPHP) +#define KERNEL_HWCAP_ASIMDHP __khwcap_feature(ASIMDHP) +#define KERNEL_HWCAP_CPUID __khwcap_feature(CPUID) +#define KERNEL_HWCAP_ASIMDRDM __khwcap_feature(ASIMDRDM) +#define KERNEL_HWCAP_JSCVT __khwcap_feature(JSCVT) +#define KERNEL_HWCAP_FCMA __khwcap_feature(FCMA) +#define KERNEL_HWCAP_LRCPC __khwcap_feature(LRCPC) +#define KERNEL_HWCAP_DCPOP __khwcap_feature(DCPOP) +#define KERNEL_HWCAP_SHA3 __khwcap_feature(SHA3) +#define KERNEL_HWCAP_SM3 __khwcap_feature(SM3) +#define KERNEL_HWCAP_SM4 __khwcap_feature(SM4) +#define KERNEL_HWCAP_ASIMDDP __khwcap_feature(ASIMDDP) +#define KERNEL_HWCAP_SHA512 __khwcap_feature(SHA512) +#define KERNEL_HWCAP_SVE __khwcap_feature(SVE) +#define KERNEL_HWCAP_ASIMDFHM __khwcap_feature(ASIMDFHM) +#define KERNEL_HWCAP_DIT __khwcap_feature(DIT) +#define KERNEL_HWCAP_USCAT __khwcap_feature(USCAT) +#define KERNEL_HWCAP_ILRCPC __khwcap_feature(ILRCPC) +#define KERNEL_HWCAP_FLAGM __khwcap_feature(FLAGM) +#define KERNEL_HWCAP_SSBS __khwcap_feature(SSBS) +#define KERNEL_HWCAP_SB __khwcap_feature(SB) +#define KERNEL_HWCAP_PACA __khwcap_feature(PACA) +#define KERNEL_HWCAP_PACG __khwcap_feature(PACG) + +#define __khwcap2_feature(x) (const_ilog2(HWCAP2_ ## x) + 32) +#define KERNEL_HWCAP_DCPODP __khwcap2_feature(DCPODP) +#define KERNEL_HWCAP_SVE2 __khwcap2_feature(SVE2) +#define KERNEL_HWCAP_SVEAES __khwcap2_feature(SVEAES) +#define KERNEL_HWCAP_SVEPMULL __khwcap2_feature(SVEPMULL) +#define KERNEL_HWCAP_SVEBITPERM __khwcap2_feature(SVEBITPERM) +#define KERNEL_HWCAP_SVESHA3 __khwcap2_feature(SVESHA3) +#define KERNEL_HWCAP_SVESM4 __khwcap2_feature(SVESM4) + /* * This yields a mask that user programs can use to figure out what * instruction set this cpu supports. */ -#define ELF_HWCAP (elf_hwcap) +#define ELF_HWCAP cpu_get_elf_hwcap() +#define ELF_HWCAP2 cpu_get_elf_hwcap2() #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP (compat_elf_hwcap) @@ -60,6 +117,5 @@ enum { #endif }; -extern unsigned long elf_hwcap; #endif #endif diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 8bb7210ac286..b807cb9b517d 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -124,8 +124,6 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) #define __io_par(v) __iormb(v) #define __iowmb() wmb() -#define mmiowb() do { } while (0) - /* * Relaxed I/O memory access primitives. 
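The mmiowb() stub deleted just above can go because this series switches arm64 to the generic mmiowb.h added to the Kbuild list earlier: arm64's I/O write accessors are already ordered strongly enough, and architectures that do need the barrier now have it issued by spin_unlock() itself. A sketch of the driver pattern this simplifies, where foo_dev and the register offset are made-up names:

    #include <linux/io.h>
    #include <linux/spinlock.h>
    #include <linux/types.h>

    struct foo_dev {                    /* hypothetical device context */
            void __iomem *regs;
            spinlock_t lock;
    };

    static void foo_ring_doorbell(struct foo_dev *d, u32 val)
    {
            spin_lock(&d->lock);
            writel(val, d->regs + 0x40);
            /* An explicit mmiowb() was historically needed here on some
             * architectures so the MMIO write could not pass the unlock;
             * with generic mmiowb tracking, spin_unlock() handles it. */
            spin_unlock(&d->lock);
    }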
These follow the Device memory * ordering rules but do not guarantee any ordering relative to Normal memory diff --git a/arch/arm64/include/asm/irqflags.h b/arch/arm64/include/asm/irqflags.h index 43d8366c1e87..629963189085 100644 --- a/arch/arm64/include/asm/irqflags.h +++ b/arch/arm64/include/asm/irqflags.h @@ -43,7 +43,7 @@ static inline void arch_local_irq_enable(void) asm volatile(ALTERNATIVE( "msr daifclr, #2 // arch_local_irq_enable\n" "nop", - "msr_s " __stringify(SYS_ICC_PMR_EL1) ",%0\n" + __msr_s(SYS_ICC_PMR_EL1, "%0") "dsb sy", ARM64_HAS_IRQ_PRIO_MASKING) : @@ -55,7 +55,7 @@ static inline void arch_local_irq_disable(void) { asm volatile(ALTERNATIVE( "msr daifset, #2 // arch_local_irq_disable", - "msr_s " __stringify(SYS_ICC_PMR_EL1) ", %0", + __msr_s(SYS_ICC_PMR_EL1, "%0"), ARM64_HAS_IRQ_PRIO_MASKING) : : "r" ((unsigned long) GIC_PRIO_IRQOFF) @@ -86,7 +86,7 @@ static inline unsigned long arch_local_save_flags(void) "mov %0, %1\n" "nop\n" "nop", - "mrs_s %0, " __stringify(SYS_ICC_PMR_EL1) "\n" + __mrs_s("%0", SYS_ICC_PMR_EL1) "ands %1, %1, " __stringify(PSR_I_BIT) "\n" "csel %0, %0, %2, eq", ARM64_HAS_IRQ_PRIO_MASKING) @@ -116,7 +116,7 @@ static inline void arch_local_irq_restore(unsigned long flags) asm volatile(ALTERNATIVE( "msr daif, %0\n" "nop", - "msr_s " __stringify(SYS_ICC_PMR_EL1) ", %0\n" + __msr_s(SYS_ICC_PMR_EL1, "%0") "dsb sy", ARM64_HAS_IRQ_PRIO_MASKING) : "+r" (flags) diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index d5a44cf859e9..21721fbf44e7 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -54,8 +54,6 @@ void arch_remove_kprobe(struct kprobe *); int kprobe_fault_handler(struct pt_regs *regs, unsigned int fsr); int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); -int kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr); -int kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr); void kretprobe_trampoline(void); void __kprobes *trampoline_probe_handler(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 4da765f2cca5..c3060833b7a5 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -30,7 +30,7 @@ ({ \ u64 reg; \ asm volatile(ALTERNATIVE("mrs %0, " __stringify(r##nvh),\ - "mrs_s %0, " __stringify(r##vh),\ + __mrs_s("%0", r##vh), \ ARM64_HAS_VIRT_HOST_EXTN) \ : "=r" (reg)); \ reg; \ @@ -40,7 +40,7 @@ do { \ u64 __val = (u64)(v); \ asm volatile(ALTERNATIVE("msr " __stringify(r##nvh) ", %x0",\ - "msr_s " __stringify(r##vh) ", %x0",\ + __msr_s(r##vh, "%x0"), \ ARM64_HAS_VIRT_HOST_EXTN) \ : : "rZ" (__val)); \ } while (0) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 290195168bb3..2cb8248fa2c8 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -302,7 +302,7 @@ static inline void *phys_to_virt(phys_addr_t x) */ #define ARCH_PFN_OFFSET ((unsigned long)PHYS_PFN_OFFSET) -#ifndef CONFIG_SPARSEMEM_VMEMMAP +#if !defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_DEBUG_VIRTUAL) #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define _virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT) #else diff --git a/arch/arm64/include/asm/module.h b/arch/arm64/include/asm/module.h index 905e1bb0e7bd..cd9f4e9d04d3 100644 --- a/arch/arm64/include/asm/module.h +++ b/arch/arm64/include/asm/module.h @@ -73,4 +73,9 @@ static inline bool is_forbidden_offset_for_adrp(void 
*place) struct plt_entry get_plt_entry(u64 dst, void *pc); bool plt_entries_equal(const struct plt_entry *a, const struct plt_entry *b); +static inline bool plt_entry_is_initialized(const struct plt_entry *e) +{ + return e->adrp || e->add || e->br; +} + #endif /* __ASM_MODULE_H */ diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index 52fa47c73bf0..dabba4b2c61f 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -33,12 +33,22 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)__get_free_page(PGALLOC_GFP); + struct page *page; + + page = alloc_page(PGALLOC_GFP); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_page(page); + return NULL; + } + return page_address(page); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp) { BUG_ON((unsigned long)pmdp & (PAGE_SIZE-1)); + pgtable_pmd_page_dtor(virt_to_page(pmdp)); free_page((unsigned long)pmdp); } diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index de70c1eabf33..2c41b04708fe 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -478,6 +478,8 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) return __pmd_to_phys(pmd); } +static inline void pte_unmap(pte_t *pte) { } + /* Find an entry in the third-level page table. */ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) @@ -485,9 +487,6 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) -#define pte_unmap(pte) do { } while (0) -#define pte_unmap_nested(pte) do { } while (0) #define pte_set_fixmap(addr) ((pte_t *)set_fixmap_offset(FIX_PTE, addr)) #define pte_set_fixmap_offset(pmd, addr) pte_set_fixmap(pte_offset_phys(pmd, addr)) diff --git a/arch/arm64/include/asm/pointer_auth.h b/arch/arm64/include/asm/pointer_auth.h index 15d49515efdd..d328540cb85e 100644 --- a/arch/arm64/include/asm/pointer_auth.h +++ b/arch/arm64/include/asm/pointer_auth.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __ASM_POINTER_AUTH_H #define __ASM_POINTER_AUTH_H diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 5d9ce62bdebd..fcd0e691b1ea 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -57,7 +57,15 @@ #define TASK_SIZE_64 (UL(1) << vabits_user) #ifdef CONFIG_COMPAT +#if defined(CONFIG_ARM64_64K_PAGES) && defined(CONFIG_KUSER_HELPERS) +/* + * With CONFIG_ARM64_64K_PAGES enabled, the last page is occupied + * by the compat vectors page. + */ #define TASK_SIZE_32 UL(0x100000000) +#else +#define TASK_SIZE_32 (UL(0x100000000) - PAGE_SIZE) +#endif /* CONFIG_ARM64_64K_PAGES */ #define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ TASK_SIZE_32 : TASK_SIZE_64) #define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? 
\ diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index ec60174c8c18..b2de32939ada 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -305,6 +305,28 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->regs[0]; } +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (starting from 0) + * + * regs_get_kernel_argument() returns the @n'th argument of the function call. + * + * Note that this chooses the most likely register mapping. In very rare + * cases this may not return correct data, for example, if one of the + * function parameters is 16 bytes or bigger. In such cases, we cannot + * access the parameter correctly and the register assignment of + * subsequent parameters will be shifted. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, + unsigned int n) +{ +#define NR_REG_ARGUMENTS 8 + if (n < NR_REG_ARGUMENTS) + return pt_regs_read_reg(regs, n); + return 0; +} + /* We must avoid circular header include via sched.h */ struct task_struct; int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); diff --git a/arch/arm64/include/asm/sdei.h b/arch/arm64/include/asm/sdei.h index ffe47d766c25..63e0b92a5fbb 100644 --- a/arch/arm64/include/asm/sdei.h +++ b/arch/arm64/include/asm/sdei.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2017 Arm Ltd. #ifndef __ASM_SDEI_H #define __ASM_SDEI_H diff --git a/arch/arm64/include/asm/signal32.h b/arch/arm64/include/asm/signal32.h index 81abea0b7650..58e288aaf0ba 100644 --- a/arch/arm64/include/asm/signal32.h +++ b/arch/arm64/include/asm/signal32.h @@ -20,8 +20,6 @@ #ifdef CONFIG_COMPAT #include <linux/compat.h> -#define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500 - int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h index 5412fa40825e..915809e4ac32 100644 --- a/arch/arm64/include/asm/stage2_pgtable.h +++ b/arch/arm64/include/asm/stage2_pgtable.h @@ -119,7 +119,7 @@ static inline pud_t *stage2_pud_offset(struct kvm *kvm, static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) { if (kvm_stage2_has_pud(kvm)) - pud_free(NULL, pud); + free_page((unsigned long)pud); } static inline bool stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) @@ -192,7 +192,7 @@ static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) { if (kvm_stage2_has_pmd(kvm)) - pmd_free(NULL, pmd); + free_page((unsigned long)pmd); } static inline bool stage2_pud_huge(struct kvm *kvm, pud_t pud) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 5b267dec6194..3f7b917e8f3a 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -606,6 +606,20 @@ #define ID_AA64PFR1_SSBS_PSTATE_ONLY 1 #define ID_AA64PFR1_SSBS_PSTATE_INSNS 2 +/* id_aa64zfr0 */ +#define ID_AA64ZFR0_SM4_SHIFT 40 +#define ID_AA64ZFR0_SHA3_SHIFT 32 +#define ID_AA64ZFR0_BITPERM_SHIFT 16 +#define ID_AA64ZFR0_AES_SHIFT 4 +#define ID_AA64ZFR0_SVEVER_SHIFT 0 + +#define ID_AA64ZFR0_SM4 0x1 +#define ID_AA64ZFR0_SHA3 0x1 +#define ID_AA64ZFR0_BITPERM 0x1 +#define ID_AA64ZFR0_AES 0x1 +#define ID_AA64ZFR0_AES_PMULL 0x2 +#define
ID_AA64ZFR0_SVEVER_SVE2 0x1 + /* id_aa64mmfr0 */ #define ID_AA64MMFR0_TGRAN4_SHIFT 28 #define ID_AA64MMFR0_TGRAN64_SHIFT 24 @@ -746,20 +760,39 @@ #include <linux/build_bug.h> #include <linux/types.h> -asm( -" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" -" .equ .L__reg_num_x\\num, \\num\n" -" .endr\n" +#define __DEFINE_MRS_MSR_S_REGNUM \ +" .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30\n" \ +" .equ .L__reg_num_x\\num, \\num\n" \ +" .endr\n" \ " .equ .L__reg_num_xzr, 31\n" -"\n" -" .macro mrs_s, rt, sreg\n" - __emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt)) + +#define DEFINE_MRS_S \ + __DEFINE_MRS_MSR_S_REGNUM \ +" .macro mrs_s, rt, sreg\n" \ + __emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt)) \ " .endm\n" -"\n" -" .macro msr_s, sreg, rt\n" - __emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt)) + +#define DEFINE_MSR_S \ + __DEFINE_MRS_MSR_S_REGNUM \ +" .macro msr_s, sreg, rt\n" \ + __emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt)) \ " .endm\n" -); + +#define UNDEFINE_MRS_S \ +" .purgem mrs_s\n" + +#define UNDEFINE_MSR_S \ +" .purgem msr_s\n" + +#define __mrs_s(v, r) \ + DEFINE_MRS_S \ +" mrs_s " v ", " __stringify(r) "\n" \ + UNDEFINE_MRS_S + +#define __msr_s(r, v) \ + DEFINE_MSR_S \ +" msr_s " __stringify(r) ", " v "\n" \ + UNDEFINE_MSR_S /* * Unlike read_cpuid, calls to read_sysreg are never expected to be @@ -787,13 +820,13 @@ asm( */ #define read_sysreg_s(r) ({ \ u64 __val; \ - asm volatile("mrs_s %0, " __stringify(r) : "=r" (__val)); \ + asm volatile(__mrs_s("%0", r) : "=r" (__val)); \ __val; \ }) #define write_sysreg_s(v, r) do { \ u64 __val = (u64)(v); \ - asm volatile("msr_s " __stringify(r) ", %x0" : : "rZ" (__val)); \ + asm volatile(__msr_s(r, "%x0") : : "rZ" (__val)); \ } while (0) /* diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index 32693f34f431..fca95424e873 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -41,7 +41,6 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int, int sig, int code, const char *name); struct mm_struct; -extern void show_pte(unsigned long addr); extern void __show_regs(struct pt_regs *); extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd); diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index 106fdc951b6e..a287189ca8b4 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -27,6 +27,7 @@ static inline void __tlb_remove_table(void *_table) free_page_and_swap_cache((struct page *)_table); } +#define tlb_flush tlb_flush static void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> @@ -62,7 +63,10 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr) { - tlb_remove_table(tlb, virt_to_page(pmdp)); + struct page *page = virt_to_page(pmdp); + + pgtable_pmd_page_dtor(page); + tlb_remove_table(tlb, page); } #endif diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index d1dd93436e1e..f2a83ff6b73c 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -44,7 +44,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 424 +#define __NR_compat_syscalls 428 #endif #define __ARCH_WANT_SYS_CLONE diff --git 
a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 5590f2623690..23f1a44acada 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -866,6 +866,14 @@ __SYSCALL(__NR_rt_sigtimedwait_time64, compat_sys_rt_sigtimedwait_time64) __SYSCALL(__NR_futex_time64, sys_futex) #define __NR_sched_rr_get_interval_time64 423 __SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) +#define __NR_pidfd_send_signal 424 +__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) +#define __NR_io_uring_register 427 +__SYSCALL(__NR_io_uring_register, sys_io_uring_register) /* * Please add new compat syscalls above this comment and update diff --git a/arch/arm64/include/asm/vdso_datapage.h b/arch/arm64/include/asm/vdso_datapage.h index 2b9a63771eda..f89263c8e11a 100644 --- a/arch/arm64/include/asm/vdso_datapage.h +++ b/arch/arm64/include/asm/vdso_datapage.h @@ -38,6 +38,7 @@ struct vdso_data { __u32 tz_minuteswest; /* Whacky timezone stuff */ __u32 tz_dsttime; __u32 use_syscall; + __u32 hrtimer_res; }; #endif /* !__ASSEMBLY__ */ diff --git a/arch/arm64/include/asm/vmap_stack.h b/arch/arm64/include/asm/vmap_stack.h index 0b5ec6e08c10..0a12115d9638 100644 --- a/arch/arm64/include/asm/vmap_stack.h +++ b/arch/arm64/include/asm/vmap_stack.h @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0 +/* SPDX-License-Identifier: GPL-2.0 */ // Copyright (C) 2017 Arm Ltd. #ifndef __ASM_VMAP_STACK_H #define __ASM_VMAP_STACK_H diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h index 5f0750c2199c..1a772b162191 100644 --- a/arch/arm64/include/uapi/asm/hwcap.h +++ b/arch/arm64/include/uapi/asm/hwcap.h @@ -18,7 +18,7 @@ #define _UAPI__ASM_HWCAP_H /* - * HWCAP flags - for elf_hwcap (in kernel) and AT_HWCAP + * HWCAP flags - for AT_HWCAP */ #define HWCAP_FP (1 << 0) #define HWCAP_ASIMD (1 << 1) @@ -53,4 +53,15 @@ #define HWCAP_PACA (1 << 30) #define HWCAP_PACG (1UL << 31) +/* + * HWCAP2 flags - for AT_HWCAP2 + */ +#define HWCAP2_DCPODP (1 << 0) +#define HWCAP2_SVE2 (1 << 1) +#define HWCAP2_SVEAES (1 << 2) +#define HWCAP2_SVEPMULL (1 << 3) +#define HWCAP2_SVEBITPERM (1 << 4) +#define HWCAP2_SVESHA3 (1 << 5) +#define HWCAP2_SVESM4 (1 << 6) + #endif /* _UAPI__ASM_HWCAP_H */ diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index cd434d0719c1..9e7dcb2c31c7 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -7,9 +7,9 @@ CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) CFLAGS_armv8_deprecated.o := -I$(src) -CFLAGS_REMOVE_ftrace.o = -pg -CFLAGS_REMOVE_insn.o = -pg -CFLAGS_REMOVE_return_address.o = -pg +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_insn.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) # Object file lists. 
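The HWCAP2 bits introduced above reach userspace through the AT_HWCAP2 auxv entry, so a feature probe is a single getauxval() call. A minimal userspace sketch (assumes a libc that exposes AT_HWCAP2; the fallback define mirrors the UAPI value added above):

	#include <stdio.h>
	#include <sys/auxv.h>

	#ifndef HWCAP2_SVE2
	#define HWCAP2_SVE2	(1 << 1)	/* from the new uapi/asm/hwcap.h bits */
	#endif

	int main(void)
	{
		unsigned long hwcap2 = getauxval(AT_HWCAP2);

		printf("SVE2: %s\n", (hwcap2 & HWCAP2_SVE2) ? "yes" : "no");
		return 0;
	}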
obj-y := debug-monitors.o entry.o irq.o fpsimd.o \ @@ -27,8 +27,9 @@ OBJCOPYFLAGS := --prefix-symbols=__efistub_ $(obj)/%.stub.o: $(obj)/%.o FORCE $(call if_changed,objcopy) -obj-$(CONFIG_COMPAT) += sys32.o kuser32.o signal32.o \ - sys_compat.o +obj-$(CONFIG_COMPAT) += sys32.o signal32.o \ + sigreturn32.o sys_compat.o +obj-$(CONFIG_KUSER_HELPERS) += kuser32.o obj-$(CONFIG_FUNCTION_TRACER) += ftrace.o entry-ftrace.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_ARM64_MODULE_PLTS) += module-plts.o diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7f40dcbdd51d..e10e2a5d9ddc 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -94,7 +94,7 @@ int main(void) DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); DEFINE(CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC_RAW); - DEFINE(CLOCK_REALTIME_RES, MONOTONIC_RES_NSEC); + DEFINE(CLOCK_REALTIME_RES, offsetof(struct vdso_data, hrtimer_res)); DEFINE(CLOCK_REALTIME_COARSE, CLOCK_REALTIME_COARSE); DEFINE(CLOCK_MONOTONIC_COARSE,CLOCK_MONOTONIC_COARSE); DEFINE(CLOCK_COARSE_RES, LOW_RES_NSEC); diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 9950bb0cbd52..e88d4e7bdfc7 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -19,6 +19,7 @@ #include <linux/arm-smccc.h> #include <linux/psci.h> #include <linux/types.h> +#include <linux/cpu.h> #include <asm/cpu.h> #include <asm/cputype.h> #include <asm/cpufeature.h> @@ -109,7 +110,6 @@ cpu_enable_trap_ctr_access(const struct arm64_cpu_capabilities *__unused) atomic_t arm64_el2_vector_last_slot = ATOMIC_INIT(-1); -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR #include <asm/mmu_context.h> #include <asm/cacheflush.h> @@ -131,9 +131,9 @@ static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, __flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); } -static void __install_bp_hardening_cb(bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) +static void install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) { static DEFINE_RAW_SPINLOCK(bp_lock); int cpu, slot = -1; @@ -169,7 +169,7 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn, #define __smccc_workaround_1_smc_start NULL #define __smccc_workaround_1_smc_end NULL -static void __install_bp_hardening_cb(bp_hardening_cb_t fn, +static void install_bp_hardening_cb(bp_hardening_cb_t fn, const char *hyp_vecs_start, const char *hyp_vecs_end) { @@ -177,23 +177,6 @@ static void __install_bp_hardening_cb(bp_hardening_cb_t fn, } #endif /* CONFIG_KVM_INDIRECT_VECTORS */ -static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, - bp_hardening_cb_t fn, - const char *hyp_vecs_start, - const char *hyp_vecs_end) -{ - u64 pfr0; - - if (!entry->matches(entry, SCOPE_LOCAL_CPU)) - return; - - pfr0 = read_cpuid(ID_AA64PFR0_EL1); - if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) - return; - - __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); -} - #include <uapi/linux/psci.h> #include <linux/arm-smccc.h> #include <linux/psci.h> @@ -220,60 +203,83 @@ static void qcom_link_stack_sanitization(void) : "=&r" (tmp)); } -static void -enable_smccc_arch_workaround_1(const struct arm64_cpu_capabilities *entry) +static bool __nospectre_v2; +static int __init parse_nospectre_v2(char *str) +{ + __nospectre_v2 = true; + return 0; +} +early_param("nospectre_v2", parse_nospectre_v2); + +/* + * -1: No 
workaround + * 0: No workaround required + * 1: Workaround installed + */ +static int detect_harden_bp_fw(void) { bp_hardening_cb_t cb; void *smccc_start, *smccc_end; struct arm_smccc_res res; u32 midr = read_cpuid_id(); - if (!entry->matches(entry, SCOPE_LOCAL_CPU)) - return; - if (psci_ops.smccc_version == SMCCC_VERSION_1_0) - return; + return -1; switch (psci_ops.conduit) { case PSCI_CONDUIT_HVC: arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 < 0) - return; - cb = call_hvc_arch_workaround_1; - /* This is a guest, no need to patch KVM vectors */ - smccc_start = NULL; - smccc_end = NULL; + switch ((int)res.a0) { + case 1: + /* Firmware says we're just fine */ + return 0; + case 0: + cb = call_hvc_arch_workaround_1; + /* This is a guest, no need to patch KVM vectors */ + smccc_start = NULL; + smccc_end = NULL; + break; + default: + return -1; + } break; case PSCI_CONDUIT_SMC: arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, ARM_SMCCC_ARCH_WORKAROUND_1, &res); - if ((int)res.a0 < 0) - return; - cb = call_smc_arch_workaround_1; - smccc_start = __smccc_workaround_1_smc_start; - smccc_end = __smccc_workaround_1_smc_end; + switch ((int)res.a0) { + case 1: + /* Firmware says we're just fine */ + return 0; + case 0: + cb = call_smc_arch_workaround_1; + smccc_start = __smccc_workaround_1_smc_start; + smccc_end = __smccc_workaround_1_smc_end; + break; + default: + return -1; + } break; default: - return; + return -1; } if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) || ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) cb = qcom_link_stack_sanitization; - install_bp_hardening_cb(entry, cb, smccc_start, smccc_end); + if (IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) + install_bp_hardening_cb(cb, smccc_start, smccc_end); - return; + return 1; } -#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ -#ifdef CONFIG_ARM64_SSBD DEFINE_PER_CPU_READ_MOSTLY(u64, arm64_ssbd_callback_required); int ssbd_state __read_mostly = ARM64_SSBD_KERNEL; +static bool __ssb_safe = true; static const struct ssbd_options { const char *str; @@ -343,6 +349,11 @@ void __init arm64_enable_wa2_handling(struct alt_instr *alt, void arm64_set_ssbd_mitigation(bool state) { + if (!IS_ENABLED(CONFIG_ARM64_SSBD)) { + pr_info_once("SSBD disabled by kernel configuration\n"); + return; + } + if (this_cpu_has_cap(ARM64_SSBS)) { if (state) asm volatile(SET_PSTATE_SSBS(0)); @@ -372,16 +383,28 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, struct arm_smccc_res res; bool required = true; s32 val; + bool this_cpu_safe = false; WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + if (cpu_mitigations_off()) + ssbd_state = ARM64_SSBD_FORCE_DISABLE; + + /* delay setting __ssb_safe until we get a firmware response */ + if (is_midr_in_range_list(read_cpuid_id(), entry->midr_range_list)) + this_cpu_safe = true; + if (this_cpu_has_cap(ARM64_SSBS)) { + if (!this_cpu_safe) + __ssb_safe = false; required = false; goto out_printmsg; } if (psci_ops.smccc_version == SMCCC_VERSION_1_0) { ssbd_state = ARM64_SSBD_UNKNOWN; + if (!this_cpu_safe) + __ssb_safe = false; return false; } @@ -398,6 +421,8 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, default: ssbd_state = ARM64_SSBD_UNKNOWN; + if (!this_cpu_safe) + __ssb_safe = false; return false; } @@ -406,14 +431,18 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, switch (val) { case SMCCC_RET_NOT_SUPPORTED: ssbd_state = ARM64_SSBD_UNKNOWN; + if (!this_cpu_safe) 
+ __ssb_safe = false; return false; + /* machines with mixed mitigation requirements must not return this */ case SMCCC_RET_NOT_REQUIRED: pr_info_once("%s mitigation not required\n", entry->desc); ssbd_state = ARM64_SSBD_MITIGATED; return false; case SMCCC_RET_SUCCESS: + __ssb_safe = false; required = true; break; @@ -423,6 +452,8 @@ static bool has_ssbd_mitigation(const struct arm64_cpu_capabilities *entry, default: WARN_ON(1); + if (!this_cpu_safe) + __ssb_safe = false; return false; } @@ -462,7 +493,14 @@ out_printmsg: return required; } -#endif /* CONFIG_ARM64_SSBD */ + +/* known invulnerable cores */ +static const struct midr_range arm64_ssb_cpus[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + {}, +}; static void __maybe_unused cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) @@ -507,26 +545,67 @@ cpu_enable_cache_maint_trap(const struct arm64_cpu_capabilities *__unused) .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, \ CAP_MIDR_RANGE_LIST(midr_list) -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +/* Track overall mitigation state. We are only mitigated if all cores are ok */ +static bool __hardenbp_enab = true; +static bool __spectrev2_safe = true; /* - * List of CPUs where we need to issue a psci call to - * harden the branch predictor. + * List of CPUs that do not need any Spectre-v2 mitigation at all. */ -static const struct midr_range arm64_bp_harden_smccc_cpus[] = { - MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), - MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), - MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), - MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), - MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), - MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR), - MIDR_ALL_VERSIONS(MIDR_NVIDIA_DENVER), - {}, +static const struct midr_range spectre_v2_safe_list[] = { + MIDR_ALL_VERSIONS(MIDR_CORTEX_A35), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A53), + MIDR_ALL_VERSIONS(MIDR_CORTEX_A55), + { /* sentinel */ } }; -#endif +/* + * Track overall bp hardening for all heterogeneous cores in the machine. + * We are only considered "safe" if all booted cores are known safe. 
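+ *
+ * Worked example (a hypothetical big.LITTLE pairing, not taken from
+ * this patch): a Cortex-A53 matches spectre_v2_safe_list below and
+ * bails out early without touching the globals, while a Cortex-A72
+ * companion core falls through to detect_harden_bp_fw(); only if that
+ * reports a usable workaround does __hardenbp_enab stay true for the
+ * machine as a whole.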
+ */ +static bool __maybe_unused +check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope) +{ + int need_wa; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + /* If the CPU has CSV2 set, we're safe */ + if (cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64PFR0_EL1), + ID_AA64PFR0_CSV2_SHIFT)) + return false; + + /* Alternatively, we have a list of unaffected CPUs */ + if (is_midr_in_range_list(read_cpuid_id(), spectre_v2_safe_list)) + return false; + + /* Fallback to firmware detection */ + need_wa = detect_harden_bp_fw(); + if (!need_wa) + return false; + + __spectrev2_safe = false; + + if (!IS_ENABLED(CONFIG_HARDEN_BRANCH_PREDICTOR)) { + pr_warn_once("spectrev2 mitigation disabled by kernel configuration\n"); + __hardenbp_enab = false; + return false; + } + + /* forced off */ + if (__nospectre_v2 || cpu_mitigations_off()) { + pr_info_once("spectrev2 mitigation disabled by command line option\n"); + __hardenbp_enab = false; + return false; + } + + if (need_wa < 0) { + pr_warn_once("ARM_SMCCC_ARCH_WORKAROUND_1 missing from firmware\n"); + __hardenbp_enab = false; + } + + return (need_wa > 0); +} #ifdef CONFIG_HARDEN_EL2_VECTORS @@ -603,6 +682,16 @@ static const struct midr_range workaround_clean_cache[] = { }; #endif +#ifdef CONFIG_ARM64_ERRATUM_1188873 +static const struct midr_range erratum_1188873_list[] = { + /* Cortex-A76 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), + /* Neoverse-N1 r0p0 to r2p0 */ + MIDR_RANGE(MIDR_NEOVERSE_N1, 0, 0, 2, 0), + {}, +}; +#endif + const struct arm64_cpu_capabilities arm64_errata[] = { #ifdef CONFIG_ARM64_WORKAROUND_CLEAN_CACHE { @@ -701,13 +790,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), }, #endif -#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR { .capability = ARM64_HARDEN_BRANCH_PREDICTOR, - .cpu_enable = enable_smccc_arch_workaround_1, - ERRATA_MIDR_RANGE_LIST(arm64_bp_harden_smccc_cpus), + .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, + .matches = check_branch_predictor, }, -#endif #ifdef CONFIG_HARDEN_EL2_VECTORS { .desc = "EL2 vector hardening", @@ -715,20 +802,18 @@ const struct arm64_cpu_capabilities arm64_errata[] = { ERRATA_MIDR_RANGE_LIST(arm64_harden_el2_vectors), }, #endif -#ifdef CONFIG_ARM64_SSBD { .desc = "Speculative Store Bypass Disable", .capability = ARM64_SSBD, .type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM, .matches = has_ssbd_mitigation, + .midr_range_list = arm64_ssb_cpus, }, -#endif #ifdef CONFIG_ARM64_ERRATUM_1188873 { - /* Cortex-A76 r0p0 to r2p0 */ .desc = "ARM erratum 1188873", .capability = ARM64_WORKAROUND_1188873, - ERRATA_MIDR_RANGE(MIDR_CORTEX_A76, 0, 0, 2, 0), + ERRATA_MIDR_RANGE_LIST(erratum_1188873_list), }, #endif #ifdef CONFIG_ARM64_ERRATUM_1165522 @@ -742,3 +827,38 @@ const struct arm64_cpu_capabilities arm64_errata[] = { { } }; + +ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, + char *buf) +{ + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (__spectrev2_safe) + return sprintf(buf, "Not affected\n"); + + if (__hardenbp_enab) + return sprintf(buf, "Mitigation: Branch predictor hardening\n"); + + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spec_store_bypass(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (__ssb_safe) + return sprintf(buf, "Not affected\n"); + + switch (ssbd_state) { + case ARM64_SSBD_KERNEL: + case 
ARM64_SSBD_FORCE_ENABLE: + if (IS_ENABLED(CONFIG_ARM64_SSBD)) + return sprintf(buf, + "Mitigation: Speculative Store Bypass disabled via prctl\n"); + } + + return sprintf(buf, "Vulnerable\n"); +} diff --git a/arch/arm64/kernel/cpu_ops.c b/arch/arm64/kernel/cpu_ops.c index ea001241bdd4..00f8b8612b69 100644 --- a/arch/arm64/kernel/cpu_ops.c +++ b/arch/arm64/kernel/cpu_ops.c @@ -85,6 +85,7 @@ static const char *__init cpu_read_enable_method(int cpu) pr_err("%pOF: missing enable-method property\n", dn); } + of_node_put(dn); } else { enable_method = acpi_get_enable_method(cpu); if (!enable_method) { diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 4061de10cea6..2b807f129e60 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -25,6 +25,7 @@ #include <linux/stop_machine.h> #include <linux/types.h> #include <linux/mm.h> +#include <linux/cpu.h> #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/cpu_ops.h> @@ -35,8 +36,8 @@ #include <asm/traps.h> #include <asm/virt.h> -unsigned long elf_hwcap __read_mostly; -EXPORT_SYMBOL_GPL(elf_hwcap); +/* Kernel representation of AT_HWCAP and AT_HWCAP2 */ +static unsigned long elf_hwcap __read_mostly; #ifdef CONFIG_COMPAT #define COMPAT_ELF_HWCAP_DEFAULT \ @@ -184,6 +185,15 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr1[] = { ARM64_FTR_END, }; +static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SM4_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SHA3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_BITPERM_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_AES_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ZFR0_SVEVER_SHIFT, 4, 0), + ARM64_FTR_END, +}; + static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = { S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN4_SHIFT, 4, ID_AA64MMFR0_TGRAN4_NI), S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR0_TGRAN64_SHIFT, 4, ID_AA64MMFR0_TGRAN64_NI), @@ -392,7 +402,7 @@ static const struct __ftr_reg_entry { /* Op1 = 0, CRn = 0, CRm = 4 */ ARM64_FTR_REG(SYS_ID_AA64PFR0_EL1, ftr_id_aa64pfr0), ARM64_FTR_REG(SYS_ID_AA64PFR1_EL1, ftr_id_aa64pfr1), - ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_raz), + ARM64_FTR_REG(SYS_ID_AA64ZFR0_EL1, ftr_id_aa64zfr0), /* Op1 = 0, CRn = 0, CRm = 5 */ ARM64_FTR_REG(SYS_ID_AA64DFR0_EL1, ftr_id_aa64dfr0), @@ -947,7 +957,7 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) return has_cpuid_feature(entry, scope); } -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static bool __meltdown_safe = true; static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, @@ -966,7 +976,17 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, MIDR_ALL_VERSIONS(MIDR_HISI_TSV110), { /* sentinel */ } }; - char const *str = "command line option"; + char const *str = "kpti command line option"; + bool meltdown_safe; + + meltdown_safe = is_midr_in_range_list(read_cpuid_id(), kpti_safe_list); + + /* Defer to CPU feature registers */ + if (has_cpuid_feature(entry, scope)) + meltdown_safe = true; + + if (!meltdown_safe) + __meltdown_safe = false; /* * For reasons that aren't entirely clear, enabling KPTI on Cavium @@ -978,6 +998,24 @@ static bool 
unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, __kpti_forced = -1; } + /* Useful for KASLR robustness */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_offset() > 0) { + if (!__kpti_forced) { + str = "KASLR"; + __kpti_forced = 1; + } + } + + if (cpu_mitigations_off() && !__kpti_forced) { + str = "mitigations=off"; + __kpti_forced = -1; + } + + if (!IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0)) { + pr_info_once("kernel page table isolation disabled by kernel configuration\n"); + return false; + } + /* Forced? */ if (__kpti_forced) { pr_info_once("kernel page table isolation forced %s by %s\n", @@ -985,18 +1023,10 @@ static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, return __kpti_forced > 0; } - /* Useful for KASLR robustness */ - if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) - return kaslr_offset() > 0; - - /* Don't force KPTI for CPUs that are not vulnerable */ - if (is_midr_in_range_list(read_cpuid_id(), kpti_safe_list)) - return false; - - /* Defer to CPU feature registers */ - return !has_cpuid_feature(entry, scope); + return !meltdown_safe; } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 static void kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) { @@ -1026,6 +1056,12 @@ kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) return; } +#else +static void +kpti_install_ng_mappings(const struct arm64_cpu_capabilities *__unused) +{ +} +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ static int __init parse_kpti(char *str) { @@ -1039,7 +1075,6 @@ static int __init parse_kpti(char *str) return 0; } early_param("kpti", parse_kpti); -#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ #ifdef CONFIG_ARM64_HW_AFDBM static inline void __cpu_enable_hw_dbm(void) @@ -1306,7 +1341,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64PFR0_EL0_SHIFT, .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, }, -#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 { .desc = "Kernel page table isolation (KPTI)", .capability = ARM64_UNMAP_KERNEL_AT_EL0, @@ -1322,7 +1356,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .matches = unmap_kernel_at_el0, .cpu_enable = kpti_install_ng_mappings, }, -#endif { /* FP/SIMD is not implemented */ .capability = ARM64_HAS_NO_FPSIMD, @@ -1340,6 +1373,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .field_pos = ID_AA64ISAR1_DPB_SHIFT, .min_field_value = 1, }, + { + .desc = "Data cache clean to Point of Deep Persistence", + .capability = ARM64_HAS_DCPODP, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .min_field_value = 2, + }, #endif #ifdef CONFIG_ARM64_SVE { @@ -1571,39 +1614,46 @@ static const struct arm64_cpu_capabilities ptr_auth_hwcap_gen_matches[] = { #endif static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_PMULL), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_AES), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA1), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA2), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_SHA512), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_CRC32), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, 
ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ATOMICS), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDRDM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SHA3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM), - HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FLAGM), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_DIT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, HWCAP_ILRCPC), - HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SB), - HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_USCAT), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_PMULL), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_AES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AES), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA1_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA1), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA2), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA2_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_SHA512), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_CRC32_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_CRC32), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_ATOMICS_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ATOMICS), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_RDM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDRDM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SHA3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SHA3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM3), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SM4), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_TS_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FLAGM), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, 
KERNEL_HWCAP_FP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FPHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, KERNEL_HWCAP_ASIMD), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_DIT_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DIT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_DCPOP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_DCPODP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_JSCVT), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_FCMA), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_LRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 2, CAP_HWCAP, KERNEL_HWCAP_ILRCPC), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_SB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_SB), + HWCAP_CAP(SYS_ID_AA64MMFR2_EL1, ID_AA64MMFR2_AT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_USCAT), #ifdef CONFIG_ARM64_SVE - HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, HWCAP_SVE), + HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, ID_AA64PFR0_SVE, CAP_HWCAP, KERNEL_HWCAP_SVE), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SVEVER_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SVEVER_SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES, CAP_HWCAP, KERNEL_HWCAP_SVEAES), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_AES_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_AES_PMULL, CAP_HWCAP, KERNEL_HWCAP_SVEPMULL), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_BITPERM_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_BITPERM, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SHA3_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SHA3, CAP_HWCAP, KERNEL_HWCAP_SVESHA3), + HWCAP_CAP(SYS_ID_AA64ZFR0_EL1, ID_AA64ZFR0_SM4_SHIFT, FTR_UNSIGNED, ID_AA64ZFR0_SM4, CAP_HWCAP, KERNEL_HWCAP_SVESM4), #endif - HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, HWCAP_SSBS), + HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_SSBS_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_SSBS_PSTATE_INSNS, CAP_HWCAP, KERNEL_HWCAP_SSBS), #ifdef CONFIG_ARM64_PTR_AUTH - HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, HWCAP_PACA), - HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, HWCAP_PACG), + HWCAP_MULTI_CAP(ptr_auth_hwcap_addr_matches, CAP_HWCAP, KERNEL_HWCAP_PACA), + HWCAP_MULTI_CAP(ptr_auth_hwcap_gen_matches, CAP_HWCAP, KERNEL_HWCAP_PACG), #endif {}, }; @@ -1623,7 +1673,7 @@ static void __init cap_set_elf_hwcap(const struct arm64_cpu_capabilities *cap) { switch (cap->hwcap_type) { case CAP_HWCAP: - elf_hwcap |= cap->hwcap; + cpu_set_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -1646,7 +1696,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) switch (cap->hwcap_type) { case CAP_HWCAP: - rc = (elf_hwcap & cap->hwcap) != 0; + rc = cpu_have_feature(cap->hwcap); break; #ifdef CONFIG_COMPAT case CAP_COMPAT_HWCAP: @@ -1667,7 +1717,7 @@ static bool cpus_have_elf_hwcap(const struct arm64_cpu_capabilities *cap) static void __init 
setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) { /* We support emulation of accesses to CPU ID feature registers */ - elf_hwcap |= HWCAP_CPUID; + cpu_set_named_feature(CPUID); for (; hwcaps->matches; hwcaps++) if (hwcaps->matches(hwcaps, cpucap_default_scope(hwcaps))) cap_set_elf_hwcap(hwcaps); @@ -1947,6 +1997,35 @@ bool this_cpu_has_cap(unsigned int n) return false; } +void cpu_set_feature(unsigned int num) +{ + WARN_ON(num >= MAX_CPU_FEATURES); + elf_hwcap |= BIT(num); +} +EXPORT_SYMBOL_GPL(cpu_set_feature); + +bool cpu_have_feature(unsigned int num) +{ + WARN_ON(num >= MAX_CPU_FEATURES); + return elf_hwcap & BIT(num); +} +EXPORT_SYMBOL_GPL(cpu_have_feature); + +unsigned long cpu_get_elf_hwcap(void) +{ + /* + * We currently only populate the first 32 bits of AT_HWCAP. Please + * note that for userspace compatibility we guarantee that bits 62 + * and 63 will always be returned as 0. + */ + return lower_32_bits(elf_hwcap); +} + +unsigned long cpu_get_elf_hwcap2(void) +{ + return upper_32_bits(elf_hwcap); +} + static void __init setup_system_capabilities(void) { /* @@ -2101,3 +2180,15 @@ static int __init enable_mrs_emulation(void) } core_initcall(enable_mrs_emulation); + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, + char *buf) +{ + if (__meltdown_safe) + return sprintf(buf, "Not affected\n"); + + if (arm64_kernel_unmapped_at_el0()) + return sprintf(buf, "Mitigation: PTI\n"); + + return sprintf(buf, "Vulnerable\n"); +} diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index ca0685f33900..f6f7936be6e7 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -85,6 +85,13 @@ static const char *const hwcap_str[] = { "sb", "paca", "pacg", + "dcpodp", + "sve2", + "sveaes", + "svepmull", + "svebitperm", + "svesha3", + "svesm4", NULL }; @@ -167,7 +174,7 @@ static int c_show(struct seq_file *m, void *v) #endif /* CONFIG_COMPAT */ } else { for (j = 0; hwcap_str[j]; j++) - if (elf_hwcap & (1 << j)) + if (cpu_have_feature(j)) seq_printf(m, " %s", hwcap_str[j]); } seq_puts(m, "\n"); diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index d7bb6aefae0a..555b6bd2f3d6 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -135,6 +135,7 @@ NOKPROBE_SYMBOL(disable_debug_monitors); */ static int clear_os_lock(unsigned int cpu) { + write_sysreg(0, osdlr_el1); write_sysreg(0, oslar_el1); isb(); return 0; @@ -163,25 +164,46 @@ static void clear_regs_spsr_ss(struct pt_regs *regs) } NOKPROBE_SYMBOL(clear_regs_spsr_ss); -/* EL1 Single Step Handler hooks */ -static LIST_HEAD(step_hook); -static DEFINE_SPINLOCK(step_hook_lock); +static DEFINE_SPINLOCK(debug_hook_lock); +static LIST_HEAD(user_step_hook); +static LIST_HEAD(kernel_step_hook); -void register_step_hook(struct step_hook *hook) +static void register_debug_hook(struct list_head *node, struct list_head *list) { - spin_lock(&step_hook_lock); - list_add_rcu(&hook->node, &step_hook); - spin_unlock(&step_hook_lock); + spin_lock(&debug_hook_lock); + list_add_rcu(node, list); + spin_unlock(&debug_hook_lock); + } -void unregister_step_hook(struct step_hook *hook) +static void unregister_debug_hook(struct list_head *node) { - spin_lock(&step_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&step_hook_lock); + spin_lock(&debug_hook_lock); + list_del_rcu(node); + spin_unlock(&debug_hook_lock); synchronize_rcu(); } +void register_user_step_hook(struct step_hook *hook) +{ + 
register_debug_hook(&hook->node, &user_step_hook); +} + +void unregister_user_step_hook(struct step_hook *hook) +{ + unregister_debug_hook(&hook->node); +} + +void register_kernel_step_hook(struct step_hook *hook) +{ + register_debug_hook(&hook->node, &kernel_step_hook); +} + +void unregister_kernel_step_hook(struct step_hook *hook) +{ + unregister_debug_hook(&hook->node); +} + /* * Call registered single step handlers * There is no Syndrome info to check for determining the handler. @@ -191,11 +213,14 @@ void unregister_step_hook(struct step_hook *hook) static int call_step_hook(struct pt_regs *regs, unsigned int esr) { struct step_hook *hook; + struct list_head *list; int retval = DBG_HOOK_ERROR; + list = user_mode(regs) ? &user_step_hook : &kernel_step_hook; + rcu_read_lock(); - list_for_each_entry_rcu(hook, &step_hook, node) { + list_for_each_entry_rcu(hook, list, node) { retval = hook->fn(regs, esr); if (retval == DBG_HOOK_HANDLED) break; @@ -222,7 +247,7 @@ static void send_user_sigtrap(int si_code) "User debug trap"); } -static int single_step_handler(unsigned long addr, unsigned int esr, +static int single_step_handler(unsigned long unused, unsigned int esr, struct pt_regs *regs) { bool handler_found = false; @@ -234,10 +259,6 @@ static int single_step_handler(unsigned long addr, unsigned int esr, if (!reinstall_suspended_bps(regs)) return 0; -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; -#endif if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) handler_found = true; @@ -264,61 +285,59 @@ static int single_step_handler(unsigned long addr, unsigned int esr, } NOKPROBE_SYMBOL(single_step_handler); -/* - * Breakpoint handler is re-entrant as another breakpoint can - * hit within breakpoint handler, especically in kprobes. - * Use reader/writer locks instead of plain spinlock. - */ -static LIST_HEAD(break_hook); -static DEFINE_SPINLOCK(break_hook_lock); +static LIST_HEAD(user_break_hook); +static LIST_HEAD(kernel_break_hook); -void register_break_hook(struct break_hook *hook) +void register_user_break_hook(struct break_hook *hook) { - spin_lock(&break_hook_lock); - list_add_rcu(&hook->node, &break_hook); - spin_unlock(&break_hook_lock); + register_debug_hook(&hook->node, &user_break_hook); } -void unregister_break_hook(struct break_hook *hook) +void unregister_user_break_hook(struct break_hook *hook) { - spin_lock(&break_hook_lock); - list_del_rcu(&hook->node); - spin_unlock(&break_hook_lock); - synchronize_rcu(); + unregister_debug_hook(&hook->node); +} + +void register_kernel_break_hook(struct break_hook *hook) +{ + register_debug_hook(&hook->node, &kernel_break_hook); +} + +void unregister_kernel_break_hook(struct break_hook *hook) +{ + unregister_debug_hook(&hook->node); } static int call_break_hook(struct pt_regs *regs, unsigned int esr) { struct break_hook *hook; + struct list_head *list; int (*fn)(struct pt_regs *regs, unsigned int esr) = NULL; + list = user_mode(regs) ? &user_break_hook : &kernel_break_hook; + rcu_read_lock(); - list_for_each_entry_rcu(hook, &break_hook, node) - if ((esr & hook->esr_mask) == hook->esr_val) + list_for_each_entry_rcu(hook, list, node) { + unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + + if ((comment & ~hook->mask) == hook->imm) fn = hook->fn; + } rcu_read_unlock(); return fn ? 
fn(regs, esr) : DBG_HOOK_ERROR; } NOKPROBE_SYMBOL(call_break_hook); -static int brk_handler(unsigned long addr, unsigned int esr, +static int brk_handler(unsigned long unused, unsigned int esr, struct pt_regs *regs) { - bool handler_found = false; - -#ifdef CONFIG_KPROBES - if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { - if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; - } -#endif - if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED) - handler_found = true; + if (call_break_hook(regs, esr) == DBG_HOOK_HANDLED) + return 0; - if (!handler_found && user_mode(regs)) { + if (user_mode(regs)) { send_user_sigtrap(TRAP_BRKPT); - } else if (!handler_found) { + } else { pr_warn("Unexpected kernel BRK exception at EL1\n"); return -EFAULT; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index c50a7a75f2e0..1a7811b7e3c4 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -336,6 +336,21 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif 3: +#ifdef CONFIG_ARM64_ERRATUM_1188873 +alternative_if_not ARM64_WORKAROUND_1188873 + b 4f +alternative_else_nop_endif + /* + * if (x22.mode32 == cntkctl_el1.el0vcten) + * cntkctl_el1.el0vcten = ~cntkctl_el1.el0vcten + */ + mrs x1, cntkctl_el1 + eon x0, x1, x22, lsr #3 + tbz x0, #1, 4f + eor x1, x1, #2 // ARCH_TIMER_USR_VCT_ACCESS_EN + msr cntkctl_el1, x1 +4: +#endif apply_ssbd 0, x0, x1 .endif @@ -362,11 +377,11 @@ alternative_else_nop_endif .if \el == 0 alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 - bne 4f + bne 5f msr far_el1, x30 tramp_alias x30, tramp_exit_native br x30 -4: +5: tramp_alias x30, tramp_exit_compat br x30 #endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 5ebe73b69961..735cf1f8b109 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1258,14 +1258,14 @@ static inline void fpsimd_hotplug_init(void) { } */ static int __init fpsimd_init(void) { - if (elf_hwcap & HWCAP_FP) { + if (cpu_have_named_feature(FP)) { fpsimd_pm_init(); fpsimd_hotplug_init(); } else { pr_notice("Floating-point is not implemented\n"); } - if (!(elf_hwcap & HWCAP_ASIMD)) + if (!cpu_have_named_feature(ASIMD)) pr_notice("Advanced SIMD is not implemented\n"); return sve_sysctl_init(); diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index 8e4431a8821f..65a51331088e 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -103,12 +103,16 @@ int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) * to be revisited if support for multiple ftrace entry points * is added in the future, but for now, the pr_err() below * deals with a theoretical issue only. + * + * Note that PLTs are place relative, and plt_entries_equal() + * checks whether they point to the same target. Here, we need + * to check if the actual opcodes are in fact identical, + * regardless of the offset in memory so use memcmp() instead. 
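+ *
+ * Illustration (values hypothetical): two PLT slots generated by
+ * get_plt_entry(addr, ...) at different module addresses encode
+ * different adrp/add immediates yet branch to the same target, so
+ * plt_entries_equal() would call them equal while memcmp() correctly
+ * reports that the installed opcodes differ from the expected ones.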
*/ trampoline = get_plt_entry(addr, mod->arch.ftrace_trampoline); - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &trampoline)) { - if (!plt_entries_equal(mod->arch.ftrace_trampoline, - &(struct plt_entry){})) { + if (memcmp(mod->arch.ftrace_trampoline, &trampoline, + sizeof(trampoline))) { + if (plt_entry_is_initialized(mod->arch.ftrace_trampoline)) { pr_err("ftrace: far branches to multiple entry points unsupported inside a single module\n"); return -EINVAL; } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index eecf7927dab0..fcae3f85c6cd 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -505,7 +505,7 @@ ENTRY(el2_setup) * kernel is intended to run at EL2. */ mrs x2, id_aa64mmfr1_el1 - ubfx x2, x2, #8, #4 + ubfx x2, x2, #ID_AA64MMFR1_VHE_SHIFT, #4 #else mov x2, xzr #endif @@ -538,7 +538,7 @@ set_hcr: #ifdef CONFIG_ARM_GIC_V3 /* GICv3 system register access */ mrs x0, id_aa64pfr0_el1 - ubfx x0, x0, #24, #4 + ubfx x0, x0, #ID_AA64PFR0_GIC_SHIFT, #4 cbz x0, 3f mrs_s x0, SYS_ICC_SRE_EL2 @@ -564,8 +564,8 @@ set_hcr: #endif /* EL2 debug */ - mrs x1, id_aa64dfr0_el1 // Check ID_AA64DFR0_EL1 PMUVer - sbfx x0, x1, #8, #4 + mrs x1, id_aa64dfr0_el1 + sbfx x0, x1, #ID_AA64DFR0_PMUVER_SHIFT, #4 cmp x0, #1 b.lt 4f // Skip if no PMU present mrs x0, pmcr_el0 // Disable debug access traps @@ -574,7 +574,7 @@ set_hcr: csel x3, xzr, x0, lt // all PMU counters from EL1 /* Statistical profiling */ - ubfx x0, x1, #32, #4 // Check ID_AA64DFR0_EL1 PMSVer + ubfx x0, x1, #ID_AA64DFR0_PMSVER_SHIFT, #4 cbz x0, 7f // Skip if SPE not present cbnz x2, 6f // VHE? mrs_s x4, SYS_PMBIDR_EL1 // If SPE available at EL2, @@ -684,7 +684,7 @@ ENTRY(__boot_cpu_mode) * with MMU turned off. */ ENTRY(__early_cpu_boot_status) - .long 0 + .quad 0 .popsection diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 691854b77c7f..30853d5b7859 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -244,9 +244,6 @@ int kgdb_arch_handle_exception(int exception_vector, int signo, static int kgdb_brk_fn(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - kgdb_handle_exception(1, SIGTRAP, 0, regs); return DBG_HOOK_HANDLED; } @@ -254,9 +251,6 @@ NOKPROBE_SYMBOL(kgdb_brk_fn) static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - compiled_break = 1; kgdb_handle_exception(1, SIGTRAP, 0, regs); @@ -266,7 +260,7 @@ NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs) || !kgdb_single_step) + if (!kgdb_single_step) return DBG_HOOK_ERROR; kgdb_handle_exception(1, SIGTRAP, 0, regs); @@ -275,15 +269,13 @@ static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) NOKPROBE_SYMBOL(kgdb_step_brk_fn); static struct break_hook kgdb_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_DYN_DBG_BRK_IMM), - .fn = kgdb_brk_fn + .fn = kgdb_brk_fn, + .imm = KGDB_DYN_DBG_BRK_IMM, }; static struct break_hook kgdb_compiled_brkpt_hook = { - .esr_mask = 0xffffffff, - .esr_val = (u32)ESR_ELx_VAL_BRK64(KGDB_COMPILED_DBG_BRK_IMM), - .fn = kgdb_compiled_brk_fn + .fn = kgdb_compiled_brk_fn, + .imm = KGDB_COMPILED_DBG_BRK_IMM, }; static struct step_hook kgdb_step_hook = { @@ -332,9 +324,9 @@ int kgdb_arch_init(void) if (ret != 0) return ret; - register_break_hook(&kgdb_brkpt_hook); - register_break_hook(&kgdb_compiled_brkpt_hook); - register_step_hook(&kgdb_step_hook); + 
register_kernel_break_hook(&kgdb_brkpt_hook); + register_kernel_break_hook(&kgdb_compiled_brkpt_hook); + register_kernel_step_hook(&kgdb_step_hook); return 0; } @@ -345,9 +337,9 @@ int kgdb_arch_init(void) */ void kgdb_arch_exit(void) { - unregister_break_hook(&kgdb_brkpt_hook); - unregister_break_hook(&kgdb_compiled_brkpt_hook); - unregister_step_hook(&kgdb_step_hook); + unregister_kernel_break_hook(&kgdb_brkpt_hook); + unregister_kernel_break_hook(&kgdb_compiled_brkpt_hook); + unregister_kernel_step_hook(&kgdb_step_hook); unregister_die_notifier(&kgdb_notifier); } diff --git a/arch/arm64/kernel/kuser32.S b/arch/arm64/kernel/kuser32.S index 997e6b27ff6a..49825e9e421e 100644 --- a/arch/arm64/kernel/kuser32.S +++ b/arch/arm64/kernel/kuser32.S @@ -1,29 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* - * Low-level user helpers placed in the vectors page for AArch32. + * AArch32 user helpers. * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. * * Copyright (C) 2005-2011 Nicolas Pitre <nico@fluxnic.net> - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * Copyright (C) 2012-2018 ARM Ltd. * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - * - * - * AArch32 user helpers. - * - * Each segment is 32-byte aligned and will be moved to the top of the high - * vector page. New segments (if ever needed) must be added in front of - * existing ones. This mechanism should be used only for things that are - * really small and justified, and not be abused freely. + * The kuser helpers below are mapped at a fixed address by + * aarch32_setup_additional_pages() and are provided for compatibility + * reasons with 32 bit (aarch32) applications that need them. * * See Documentation/arm/kernel_user_helpers.txt for formal definitions. */ @@ -77,42 +62,3 @@ __kuser_helper_version: // 0xffff0ffc .word ((__kuser_helper_end - __kuser_helper_start) >> 5) .globl __kuser_helper_end __kuser_helper_end: - -/* - * AArch32 sigreturn code - * - * For ARM syscalls, the syscall number has to be loaded into r7. - * We do not support an OABI userspace. - * - * For Thumb syscalls, we also pass the syscall number via r7. We therefore - * need two 16-bit instructions. 
- */ - .globl __aarch32_sigret_code_start -__aarch32_sigret_code_start: - - /* - * ARM Code - */ - .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn - .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn - - /* - * ARM code - */ - .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn - - /* - * Thumb code - */ - .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn - .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn - - .globl __aarch32_sigret_code_end -__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 4addb38bc250..6164d389eed6 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -431,7 +431,7 @@ static inline u64 armv8pmu_read_hw_counter(struct perf_event *event) return val; } -static inline u64 armv8pmu_read_counter(struct perf_event *event) +static u64 armv8pmu_read_counter(struct perf_event *event) { struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; @@ -468,7 +468,7 @@ static inline void armv8pmu_write_hw_counter(struct perf_event *event, } } -static inline void armv8pmu_write_counter(struct perf_event *event, u64 value) +static void armv8pmu_write_counter(struct perf_event *event, u64 value) { struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu); struct hw_perf_event *hwc = &event->hw; diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 7a679caf4585..2509fcb6d404 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -439,15 +439,12 @@ kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) return DBG_HOOK_ERROR; } -int __kprobes +static int __kprobes kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); int retval; - if (user_mode(regs)) - return DBG_HOOK_ERROR; - /* return error if this is not our step */ retval = kprobe_ss_hit(kcb, instruction_pointer(regs)); @@ -461,16 +458,22 @@ kprobe_single_step_handler(struct pt_regs *regs, unsigned int esr) return retval; } -int __kprobes +static struct step_hook kprobes_step_hook = { + .fn = kprobe_single_step_handler, +}; + +static int __kprobes kprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - kprobe_handler(regs); return DBG_HOOK_HANDLED; } +static struct break_hook kprobes_break_hook = { + .imm = KPROBES_BRK_IMM, + .fn = kprobe_breakpoint_handler, +}; + /* * Provide a blacklist of symbols identifying ranges which cannot be kprobed. * This blacklist is exposed to userspace via debugfs (kprobes/blacklist). 
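With kprobes routed through the generic kernel break hooks above, dispatch reduces to matching the BRK immediate recovered from the ESR. A condensed restatement of the rule applied by call_break_hook() earlier in this patch (a sketch only; the real code walks an RCU-protected hook list):

	static bool break_hook_matches(const struct break_hook *hook,
				       unsigned int esr)
	{
		unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK;

		/* hooks with a zero mask must match the immediate exactly */
		return (comment & ~hook->mask) == hook->imm;
	}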
@@ -599,5 +602,8 @@ int __kprobes arch_trampoline_kprobe(struct kprobe *p) int __init arch_init_kprobes(void) { + register_kernel_break_hook(&kprobes_break_hook); + register_kernel_step_hook(&kprobes_step_hook); + return 0; } diff --git a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c index 636ca0119c0e..605945eac1f8 100644 --- a/arch/arm64/kernel/probes/uprobes.c +++ b/arch/arm64/kernel/probes/uprobes.c @@ -171,7 +171,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, static int uprobe_breakpoint_handler(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs) && uprobe_pre_sstep_notifier(regs)) + if (uprobe_pre_sstep_notifier(regs)) return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; @@ -182,21 +182,16 @@ static int uprobe_single_step_handler(struct pt_regs *regs, { struct uprobe_task *utask = current->utask; - if (user_mode(regs)) { - WARN_ON(utask && - (instruction_pointer(regs) != utask->xol_vaddr + 4)); - - if (uprobe_post_sstep_notifier(regs)) - return DBG_HOOK_HANDLED; - } + WARN_ON(utask && (instruction_pointer(regs) != utask->xol_vaddr + 4)); + if (uprobe_post_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; return DBG_HOOK_ERROR; } /* uprobe breakpoint handler hook */ static struct break_hook uprobes_break_hook = { - .esr_mask = BRK64_ESR_MASK, - .esr_val = BRK64_ESR_UPROBES, + .imm = UPROBES_BRK_IMM, .fn = uprobe_breakpoint_handler, }; @@ -207,8 +202,8 @@ static struct step_hook uprobes_step_hook = { static int __init arch_init_uprobes(void) { - register_break_hook(&uprobes_break_hook); - register_step_hook(&uprobes_step_hook); + register_user_break_hook(&uprobes_break_hook); + register_user_step_hook(&uprobes_step_hook); return 0; } diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index cb7800acd19f..caea6e25db2a 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -403,8 +403,7 @@ static void compat_setup_return(struct pt_regs *regs, struct k_sigaction *ka, if (ka->sa.sa_flags & SA_SIGINFO) idx += 3; - retcode = AARCH32_VECTORS_BASE + - AARCH32_KERN_SIGRET_CODE_OFFSET + + retcode = (unsigned long)current->mm->context.vdso + (idx << 2) + thumb; } diff --git a/arch/arm64/kernel/sigreturn32.S b/arch/arm64/kernel/sigreturn32.S new file mode 100644 index 000000000000..475d30d471ac --- /dev/null +++ b/arch/arm64/kernel/sigreturn32.S @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * AArch32 sigreturn code. + * Based on the kuser helpers in arch/arm/kernel/entry-armv.S. + * + * Copyright (C) 2005-2011 Nicolas Pitre <nico@fluxnic.net> + * Copyright (C) 2012-2018 ARM Ltd. + * + * For ARM syscalls, the syscall number has to be loaded into r7. + * We do not support an OABI userspace. + * + * For Thumb syscalls, we also pass the syscall number via r7. We therefore + * need two 16-bit instructions. 
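+ *
+ * Concretely, each Thumb sequence below is two 16-bit opcodes packed
+ * with .byte directives (immediate byte first, as the encodings are
+ * stored little-endian), while each ARM sequence is two 32-bit
+ * instructions packed the same way.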
+ */ + +#include <asm/unistd.h> + + .globl __aarch32_sigret_code_start +__aarch32_sigret_code_start: + + /* + * ARM Code + */ + .byte __NR_compat_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_sigreturn + .byte __NR_compat_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat_sigreturn, 0x27 // svc #__NR_compat_sigreturn + .byte __NR_compat_sigreturn, 0xdf // mov r7, #__NR_compat_sigreturn + + /* + * ARM code + */ + .byte __NR_compat_rt_sigreturn, 0x70, 0xa0, 0xe3 // mov r7, #__NR_compat_rt_sigreturn + .byte __NR_compat_rt_sigreturn, 0x00, 0x00, 0xef // svc #__NR_compat_rt_sigreturn + + /* + * Thumb code + */ + .byte __NR_compat_rt_sigreturn, 0x27 // svc #__NR_compat_rt_sigreturn + .byte __NR_compat_rt_sigreturn, 0xdf // mov r7, #__NR_compat_rt_sigreturn + + .globl __aarch32_sigret_code_end +__aarch32_sigret_code_end: diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index d908b5e9e949..b00ec7d483d1 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -140,8 +140,6 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) #endif walk_stackframe(current, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_regs); @@ -172,8 +170,6 @@ static noinline void __save_stack_trace(struct task_struct *tsk, #endif walk_stackframe(tsk, &frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; put_task_stack(tsk); } diff --git a/arch/arm64/kernel/sys.c b/arch/arm64/kernel/sys.c index b44065fb1616..6f91e8116514 100644 --- a/arch/arm64/kernel/sys.c +++ b/arch/arm64/kernel/sys.c @@ -31,7 +31,7 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, - unsigned long, fd, off_t, off) + unsigned long, fd, unsigned long, off) { if (offset_in_page(off) != 0) return -EINVAL; diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 8ad119c3f665..ade32046f3fe 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -102,10 +102,16 @@ static void dump_instr(const char *lvl, struct pt_regs *regs) void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - int skip; + int skip = 0; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); + if (regs) { + if (user_mode(regs)) + return; + skip = 1; + } + if (!tsk) tsk = current; @@ -126,7 +132,6 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) frame.graph = 0; #endif - skip = !!regs; printk("Call trace:\n"); do { /* skip until specified stack frame */ @@ -176,15 +181,13 @@ static int __die(const char *str, int err, struct pt_regs *regs) return ret; print_modules(); - __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), end_of_stack(tsk)); + show_regs(regs); - if (!user_mode(regs)) { - dump_backtrace(regs, tsk); + if (!user_mode(regs)) dump_instr(KERN_EMERG, regs); - } return ret; } @@ -459,6 +462,9 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVADP: /* DC CVADP */ + __user_cache_maint("sys 3, c7, c13, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP 
*/ __user_cache_maint("sys 3, c7, c12, 1", address, ret); break; @@ -493,7 +499,7 @@ static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) { int rt = ESR_ELx_SYS64_ISS_RT(esr); - pt_regs_write_reg(regs, rt, arch_counter_get_cntvct()); + pt_regs_write_reg(regs, rt, arch_timer_read_counter()); arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } @@ -665,7 +671,7 @@ static void compat_cntvct_read_handler(unsigned int esr, struct pt_regs *regs) { int rt = (esr & ESR_ELx_CP15_64_ISS_RT_MASK) >> ESR_ELx_CP15_64_ISS_RT_SHIFT; int rt2 = (esr & ESR_ELx_CP15_64_ISS_RT2_MASK) >> ESR_ELx_CP15_64_ISS_RT2_SHIFT; - u64 val = arch_counter_get_cntvct(); + u64 val = arch_timer_read_counter(); pt_regs_write_reg(regs, rt, lower_32_bits(val)); pt_regs_write_reg(regs, rt2, upper_32_bits(val)); @@ -947,9 +953,6 @@ int is_valid_bugaddr(unsigned long addr) static int bug_handler(struct pt_regs *regs, unsigned int esr) { - if (user_mode(regs)) - return DBG_HOOK_ERROR; - switch (report_bug(regs->pc, regs)) { case BUG_TRAP_TYPE_BUG: die("Oops - BUG", regs, 0); @@ -969,9 +972,8 @@ static int bug_handler(struct pt_regs *regs, unsigned int esr) } static struct break_hook bug_break_hook = { - .esr_val = 0xf2000000 | BUG_BRK_IMM, - .esr_mask = 0xffffffff, .fn = bug_handler, + .imm = BUG_BRK_IMM, }; #ifdef CONFIG_KASAN_SW_TAGS @@ -989,9 +991,6 @@ static int kasan_handler(struct pt_regs *regs, unsigned int esr) u64 addr = regs->regs[0]; u64 pc = regs->pc; - if (user_mode(regs)) - return DBG_HOOK_ERROR; - kasan_report(addr, size, write, pc); /* @@ -1016,13 +1015,10 @@ static int kasan_handler(struct pt_regs *regs, unsigned int esr) return DBG_HOOK_HANDLED; } -#define KASAN_ESR_VAL (0xf2000000 | KASAN_BRK_IMM) -#define KASAN_ESR_MASK 0xffffff00 - static struct break_hook kasan_break_hook = { - .esr_val = KASAN_ESR_VAL, - .esr_mask = KASAN_ESR_MASK, - .fn = kasan_handler, + .fn = kasan_handler, + .imm = KASAN_BRK_IMM, + .mask = KASAN_BRK_MASK, }; #endif @@ -1034,7 +1030,9 @@ int __init early_brk64(unsigned long addr, unsigned int esr, struct pt_regs *regs) { #ifdef CONFIG_KASAN_SW_TAGS - if ((esr & KASAN_ESR_MASK) == KASAN_ESR_VAL) + unsigned int comment = esr & ESR_ELx_BRK64_ISS_COMMENT_MASK; + + if ((comment & ~KASAN_BRK_MASK) == KASAN_BRK_IMM) return kasan_handler(regs, esr) != DBG_HOOK_HANDLED; #endif return bug_handler(regs, esr) != DBG_HOOK_HANDLED; @@ -1043,8 +1041,8 @@ int __init early_brk64(unsigned long addr, unsigned int esr, /* This registration must happen early, before debug_traps_init(). */ void __init trap_init(void) { - register_break_hook(&bug_break_hook); + register_kernel_break_hook(&bug_break_hook); #ifdef CONFIG_KASAN_SW_TAGS - register_break_hook(&kasan_break_hook); + register_kernel_break_hook(&kasan_break_hook); #endif } diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 2d419006ad43..8074cbd3a3a8 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -1,5 +1,5 @@ /* - * VDSO implementation for AArch64 and vector page setup for AArch32. + * VDSO implementations. * * Copyright (C) 2012 ARM Limited * @@ -53,61 +53,129 @@ struct vdso_data *vdso_data = &vdso_data_store.data; /* * Create and map the vectors page for AArch32 tasks. 
*/ -static struct page *vectors_page[1] __ro_after_init; +#define C_VECTORS 0 +#define C_SIGPAGE 1 +#define C_PAGES (C_SIGPAGE + 1) +static struct page *aarch32_vdso_pages[C_PAGES] __ro_after_init; +static const struct vm_special_mapping aarch32_vdso_spec[C_PAGES] = { + { + .name = "[vectors]", /* ABI */ + .pages = &aarch32_vdso_pages[C_VECTORS], + }, + { + .name = "[sigpage]", /* ABI */ + .pages = &aarch32_vdso_pages[C_SIGPAGE], + }, +}; -static int __init alloc_vectors_page(void) +static int aarch32_alloc_kuser_vdso_page(void) { extern char __kuser_helper_start[], __kuser_helper_end[]; - extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; - int kuser_sz = __kuser_helper_end - __kuser_helper_start; - int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; - unsigned long vpage; + unsigned long vdso_page; - vpage = get_zeroed_page(GFP_ATOMIC); + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; - if (!vpage) + vdso_page = get_zeroed_page(GFP_ATOMIC); + if (!vdso_page) return -ENOMEM; - /* kuser helpers */ - memcpy((void *)vpage + 0x1000 - kuser_sz, __kuser_helper_start, - kuser_sz); + memcpy((void *)(vdso_page + 0x1000 - kuser_sz), __kuser_helper_start, + kuser_sz); + aarch32_vdso_pages[C_VECTORS] = virt_to_page(vdso_page); + flush_dcache_page(aarch32_vdso_pages[C_VECTORS]); + return 0; +} - /* sigreturn code */ - memcpy((void *)vpage + AARCH32_KERN_SIGRET_CODE_OFFSET, - __aarch32_sigret_code_start, sigret_sz); +static int __init aarch32_alloc_vdso_pages(void) +{ + extern char __aarch32_sigret_code_start[], __aarch32_sigret_code_end[]; + int sigret_sz = __aarch32_sigret_code_end - __aarch32_sigret_code_start; + unsigned long sigpage; + int ret; - flush_icache_range(vpage, vpage + PAGE_SIZE); - vectors_page[0] = virt_to_page(vpage); + sigpage = get_zeroed_page(GFP_ATOMIC); + if (!sigpage) + return -ENOMEM; - return 0; + memcpy((void *)sigpage, __aarch32_sigret_code_start, sigret_sz); + aarch32_vdso_pages[C_SIGPAGE] = virt_to_page(sigpage); + flush_dcache_page(aarch32_vdso_pages[C_SIGPAGE]); + + ret = aarch32_alloc_kuser_vdso_page(); + if (ret) + free_page(sigpage); + + return ret; } -arch_initcall(alloc_vectors_page); +arch_initcall(aarch32_alloc_vdso_pages); -int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp) +static int aarch32_kuser_helpers_setup(struct mm_struct *mm) { - struct mm_struct *mm = current->mm; - unsigned long addr = AARCH32_VECTORS_BASE; - static const struct vm_special_mapping spec = { - .name = "[vectors]", - .pages = vectors_page, + void *ret; + + if (!IS_ENABLED(CONFIG_KUSER_HELPERS)) + return 0; + + /* + * Avoid VM_MAYWRITE for compatibility with arch/arm/, where it's + * not safe to CoW the page containing the CPU exception vectors. + */ + ret = _install_special_mapping(mm, AARCH32_VECTORS_BASE, PAGE_SIZE, + VM_READ | VM_EXEC | + VM_MAYREAD | VM_MAYEXEC, + &aarch32_vdso_spec[C_VECTORS]); - }; + return PTR_ERR_OR_ZERO(ret); +} + +static int aarch32_sigreturn_setup(struct mm_struct *mm) +{ + unsigned long addr; void *ret; - if (down_write_killable(&mm->mmap_sem)) - return -EINTR; - current->mm->context.vdso = (void *)addr; + addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0); + if (IS_ERR_VALUE(addr)) { + ret = ERR_PTR(addr); + goto out; + } - /* Map vectors page at the high address. */ + /* + * VM_MAYWRITE is required to allow gdb to Copy-on-Write and + * set breakpoints. 
+ */ ret = _install_special_mapping(mm, addr, PAGE_SIZE, - VM_READ|VM_EXEC|VM_MAYREAD|VM_MAYEXEC, - &spec); + VM_READ | VM_EXEC | VM_MAYREAD | + VM_MAYWRITE | VM_MAYEXEC, + &aarch32_vdso_spec[C_SIGPAGE]); + if (IS_ERR(ret)) + goto out; - up_write(&mm->mmap_sem); + mm->context.vdso = (void *)addr; +out: return PTR_ERR_OR_ZERO(ret); } + +int aarch32_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + int ret; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + ret = aarch32_kuser_helpers_setup(mm); + if (ret) + goto out; + + ret = aarch32_sigreturn_setup(mm); + +out: + up_write(&mm->mmap_sem); + return ret; +} #endif /* CONFIG_COMPAT */ static int vdso_mremap(const struct vm_special_mapping *sm, @@ -146,8 +214,6 @@ static int __init vdso_init(void) } vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; - pr_info("vdso: %ld pages (%ld code @ %p, %ld data @ %p)\n", - vdso_pages + 1, vdso_pages, vdso_start, 1L, vdso_data); /* Allocate the vDSO pagelist, plus a page for the data. */ vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), @@ -232,6 +298,9 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtm_clock_sec = tk->wall_to_monotonic.tv_sec; vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; + /* Read without the seqlock held by clock_getres() */ + WRITE_ONCE(vdso_data->hrtimer_res, hrtimer_resolution); + if (!use_syscall) { /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index b215c712d897..744b9dbaba03 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -12,17 +12,12 @@ obj-vdso := gettimeofday.o note.o sigreturn.o targets := $(obj-vdso) vdso.so vdso.so.dbg obj-vdso := $(addprefix $(obj)/, $(obj-vdso)) -ccflags-y := -shared -fno-common -fno-builtin -ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 \ + $(call ld-option, --hash-style=sysv) -n -T # Disable gcov profiling for VDSO code GCOV_PROFILE := n -# Workaround for bare-metal (ELF) toolchains that neglect to pass -shared -# down to collect2, resulting in silent corruption of the vDSO image. 
-ccflags-y += -Wl,-shared - obj-y += vdso.o extra-y += vdso.lds CPPFLAGS_vdso.lds += -P -C -U$(ARCH) @@ -31,8 +26,8 @@ CPPFLAGS_vdso.lds += -P -C -U$(ARCH) $(obj)/vdso.o : $(obj)/vdso.so # Link rule for the .so file, .lds has to be first -$(obj)/vdso.so.dbg: $(src)/vdso.lds $(obj-vdso) - $(call if_changed,vdsold) +$(obj)/vdso.so.dbg: $(obj)/vdso.lds $(obj-vdso) FORCE + $(call if_changed,ld) # Strip rule for the .so file $(obj)/%.so: OBJCOPYFLAGS := -S @@ -42,9 +37,7 @@ $(obj)/%.so: $(obj)/%.so.dbg FORCE # Generate VDSO offsets using helper script gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh quiet_cmd_vdsosym = VDSOSYM $@ -define cmd_vdsosym - $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ -endef + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ include/generated/vdso-offsets.h: $(obj)/vdso.so.dbg FORCE $(call if_changed,vdsosym) @@ -54,8 +47,6 @@ $(obj-vdso): %.o: %.S FORCE $(call if_changed_dep,vdsoas) # Actual build commands -quiet_cmd_vdsold = VDSOL $@ - cmd_vdsold = $(CC) $(c_flags) -Wl,-n -Wl,-T $^ -o $@ quiet_cmd_vdsoas = VDSOA $@ cmd_vdsoas = $(CC) $(a_flags) -c -o $@ $< diff --git a/arch/arm64/kernel/vdso/gettimeofday.S b/arch/arm64/kernel/vdso/gettimeofday.S index c39872a7b03c..856fee6d3512 100644 --- a/arch/arm64/kernel/vdso/gettimeofday.S +++ b/arch/arm64/kernel/vdso/gettimeofday.S @@ -73,6 +73,13 @@ x_tmp .req x8 movn x_tmp, #0xff00, lsl #48 and \res, x_tmp, \res mul \res, \res, \mult + /* + * Fake address dependency from the value computed from the counter + * register to subsequent data page accesses so that the sequence + * locking also orders the read of the counter. + */ + and x_tmp, \res, xzr + add vdso_data, vdso_data, x_tmp .endm /* @@ -147,12 +154,12 @@ ENTRY(__kernel_gettimeofday) /* w11 = cs_mono_mult, w12 = cs_shift */ ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=1b get_nsec_per_sec res=x9 lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=1b get_ts_realtime res_sec=x10, res_nsec=x11, \ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 @@ -211,13 +218,13 @@ realtime: /* w11 = cs_mono_mult, w12 = cs_shift */ ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] - seqcnt_check fail=realtime /* All computations are done with left-shifted nsecs. */ get_nsec_per_sec res=x9 lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=realtime get_ts_realtime res_sec=x10, res_nsec=x11, \ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 clock_gettime_return, shift=1 @@ -231,7 +238,6 @@ monotonic: ldp w11, w12, [vdso_data, #VDSO_CS_MONO_MULT] ldp x13, x14, [vdso_data, #VDSO_XTIME_CLK_SEC] ldp x3, x4, [vdso_data, #VDSO_WTM_CLK_SEC] - seqcnt_check fail=monotonic /* All computations are done with left-shifted nsecs. */ lsl x4, x4, x12 @@ -239,6 +245,7 @@ monotonic: lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=monotonic get_ts_realtime res_sec=x10, res_nsec=x11, \ clock_nsec=x15, xtime_sec=x13, xtime_nsec=x14, nsec_to_sec=x9 @@ -253,13 +260,13 @@ monotonic_raw: /* w11 = cs_raw_mult, w12 = cs_shift */ ldp w12, w11, [vdso_data, #VDSO_CS_SHIFT] ldp x13, x14, [vdso_data, #VDSO_RAW_TIME_SEC] - seqcnt_check fail=monotonic_raw /* All computations are done with left-shifted nsecs. 
*/ get_nsec_per_sec res=x9 lsl x9, x9, x12 get_clock_shifted_nsec res=x15, cycle_last=x10, mult=x11 + seqcnt_check fail=monotonic_raw get_ts_clock_raw res_sec=x10, res_nsec=x11, \ clock_nsec=x15, nsec_to_sec=x9 @@ -301,13 +308,14 @@ ENTRY(__kernel_clock_getres) ccmp w0, #CLOCK_MONOTONIC_RAW, #0x4, ne b.ne 1f - ldr x2, 5f + adr vdso_data, _vdso_data + ldr w2, [vdso_data, #CLOCK_REALTIME_RES] b 2f 1: cmp w0, #CLOCK_REALTIME_COARSE ccmp w0, #CLOCK_MONOTONIC_COARSE, #0x4, ne b.ne 4f - ldr x2, 6f + ldr x2, 5f 2: cbz x1, 3f stp xzr, x2, [x1] @@ -321,8 +329,6 @@ ENTRY(__kernel_clock_getres) svc #0 ret 5: - .quad CLOCK_REALTIME_RES -6: .quad CLOCK_COARSE_RES .cfi_endproc ENDPROC(__kernel_clock_getres) diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index 5540a1638baf..33c2a4abda04 100644 --- a/arch/arm64/lib/Makefile +++ b/arch/arm64/lib/Makefile @@ -24,7 +24,7 @@ CFLAGS_atomic_ll_sc.o := -ffixed-x1 -ffixed-x2 \ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ -fcall-saved-x18 -fomit-frame-pointer -CFLAGS_REMOVE_atomic_ll_sc.o := -pg +CFLAGS_REMOVE_atomic_ll_sc.o := $(CC_FLAGS_FTRACE) GCOV_PROFILE_atomic_ll_sc.o := n KASAN_SANITIZE_atomic_ll_sc.o := n KCOV_INSTRUMENT_atomic_ll_sc.o := n diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 1a7e92ab69eb..0cb0e09995e1 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -148,7 +148,7 @@ static inline bool is_ttbr1_addr(unsigned long addr) /* * Dump out the page tables associated with 'addr' in the currently active mm. */ -void show_pte(unsigned long addr) +static void show_pte(unsigned long addr) { struct mm_struct *mm; pgd_t *pgdp; @@ -810,13 +810,12 @@ void __init hook_debug_fault_code(int nr, debug_fault_info[nr].name = name; } -asmlinkage int __exception do_debug_exception(unsigned long addr_if_watchpoint, - unsigned int esr, - struct pt_regs *regs) +asmlinkage void __exception do_debug_exception(unsigned long addr_if_watchpoint, + unsigned int esr, + struct pt_regs *regs) { const struct fault_info *inf = esr_to_debug_fault_info(esr); unsigned long pc = instruction_pointer(regs); - int rv; /* * Tell lockdep we disabled irqs in entry.S. 
Do nothing if they were @@ -828,17 +827,12 @@ asmlinkage int __exception do_debug_exception(unsigned long addr_if_watchpoint, if (user_mode(regs) && !is_ttbr0_addr(pc)) arm64_apply_bp_hardening(); - if (!inf->fn(addr_if_watchpoint, esr, regs)) { - rv = 1; - } else { + if (inf->fn(addr_if_watchpoint, esr, regs)) { arm64_notify_die(inf->name, regs, inf->sig, inf->code, (void __user *)pc, esr); - rv = 0; } if (interrupts_enabled(regs)) trace_hardirqs_on(); - - return rv; } NOKPROBE_SYMBOL(do_debug_exception); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6bc135042f5e..40e2d7e5efcb 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -363,7 +363,7 @@ void __init arm64_memblock_init(void) * Otherwise, this is a no-op */ u64 base = phys_initrd_start & PAGE_MASK; - u64 size = PAGE_ALIGN(phys_initrd_size); + u64 size = PAGE_ALIGN(phys_initrd_start + phys_initrd_size) - base; /* * We can only add back the initrd memory if we don't end up @@ -377,7 +377,7 @@ void __init arm64_memblock_init(void) base + size > memblock_start_of_DRAM() + linear_region_size, "initrd not fully accessible via the linear mapping -- please check your bootloader ...\n")) { - initrd_start = 0; + phys_initrd_size = 0; } else { memblock_remove(base, size); /* clear MEMBLOCK_ flags */ memblock_add(base, size); @@ -440,6 +440,7 @@ void __init bootmem_init(void) early_memtest(min << PAGE_SHIFT, max << PAGE_SHIFT); max_pfn = max_low_pfn = max; + min_low_pfn = min; arm64_numa_init(); /* @@ -535,7 +536,7 @@ void __init mem_init(void) else swiotlb_force = SWIOTLB_NO_FORCE; - set_max_mapnr(pfn_to_page(max_pfn) - mem_map); + set_max_mapnr(max_pfn - PHYS_PFN_OFFSET); #ifndef CONFIG_SPARSEMEM_VMEMMAP free_unused_memmap(); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index e97f018ff740..ef82312860ac 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -97,7 +97,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, } EXPORT_SYMBOL(phys_mem_access_prot); -static phys_addr_t __init early_pgtable_alloc(void) +static phys_addr_t __init early_pgtable_alloc(int shift) { phys_addr_t phys; void *ptr; @@ -174,7 +174,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; @@ -184,7 +184,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, if (pmd_none(pmd)) { phys_addr_t pte_phys; BUG_ON(!pgtable_alloc); - pte_phys = pgtable_alloc(); + pte_phys = pgtable_alloc(PAGE_SHIFT); __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); pmd = READ_ONCE(*pmdp); } @@ -208,7 +208,7 @@ static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), int flags) + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; pmd_t *pmdp; @@ -246,7 +246,7 @@ static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), int flags) + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; pud_t pud = READ_ONCE(*pudp); @@ -258,7 +258,7 @@ static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, if (pud_none(pud)) { 
phys_addr_t pmd_phys; BUG_ON(!pgtable_alloc); - pmd_phys = pgtable_alloc(); + pmd_phys = pgtable_alloc(PMD_SHIFT); __pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE); pud = READ_ONCE(*pudp); } @@ -294,7 +294,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long next; @@ -304,7 +304,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, if (pgd_none(pgd)) { phys_addr_t pud_phys; BUG_ON(!pgtable_alloc); - pud_phys = pgtable_alloc(); + pud_phys = pgtable_alloc(PUD_SHIFT); __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE); pgd = READ_ONCE(*pgdp); } @@ -345,7 +345,7 @@ static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), + phys_addr_t (*pgtable_alloc)(int), int flags) { unsigned long addr, length, end, next; @@ -371,17 +371,36 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, } while (pgdp++, addr = next, addr != end); } -static phys_addr_t pgd_pgtable_alloc(void) +static phys_addr_t __pgd_pgtable_alloc(int shift) { void *ptr = (void *)__get_free_page(PGALLOC_GFP); - if (!ptr || !pgtable_page_ctor(virt_to_page(ptr))) - BUG(); + BUG_ON(!ptr); /* Ensure the zeroed page is visible to the page table walker */ dsb(ishst); return __pa(ptr); } +static phys_addr_t pgd_pgtable_alloc(int shift) +{ + phys_addr_t pa = __pgd_pgtable_alloc(shift); + + /* + * Call proper page table ctor in case later we need to + * call core mm functions like apply_to_page_range() on + * this pre-allocated page table. + * + * We don't select ARCH_ENABLE_SPLIT_PMD_PTLOCK if pmd is + * folded, and if so pgtable_pmd_page_ctor() becomes nop. + */ + if (shift == PAGE_SHIFT) + BUG_ON(!pgtable_page_ctor(phys_to_page(pa))); + else if (shift == PMD_SHIFT) + BUG_ON(!pgtable_pmd_page_ctor(phys_to_page(pa))); + + return pa; +} + /* * This function can only be used to modify existing table entries, * without allocating new levels of table. 
Note that this permits the @@ -583,7 +602,7 @@ static int __init map_entry_trampoline(void) /* Map only the text into the trampoline page table */ memset(tramp_pg_dir, 0, PGD_SIZE); __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, - prot, pgd_pgtable_alloc, 0); + prot, __pgd_pgtable_alloc, 0); /* Map both the text and data into the kernel page table */ __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); @@ -1055,7 +1074,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start), - size, PAGE_KERNEL, pgd_pgtable_alloc, flags); + size, PAGE_KERNEL, __pgd_pgtable_alloc, flags); return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap, want_memblock); diff --git a/arch/arm64/mm/numa.c b/arch/arm64/mm/numa.c index 06a6f264f2dd..5202f63c29c9 100644 --- a/arch/arm64/mm/numa.c +++ b/arch/arm64/mm/numa.c @@ -124,7 +124,7 @@ static void __init setup_node_to_cpumask_map(void) } /* - * Set the cpu to node and mem mapping + * Set the cpu to node and mem mapping */ void numa_store_cpu_info(unsigned int cpu) { @@ -200,7 +200,7 @@ void __init setup_per_cpu_areas(void) #endif /** - * numa_add_memblk - Set node id to memblk + * numa_add_memblk() - Set node id to memblk * @nid: NUMA node ID of the new memblk * @start: Start address of the new memblk * @end: End address of the new memblk @@ -223,7 +223,7 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return ret; } -/** +/* * Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) @@ -257,7 +257,7 @@ static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn) NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; } -/** +/* * numa_free_distance * * The current table is freed. @@ -277,10 +277,8 @@ void __init numa_free_distance(void) numa_distance = NULL; } -/** - * +/* * Create a new NUMA distance table. - * */ static int __init numa_alloc_distance(void) { @@ -311,7 +309,7 @@ static int __init numa_alloc_distance(void) } /** - * numa_set_distance - Set inter node NUMA distance from node to node. + * numa_set_distance() - Set inter node NUMA distance from node to node. * @from: the 'from' node to set distance * @to: the 'to' node to set distance * @distance: NUMA distance @@ -321,7 +319,6 @@ static int __init numa_alloc_distance(void) * * If @from or @to is higher than the highest known node or lower than zero * or @distance doesn't make sense, the call is ignored. - * */ void __init numa_set_distance(int from, int to, int distance) { @@ -347,7 +344,7 @@ void __init numa_set_distance(int from, int to, int distance) numa_distance[from * numa_distance_cnt + to] = distance; } -/** +/* * Return NUMA distance @from to @to */ int __node_distance(int from, int to) @@ -422,13 +419,15 @@ out_free_distance: } /** - * dummy_numa_init - Fallback dummy NUMA init + * dummy_numa_init() - Fallback dummy NUMA init * * Used if there's no underlying NUMA architecture, NUMA initialization * fails, or NUMA is disabled on the command line. * * Must online at least one node (node 0) and add memory blocks that cover all * allowed memory. It is unlikely that this function fails. + * + * Return: 0 on success, -errno on failure. 
*/ static int __init dummy_numa_init(void) { @@ -454,9 +453,9 @@ static int __init dummy_numa_init(void) } /** - * arm64_numa_init - Initialize NUMA + * arm64_numa_init() - Initialize NUMA * - * Try each configured NUMA initialization method until one succeeds. The + * Try each configured NUMA initialization method until one succeeds. The * last fallback is dummy single node config encomapssing whole memory. */ void __init arm64_numa_init(void) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index aa0817c9c4c3..fdd626d34274 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -65,24 +65,25 @@ ENTRY(cpu_do_suspend) mrs x2, tpidr_el0 mrs x3, tpidrro_el0 mrs x4, contextidr_el1 - mrs x5, cpacr_el1 - mrs x6, tcr_el1 - mrs x7, vbar_el1 - mrs x8, mdscr_el1 - mrs x9, oslsr_el1 - mrs x10, sctlr_el1 + mrs x5, osdlr_el1 + mrs x6, cpacr_el1 + mrs x7, tcr_el1 + mrs x8, vbar_el1 + mrs x9, mdscr_el1 + mrs x10, oslsr_el1 + mrs x11, sctlr_el1 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - mrs x11, tpidr_el1 + mrs x12, tpidr_el1 alternative_else - mrs x11, tpidr_el2 + mrs x12, tpidr_el2 alternative_endif - mrs x12, sp_el0 + mrs x13, sp_el0 stp x2, x3, [x0] - stp x4, xzr, [x0, #16] - stp x5, x6, [x0, #32] - stp x7, x8, [x0, #48] - stp x9, x10, [x0, #64] - stp x11, x12, [x0, #80] + stp x4, x5, [x0, #16] + stp x6, x7, [x0, #32] + stp x8, x9, [x0, #48] + stp x10, x11, [x0, #64] + stp x12, x13, [x0, #80] ret ENDPROC(cpu_do_suspend) @@ -105,8 +106,8 @@ ENTRY(cpu_do_resume) msr cpacr_el1, x6 /* Don't change t0sz here, mask those bits when restoring */ - mrs x5, tcr_el1 - bfi x8, x5, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH + mrs x7, tcr_el1 + bfi x8, x7, TCR_T0SZ_OFFSET, TCR_TxSZ_WIDTH msr tcr_el1, x8 msr vbar_el1, x9 @@ -130,6 +131,7 @@ alternative_endif /* * Restore oslsr_el1 by writing oslar_el1 */ + msr osdlr_el1, x5 ubfx x11, x11, #1, #1 msr oslar_el1, x11 reset_pmuserenr_el0 x0 // Disable PMU access from EL0 diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index e5cd3c5f8399..eeb0471268a0 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -20,6 +20,7 @@ config C6X select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select ARCH_NO_COHERENT_DMA_MMAP + select MMU_GATHER_NO_RANGE if MMU config MMU def_bool n @@ -27,9 +28,6 @@ config MMU config FPU def_bool n -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y diff --git a/arch/c6x/include/asm/Kbuild b/arch/c6x/include/asm/Kbuild index 249c9f6f26dc..6b168d32fbff 100644 --- a/arch/c6x/include/asm/Kbuild +++ b/arch/c6x/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += mmu.h generic-y += mmu_context.h generic-y += pci.h diff --git a/arch/c6x/include/asm/tlb.h b/arch/c6x/include/asm/tlb.h index 34525dea1356..240ba0febb57 100644 --- a/arch/c6x/include/asm/tlb.h +++ b/arch/c6x/include/asm/tlb.h @@ -2,8 +2,6 @@ #ifndef _ASM_C6X_TLB_H #define _ASM_C6X_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif /* _ASM_C6X_TLB_H */ diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 725a115759c9..6555d1781132 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -92,9 +92,6 @@ config GENERIC_HWEIGHT config MMU def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config STACKTRACE_SUPPORT def_bool y diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 2a0abe8f2a35..95f4e550db8a 100644 --- a/arch/csky/include/asm/Kbuild +++ 
b/arch/csky/include/asm/Kbuild @@ -28,6 +28,7 @@ generic-y += linkage.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += mutex.h generic-y += pci.h diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index c071da34e081..61c01db6c292 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -27,9 +27,6 @@ config H8300 config CPU_BIG_ENDIAN def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild index e3dead402e5f..123d8f54be4a 100644 --- a/arch/h8300/include/asm/Kbuild +++ b/arch/h8300/include/asm/Kbuild @@ -29,6 +29,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += mmu.h generic-y += mmu_context.h generic-y += module.h diff --git a/arch/h8300/include/asm/tlb.h b/arch/h8300/include/asm/tlb.h index 98f344279904..d8201ca31206 100644 --- a/arch/h8300/include/asm/tlb.h +++ b/arch/h8300/include/asm/tlb.h @@ -2,8 +2,6 @@ #ifndef __H8300_TLB_H__ #define __H8300_TLB_H__ -#define tlb_flush(tlb) do { } while (0) - #include <asm-generic/tlb.h> #endif diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index ac441680dcc0..3e54a53208d5 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -65,12 +65,6 @@ config GENERIC_CSUM config GENERIC_IRQ_PROBE def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool n - -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index d046e8ccdf78..6234a303d2a3 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -24,10 +24,10 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h -generic-y += rwsem.h generic-y += sections.h generic-y += segment.h generic-y += serial.h diff --git a/arch/hexagon/include/asm/io.h b/arch/hexagon/include/asm/io.h index e17262ad125e..3d0ae09c2b8e 100644 --- a/arch/hexagon/include/asm/io.h +++ b/arch/hexagon/include/asm/io.h @@ -184,8 +184,6 @@ static inline void writel(u32 data, volatile void __iomem *addr) #define writew_relaxed __raw_writew #define writel_relaxed __raw_writel -#define mmiowb() - /* * Need an mtype somewhere in here, for cache type deals? * This is probably too long for an inline. diff --git a/arch/hexagon/include/asm/tlb.h b/arch/hexagon/include/asm/tlb.h index 2f00772cc08a..f71c4ba83614 100644 --- a/arch/hexagon/include/asm/tlb.h +++ b/arch/hexagon/include/asm/tlb.h @@ -22,18 +22,6 @@ #include <linux/pagemap.h> #include <asm/tlbflush.h> -/* - * We don't need any special per-pte or per-vma handling... - */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. 
because we flush the whole mm when it fills up - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 8d7396bd1790..73a26f04644e 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -83,10 +83,6 @@ config STACKTRACE_SUPPORT config GENERIC_LOCKBREAK def_bool n -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config HUGETLB_PAGE_SIZE_VARIABLE bool depends on HUGETLB_PAGE diff --git a/arch/ia64/include/asm/io.h b/arch/ia64/include/asm/io.h index 1e6fef69bb01..a511d62d447a 100644 --- a/arch/ia64/include/asm/io.h +++ b/arch/ia64/include/asm/io.h @@ -113,20 +113,6 @@ extern int valid_mmap_phys_addr_range (unsigned long pfn, size_t count); */ #define __ia64_mf_a() ia64_mfa() -/** - * ___ia64_mmiowb - I/O write barrier - * - * Ensure ordering of I/O space writes. This will make sure that writes - * following the barrier will arrive after all previous writes. For most - * ia64 platforms, this is a simple 'mf.a' instruction. - * - * See Documentation/driver-api/device-io.rst for more information. - */ -static inline void ___ia64_mmiowb(void) -{ - ia64_mfa(); -} - static inline void* __ia64_mk_io_addr (unsigned long port) { @@ -161,7 +147,6 @@ __ia64_mk_io_addr (unsigned long port) #define __ia64_writew ___ia64_writew #define __ia64_writel ___ia64_writel #define __ia64_writeq ___ia64_writeq -#define __ia64_mmiowb ___ia64_mmiowb /* * For the in/out routines, we need to do "mf.a" _after_ doing the I/O access to ensure @@ -296,7 +281,6 @@ __outsl (unsigned long port, const void *src, unsigned long count) #define __outb platform_outb #define __outw platform_outw #define __outl platform_outl -#define __mmiowb platform_mmiowb #define inb(p) __inb(p) #define inw(p) __inw(p) @@ -310,7 +294,6 @@ __outsl (unsigned long port, const void *src, unsigned long count) #define outsb(p,s,c) __outsb(p,s,c) #define outsw(p,s,c) __outsw(p,s,c) #define outsl(p,s,c) __outsl(p,s,c) -#define mmiowb() __mmiowb() /* * The address passed to these functions are ioremap()ped already. 
diff --git a/arch/ia64/include/asm/machvec.h b/arch/ia64/include/asm/machvec.h index 5133739966bc..beae261fbcb4 100644 --- a/arch/ia64/include/asm/machvec.h +++ b/arch/ia64/include/asm/machvec.h @@ -30,7 +30,6 @@ typedef void ia64_mv_irq_init_t (void); typedef void ia64_mv_send_ipi_t (int, int, int, int); typedef void ia64_mv_timer_interrupt_t (int, void *); typedef void ia64_mv_global_tlb_purge_t (struct mm_struct *, unsigned long, unsigned long, unsigned long); -typedef void ia64_mv_tlb_migrate_finish_t (struct mm_struct *); typedef u8 ia64_mv_irq_to_vector (int); typedef unsigned int ia64_mv_local_vector_to_irq (u8); typedef char *ia64_mv_pci_get_legacy_mem_t (struct pci_bus *); @@ -80,11 +79,6 @@ machvec_noop (void) } static inline void -machvec_noop_mm (struct mm_struct *mm) -{ -} - -static inline void machvec_noop_task (struct task_struct *task) { } @@ -96,7 +90,6 @@ machvec_noop_bus (struct pci_bus *bus) extern void machvec_setup (char **); extern void machvec_timer_interrupt (int, void *); -extern void machvec_tlb_migrate_finish (struct mm_struct *); # if defined (CONFIG_IA64_HP_SIM) # include <asm/machvec_hpsim.h> @@ -124,7 +117,6 @@ extern void machvec_tlb_migrate_finish (struct mm_struct *); # define platform_send_ipi ia64_mv.send_ipi # define platform_timer_interrupt ia64_mv.timer_interrupt # define platform_global_tlb_purge ia64_mv.global_tlb_purge -# define platform_tlb_migrate_finish ia64_mv.tlb_migrate_finish # define platform_dma_init ia64_mv.dma_init # define platform_dma_get_ops ia64_mv.dma_get_ops # define platform_irq_to_vector ia64_mv.irq_to_vector @@ -167,7 +159,6 @@ struct ia64_machine_vector { ia64_mv_send_ipi_t *send_ipi; ia64_mv_timer_interrupt_t *timer_interrupt; ia64_mv_global_tlb_purge_t *global_tlb_purge; - ia64_mv_tlb_migrate_finish_t *tlb_migrate_finish; ia64_mv_dma_init *dma_init; ia64_mv_dma_get_ops *dma_get_ops; ia64_mv_irq_to_vector *irq_to_vector; @@ -206,7 +197,6 @@ struct ia64_machine_vector { platform_send_ipi, \ platform_timer_interrupt, \ platform_global_tlb_purge, \ - platform_tlb_migrate_finish, \ platform_dma_init, \ platform_dma_get_ops, \ platform_irq_to_vector, \ @@ -270,9 +260,6 @@ extern const struct dma_map_ops *dma_get_ops(struct device *); #ifndef platform_global_tlb_purge # define platform_global_tlb_purge ia64_global_tlb_purge /* default to architected version */ #endif -#ifndef platform_tlb_migrate_finish -# define platform_tlb_migrate_finish machvec_noop_mm -#endif #ifndef platform_kernel_launch_event # define platform_kernel_launch_event machvec_noop #endif diff --git a/arch/ia64/include/asm/machvec_sn2.h b/arch/ia64/include/asm/machvec_sn2.h index b5153d300289..a243e4fb4877 100644 --- a/arch/ia64/include/asm/machvec_sn2.h +++ b/arch/ia64/include/asm/machvec_sn2.h @@ -34,7 +34,6 @@ extern ia64_mv_irq_init_t sn_irq_init; extern ia64_mv_send_ipi_t sn2_send_IPI; extern ia64_mv_timer_interrupt_t sn_timer_interrupt; extern ia64_mv_global_tlb_purge_t sn2_global_tlb_purge; -extern ia64_mv_tlb_migrate_finish_t sn_tlb_migrate_finish; extern ia64_mv_irq_to_vector sn_irq_to_vector; extern ia64_mv_local_vector_to_irq sn_local_vector_to_irq; extern ia64_mv_pci_get_legacy_mem_t sn_pci_get_legacy_mem; @@ -77,7 +76,6 @@ extern ia64_mv_pci_fixup_bus_t sn_pci_fixup_bus; #define platform_send_ipi sn2_send_IPI #define platform_timer_interrupt sn_timer_interrupt #define platform_global_tlb_purge sn2_global_tlb_purge -#define platform_tlb_migrate_finish sn_tlb_migrate_finish #define platform_pci_fixup sn_pci_fixup #define platform_inb __sn_inb #define 
platform_inw __sn_inw diff --git a/arch/ia64/include/asm/mmiowb.h b/arch/ia64/include/asm/mmiowb.h new file mode 100644 index 000000000000..297b85ac84a0 --- /dev/null +++ b/arch/ia64/include/asm/mmiowb.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_IA64_MMIOWB_H +#define _ASM_IA64_MMIOWB_H + +#include <asm/machvec.h> + +/** + * ___ia64_mmiowb - I/O write barrier + * + * Ensure ordering of I/O space writes. This will make sure that writes + * following the barrier will arrive after all previous writes. For most + * ia64 platforms, this is a simple 'mf.a' instruction. + */ +static inline void ___ia64_mmiowb(void) +{ + ia64_mfa(); +} + +#define __ia64_mmiowb ___ia64_mmiowb +#define mmiowb() platform_mmiowb() + +#include <asm-generic/mmiowb.h> + +#endif /* _ASM_IA64_MMIOWB_H */ diff --git a/arch/ia64/include/asm/rwsem.h b/arch/ia64/include/asm/rwsem.h deleted file mode 100644 index 917910607e0e..000000000000 --- a/arch/ia64/include/asm/rwsem.h +++ /dev/null @@ -1,172 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * R/W semaphores for ia64 - * - * Copyright (C) 2003 Ken Chen <kenneth.w.chen@intel.com> - * Copyright (C) 2003 Asit Mallick <asit.k.mallick@intel.com> - * Copyright (C) 2005 Christoph Lameter <cl@linux.com> - * - * Based on asm-i386/rwsem.h and other architecture implementation. - * - * The MSW of the count is the negated number of active writers and - * waiting lockers, and the LSW is the total number of active locks. - * - * The lock count is initialized to 0 (no active and no waiting lockers). - * - * When a writer subtracts WRITE_BIAS, it'll get 0xffffffff00000001 for - * the case of an uncontended lock. Readers increment by 1 and see a positive - * value when uncontended, negative if there are writers (and maybe) readers - * waiting (in which case it goes to sleep). - */ - -#ifndef _ASM_IA64_RWSEM_H -#define _ASM_IA64_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "Please don't include <asm/rwsem.h> directly, use <linux/rwsem.h> instead." 
-#endif - -#include <asm/intrinsics.h> - -#define RWSEM_UNLOCKED_VALUE __IA64_UL_CONST(0x0000000000000000) -#define RWSEM_ACTIVE_BIAS (1L) -#define RWSEM_ACTIVE_MASK (0xffffffffL) -#define RWSEM_WAITING_BIAS (-0x100000000L) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -static inline int -___down_read (struct rw_semaphore *sem) -{ - long result = ia64_fetchadd8_acq((unsigned long *)&sem->count.counter, 1); - - return (result < 0); -} - -static inline void -__down_read (struct rw_semaphore *sem) -{ - if (___down_read(sem)) - rwsem_down_read_failed(sem); -} - -static inline int -__down_read_killable (struct rw_semaphore *sem) -{ - if (___down_read(sem)) - if (IS_ERR(rwsem_down_read_failed_killable(sem))) - return -EINTR; - - return 0; -} - -/* - * lock for writing - */ -static inline long -___down_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old + RWSEM_ACTIVE_WRITE_BIAS; - } while (atomic_long_cmpxchg_acquire(&sem->count, old, new) != old); - - return old; -} - -static inline void -__down_write (struct rw_semaphore *sem) -{ - if (___down_write(sem)) - rwsem_down_write_failed(sem); -} - -static inline int -__down_write_killable (struct rw_semaphore *sem) -{ - if (___down_write(sem)) { - if (IS_ERR(rwsem_down_write_failed_killable(sem))) - return -EINTR; - } - - return 0; -} - -/* - * unlock after reading - */ -static inline void -__up_read (struct rw_semaphore *sem) -{ - long result = ia64_fetchadd8_rel((unsigned long *)&sem->count.counter, -1); - - if (result < 0 && (--result & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * unlock after writing - */ -static inline void -__up_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old - RWSEM_ACTIVE_WRITE_BIAS; - } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); - - if (new < 0 && (new & RWSEM_ACTIVE_MASK) == 0) - rwsem_wake(sem); -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline int -__down_read_trylock (struct rw_semaphore *sem) -{ - long tmp; - while ((tmp = atomic_long_read(&sem->count)) >= 0) { - if (tmp == atomic_long_cmpxchg_acquire(&sem->count, tmp, tmp+1)) { - return 1; - } - } - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline int -__down_write_trylock (struct rw_semaphore *sem) -{ - long tmp = atomic_long_cmpxchg_acquire(&sem->count, - RWSEM_UNLOCKED_VALUE, RWSEM_ACTIVE_WRITE_BIAS); - return tmp == RWSEM_UNLOCKED_VALUE; -} - -/* - * downgrade write lock to read lock - */ -static inline void -__downgrade_write (struct rw_semaphore *sem) -{ - long old, new; - - do { - old = atomic_long_read(&sem->count); - new = old - RWSEM_WAITING_BIAS; - } while (atomic_long_cmpxchg_release(&sem->count, old, new) != old); - - if (old < 0) - rwsem_downgrade_wake(sem); -} - -#endif /* _ASM_IA64_RWSEM_H */ diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h index afd0b3121b4c..5f620e66384e 100644 --- a/arch/ia64/include/asm/spinlock.h +++ b/arch/ia64/include/asm/spinlock.h @@ -73,6 +73,8 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { unsigned short *p = (unsigned short *)&lock->lock + 1, tmp; + /* This could be optimised with ARCH_HAS_MMIOWB */ + mmiowb(); asm volatile ("ld2.bias %0=[%1]" : "=r"(tmp) : "r"(p)); WRITE_ONCE(*p, (tmp + 2) & 
~1); } diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 516355a774bf..86ec034ba499 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -47,263 +47,6 @@ #include <asm/tlbflush.h> #include <asm/machvec.h> -/* - * If we can't allocate a page to make a big batch of page pointers - * to work on, then just handle a few from the on-stack structure. - */ -#define IA64_GATHER_BUNDLE 8 - -struct mmu_gather { - struct mm_struct *mm; - unsigned int nr; - unsigned int max; - unsigned char fullmm; /* non-zero means full mm flush */ - unsigned char need_flush; /* really unmapped some PTEs? */ - unsigned long start, end; - unsigned long start_addr; - unsigned long end_addr; - struct page **pages; - struct page *local[IA64_GATHER_BUNDLE]; -}; - -struct ia64_tr_entry { - u64 ifa; - u64 itir; - u64 pte; - u64 rr; -}; /*Record for tr entry!*/ - -extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size); -extern void ia64_ptr_entry(u64 target_mask, int slot); - -extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS]; - -/* - region register macros -*/ -#define RR_TO_VE(val) (((val) >> 0) & 0x0000000000000001) -#define RR_VE(val) (((val) & 0x0000000000000001) << 0) -#define RR_VE_MASK 0x0000000000000001L -#define RR_VE_SHIFT 0 -#define RR_TO_PS(val) (((val) >> 2) & 0x000000000000003f) -#define RR_PS(val) (((val) & 0x000000000000003f) << 2) -#define RR_PS_MASK 0x00000000000000fcL -#define RR_PS_SHIFT 2 -#define RR_RID_MASK 0x00000000ffffff00L -#define RR_TO_RID(val) ((val >> 8) & 0xffffff) - -static inline void -ia64_tlb_flush_mmu_tlbonly(struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - tlb->need_flush = 0; - - if (tlb->fullmm) { - /* - * Tearing down the entire address space. This happens both as a result - * of exit() and execve(). The latter case necessitates the call to - * flush_tlb_mm() here. - */ - flush_tlb_mm(tlb->mm); - } else if (unlikely (end - start >= 1024*1024*1024*1024UL - || REGION_NUMBER(start) != REGION_NUMBER(end - 1))) - { - /* - * If we flush more than a tera-byte or across regions, we're probably - * better off just flushing the entire TLB(s). This should be very rare - * and is not worth optimizing for. - */ - flush_tlb_all(); - } else { - /* - * flush_tlb_range() takes a vma instead of a mm pointer because - * some architectures want the vm_flags for ITLB/DTLB flush. - */ - struct vm_area_struct vma = TLB_FLUSH_VMA(tlb->mm, 0); - - /* flush the address range from the tlb: */ - flush_tlb_range(&vma, start, end); - /* now flush the virt. page-table area mapping the address range: */ - flush_tlb_range(&vma, ia64_thash(start), ia64_thash(end)); - } - -} - -static inline void -ia64_tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - unsigned long i; - unsigned int nr; - - /* lastly, release the freed pages */ - nr = tlb->nr; - - tlb->nr = 0; - tlb->start_addr = ~0UL; - for (i = 0; i < nr; ++i) - free_page_and_swap_cache(tlb->pages[i]); -} - -/* - * Flush the TLB for address range START to END and, if not in fast mode, release the - * freed pages that where gathered up to this point. 
- */ -static inline void -ia64_tlb_flush_mmu (struct mmu_gather *tlb, unsigned long start, unsigned long end) -{ - if (!tlb->need_flush) - return; - ia64_tlb_flush_mmu_tlbonly(tlb, start, end); - ia64_tlb_flush_mmu_free(tlb); -} - -static inline void __tlb_alloc_page(struct mmu_gather *tlb) -{ - unsigned long addr = __get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); - - if (addr) { - tlb->pages = (void *)addr; - tlb->max = PAGE_SIZE / sizeof(void *); - } -} - - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->max = ARRAY_SIZE(tlb->local); - tlb->pages = tlb->local; - tlb->nr = 0; - tlb->fullmm = !(start | (end+1)); - tlb->start = start; - tlb->end = end; - tlb->start_addr = ~0UL; -} - -/* - * Called at the end of the shootdown operation to free up any resources that were - * collected. - */ -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) - tlb->need_flush = 1; - /* - * Note: tlb->nr may be 0 at this point, so we can't rely on tlb->start_addr and - * tlb->end_addr. - */ - ia64_tlb_flush_mmu(tlb, start, end); - - /* keep the page table cache within bounds */ - check_pgt_cache(); - - if (tlb->pages != tlb->local) - free_pages((unsigned long)tlb->pages, 0); -} - -/* - * Logically, this routine frees PAGE. On MP machines, the actual freeing of the page - * must be delayed until after the TLB has been flushed (see comments at the beginning of - * this file). - */ -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - - if (!tlb->nr && tlb->pages == tlb->local) - __tlb_alloc_page(tlb); - - tlb->pages[tlb->nr++] = page; - VM_WARN_ON(tlb->nr > tlb->max); - if (tlb->nr == tlb->max) - return true; - return false; -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu_tlbonly(tlb, tlb->start_addr, tlb->end_addr); -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu_free(tlb); -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - ia64_tlb_flush_mmu(tlb, tlb->start_addr, tlb->end_addr); -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - if (__tlb_remove_page(tlb, page)) - tlb_flush_mmu(tlb); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -/* - * Remove TLB entry for PTE mapped at virtual address ADDRESS. This is called for any - * PTE, not just those pointing to (normal) physical memory. 
- */ -static inline void -__tlb_remove_tlb_entry (struct mmu_gather *tlb, pte_t *ptep, unsigned long address) -{ - if (tlb->start_addr == ~0UL) - tlb->start_addr = address; - tlb->end_addr = address + PAGE_SIZE; -} - -#define tlb_migrate_finish(mm) platform_tlb_migrate_finish(mm) - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) - -#define tlb_remove_tlb_entry(tlb, ptep, addr) \ -do { \ - tlb->need_flush = 1; \ - __tlb_remove_tlb_entry(tlb, ptep, addr); \ -} while (0) - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, address) \ -do { \ - tlb->need_flush = 1; \ - __pte_free_tlb(tlb, ptep, address); \ -} while (0) - -#define pmd_free_tlb(tlb, ptep, address) \ -do { \ - tlb->need_flush = 1; \ - __pmd_free_tlb(tlb, ptep, address); \ -} while (0) - -#define pud_free_tlb(tlb, pudp, address) \ -do { \ - tlb->need_flush = 1; \ - __pud_free_tlb(tlb, pudp, address); \ -} while (0) +#include <asm-generic/tlb.h> #endif /* _ASM_IA64_TLB_H */ diff --git a/arch/ia64/include/asm/tlbflush.h b/arch/ia64/include/asm/tlbflush.h index 25e280810f6c..ceac10c4d6e2 100644 --- a/arch/ia64/include/asm/tlbflush.h +++ b/arch/ia64/include/asm/tlbflush.h @@ -14,6 +14,31 @@ #include <asm/mmu_context.h> #include <asm/page.h> +struct ia64_tr_entry { + u64 ifa; + u64 itir; + u64 pte; + u64 rr; +}; /*Record for tr entry!*/ + +extern int ia64_itr_entry(u64 target_mask, u64 va, u64 pte, u64 log_size); +extern void ia64_ptr_entry(u64 target_mask, int slot); +extern struct ia64_tr_entry *ia64_idtrs[NR_CPUS]; + +/* + region register macros +*/ +#define RR_TO_VE(val) (((val) >> 0) & 0x0000000000000001) +#define RR_VE(val) (((val) & 0x0000000000000001) << 0) +#define RR_VE_MASK 0x0000000000000001L +#define RR_VE_SHIFT 0 +#define RR_TO_PS(val) (((val) >> 2) & 0x000000000000003f) +#define RR_PS(val) (((val) & 0x000000000000003f) << 2) +#define RR_PS_MASK 0x00000000000000fcL +#define RR_PS_SHIFT 2 +#define RR_RID_MASK 0x00000000ffffff00L +#define RR_TO_RID(val) ((val >> 8) & 0xffffff) + /* * Now for some TLB flushing routines. This is the kind of stuff that * can be very expensive, so try to avoid them whenever possible. 
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 583a3746d70b..c9cfa760cd57 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -1058,9 +1058,7 @@ check_bugs (void) static int __init run_dmi_scan(void) { - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); return 0; } core_initcall(run_dmi_scan); diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index ab9cda5f6136..56e3d0b685e1 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -344,3 +344,7 @@ 332 common pkey_free sys_pkey_free 333 common rseq sys_rseq # 334 through 423 are reserved to sync up with other architectures +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c index 5fc89aabdce1..5158bd28de05 100644 --- a/arch/ia64/mm/tlb.c +++ b/arch/ia64/mm/tlb.c @@ -305,8 +305,8 @@ local_flush_tlb_all (void) ia64_srlz_i(); /* srlz.i implies srlz.d */ } -void -flush_tlb_range (struct vm_area_struct *vma, unsigned long start, +static void +__flush_tlb_range (struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; @@ -343,6 +343,25 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start, preempt_enable(); ia64_srlz_i(); /* srlz.i implies srlz.d */ } + +void flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + if (unlikely(end - start >= 1024*1024*1024*1024UL + || REGION_NUMBER(start) != REGION_NUMBER(end - 1))) { + /* + * If we flush more than a tera-byte or across regions, we're + * probably better off just flushing the entire TLB(s). This + * should be very rare and is not worth optimizing for. + */ + flush_tlb_all(); + } else { + /* flush the address range from the tlb */ + __flush_tlb_range(vma, start, end); + /* flush the virt. 
page-table area mapping the addr range */ + __flush_tlb_range(vma, ia64_thash(start), ia64_thash(end)); + } +} EXPORT_SYMBOL(flush_tlb_range); void ia64_tlb_init(void) diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index b73b0ebf8214..b510f4f17fd4 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -120,13 +120,6 @@ void sn_migrate(struct task_struct *task) cpu_relax(); } -void sn_tlb_migrate_finish(struct mm_struct *mm) -{ - /* flush_tlb_mm is inefficient if more than 1 users of mm */ - if (mm == current->mm && mm && atomic_read(&mm->mm_users) == 1) - flush_tlb_mm(mm); -} - static void sn2_ipi_flush_all_tlb(struct mm_struct *mm) { diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index b54206408f91..fe5cc2da6d10 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -20,7 +20,6 @@ config M68K select GENERIC_STRNCPY_FROM_USER if MMU select GENERIC_STRNLEN_USER if MMU select ARCH_WANT_IPC_PARSE_VERSION - select ARCH_USES_GETTIMEOFFSET if MMU && !COLDFIRE select HAVE_FUTEX_CMPXCHG if MMU && FUTEX select HAVE_MOD_ARCH_SPECIFIC select MODULES_USE_ELF_REL @@ -28,17 +27,11 @@ config M68K select OLD_SIGSUSPEND3 select OLD_SIGACTION select ARCH_DISCARD_MEMBLOCK + select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN def_bool y -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/m68k/amiga/cia.c b/arch/m68k/amiga/cia.c index 2081b8cd5591..b9aee983e6f4 100644 --- a/arch/m68k/amiga/cia.c +++ b/arch/m68k/amiga/cia.c @@ -88,10 +88,19 @@ static irqreturn_t cia_handler(int irq, void *dev_id) struct ciabase *base = dev_id; int mach_irq; unsigned char ints; + unsigned long flags; + /* Interrupts get disabled while the timer irq flag is cleared and + * the timer interrupt serviced. + */ mach_irq = base->cia_irq; + local_irq_save(flags); ints = cia_set_irq(base, CIA_ICR_ALL); amiga_custom.intreq = base->int_mask; + if (ints & 1) + generic_handle_irq(mach_irq); + local_irq_restore(flags); + mach_irq++, ints >>= 1; for (; ints; mach_irq++, ints >>= 1) { if (ints & 1) generic_handle_irq(mach_irq); diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index 65f63a457130..c32ab8041cf6 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -17,6 +17,7 @@ #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/tty.h> +#include <linux/clocksource.h> #include <linux/console.h> #include <linux/rtc.h> #include <linux/init.h> @@ -95,8 +96,6 @@ static char amiga_model_name[13] = "Amiga "; static void amiga_sched_init(irq_handler_t handler); static void amiga_get_model(char *model); static void amiga_get_hardware_list(struct seq_file *m); -/* amiga specific timer functions */ -static u32 amiga_gettimeoffset(void); extern void amiga_mksound(unsigned int count, unsigned int ticks); static void amiga_reset(void); extern void amiga_init_sound(void); @@ -386,7 +385,6 @@ void __init config_amiga(void) mach_init_IRQ = amiga_init_IRQ; mach_get_model = amiga_get_model; mach_get_hardware_list = amiga_get_hardware_list; - arch_gettimeoffset = amiga_gettimeoffset; /* * default MAX_DMA=0xffffffff on all machines. 
If we don't do so, the SCSI @@ -464,7 +462,29 @@ void __init config_amiga(void) *(unsigned char *)ZTWO_VADDR(0xde0002) |= 0x80; } +static u64 amiga_read_clk(struct clocksource *cs); + +static struct clocksource amiga_clk = { + .name = "ciab", + .rating = 250, + .read = amiga_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + static unsigned short jiffy_ticks; +static u32 clk_total, clk_offset; + +static irqreturn_t ciab_timer_handler(int irq, void *dev_id) +{ + irq_handler_t timer_routine = dev_id; + + clk_total += jiffy_ticks; + clk_offset = 0; + timer_routine(0, NULL); + + return IRQ_HANDLED; +} static void __init amiga_sched_init(irq_handler_t timer_routine) { @@ -484,19 +504,22 @@ static void __init amiga_sched_init(irq_handler_t timer_routine) * Please don't change this to use ciaa, as it interferes with the * SCSI code. We'll have to take a look at this later */ - if (request_irq(IRQ_AMIGA_CIAB_TA, timer_routine, 0, "timer", NULL)) + if (request_irq(IRQ_AMIGA_CIAB_TA, ciab_timer_handler, IRQF_TIMER, + "timer", timer_routine)) pr_err("Couldn't register timer interrupt\n"); /* start timer */ ciab.cra |= 0x11; -} -#define TICK_SIZE 10000 + clocksource_register_hz(&amiga_clk, amiga_eclock); +} -/* This is always executed with interrupts disabled. */ -static u32 amiga_gettimeoffset(void) +static u64 amiga_read_clk(struct clocksource *cs) { unsigned short hi, lo, hi2; - u32 ticks, offset = 0; + unsigned long flags; + u32 ticks; + + local_irq_save(flags); /* read CIA B timer A current value */ hi = ciab.tahi; @@ -513,12 +536,14 @@ static u32 amiga_gettimeoffset(void) if (ticks > jiffy_ticks / 2) /* check for pending interrupt */ if (cia_set_irq(&ciab_base, 0) & CIA_ICR_TA) - offset = 10000; + clk_offset = jiffy_ticks; ticks = jiffy_ticks - ticks; - ticks = (10000 * ticks) / jiffy_ticks; + ticks += clk_offset + clk_total; + + local_irq_restore(flags); - return (ticks + offset) * 1000; + return ticks; } static void amiga_reset(void) __noreturn; diff --git a/arch/m68k/apollo/config.c b/arch/m68k/apollo/config.c index aef8d42e078d..7d168e6dfb01 100644 --- a/arch/m68k/apollo/config.c +++ b/arch/m68k/apollo/config.c @@ -29,7 +29,6 @@ u_long apollo_model; extern void dn_sched_init(irq_handler_t handler); extern void dn_init_IRQ(void); -extern u32 dn_gettimeoffset(void); extern int dn_dummy_hwclk(int, struct rtc_time *); extern void dn_dummy_reset(void); #ifdef CONFIG_HEARTBEAT @@ -152,7 +151,6 @@ void __init config_apollo(void) mach_sched_init=dn_sched_init; /* */ mach_init_IRQ=dn_init_IRQ; - arch_gettimeoffset = dn_gettimeoffset; mach_max_dma_address = 0xffffffff; mach_hwclk = dn_dummy_hwclk; /* */ mach_reset = dn_dummy_reset; /* */ @@ -205,11 +203,6 @@ void dn_sched_init(irq_handler_t timer_routine) pr_err("Couldn't register timer interrupt\n"); } -u32 dn_gettimeoffset(void) -{ - return 0xdeadbeef; -} - int dn_dummy_hwclk(int op, struct rtc_time *t) { diff --git a/arch/m68k/atari/ataints.c b/arch/m68k/atari/ataints.c index 3d2b63bedf05..56f02ea2c248 100644 --- a/arch/m68k/atari/ataints.c +++ b/arch/m68k/atari/ataints.c @@ -142,7 +142,7 @@ struct mfptimerbase { .name = "MFP Timer D" }; -static irqreturn_t mfptimer_handler(int irq, void *dev_id) +static irqreturn_t mfp_timer_d_handler(int irq, void *dev_id) { struct mfptimerbase *base = dev_id; int mach_irq; @@ -344,7 +344,7 @@ void __init atari_init_IRQ(void) st_mfp.tim_ct_cd = (st_mfp.tim_ct_cd & 0xf0) | 0x6; /* request timer D dispatch handler */ - if (request_irq(IRQ_MFP_TIMD, mfptimer_handler, IRQF_SHARED, 
+ if (request_irq(IRQ_MFP_TIMD, mfp_timer_d_handler, IRQF_SHARED, stmfp_base.name, &stmfp_base)) pr_err("Couldn't register %s interrupt\n", stmfp_base.name); diff --git a/arch/m68k/atari/config.c b/arch/m68k/atari/config.c index 4fcc4b1df1c0..902255e7b5b2 100644 --- a/arch/m68k/atari/config.c +++ b/arch/m68k/atari/config.c @@ -78,7 +78,6 @@ static void atari_heartbeat(int on); /* atari specific timer functions (in time.c) */ extern void atari_sched_init(irq_handler_t); -extern u32 atari_gettimeoffset(void); extern int atari_mste_hwclk (int, struct rtc_time *); extern int atari_tt_hwclk (int, struct rtc_time *); @@ -205,7 +204,6 @@ void __init config_atari(void) mach_init_IRQ = atari_init_IRQ; mach_get_model = atari_get_model; mach_get_hardware_list = atari_get_hardware_list; - arch_gettimeoffset = atari_gettimeoffset; mach_reset = atari_reset; mach_max_dma_address = 0xffffff; #if IS_ENABLED(CONFIG_INPUT_M68K_BEEP) diff --git a/arch/m68k/atari/time.c b/arch/m68k/atari/time.c index 9cca64286464..ce923a523695 100644 --- a/arch/m68k/atari/time.c +++ b/arch/m68k/atari/time.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/rtc.h> #include <linux/bcd.h> +#include <linux/clocksource.h> #include <linux/delay.h> #include <linux/export.h> @@ -24,6 +25,35 @@ DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL_GPL(rtc_lock); +static u64 atari_read_clk(struct clocksource *cs); + +static struct clocksource atari_clk = { + .name = "mfp", + .rating = 100, + .read = atari_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total; +static u8 last_timer_count; + +static irqreturn_t mfp_timer_c_handler(int irq, void *dev_id) +{ + irq_handler_t timer_routine = dev_id; + unsigned long flags; + + local_irq_save(flags); + do { + last_timer_count = st_mfp.tim_dt_c; + } while (last_timer_count == 1); + clk_total += INT_TICKS; + timer_routine(0, NULL); + local_irq_restore(flags); + + return IRQ_HANDLED; +} + void __init atari_sched_init(irq_handler_t timer_routine) { @@ -32,31 +62,33 @@ atari_sched_init(irq_handler_t timer_routine) /* start timer C, div = 1:100 */ st_mfp.tim_ct_cd = (st_mfp.tim_ct_cd & 15) | 0x60; /* install interrupt service routine for MFP Timer C */ - if (request_irq(IRQ_MFP_TIMC, timer_routine, 0, "timer", timer_routine)) + if (request_irq(IRQ_MFP_TIMC, mfp_timer_c_handler, IRQF_TIMER, "timer", + timer_routine)) pr_err("Couldn't register timer interrupt\n"); + + clocksource_register_hz(&atari_clk, INT_CLK); } /* ++andreas: gettimeoffset fixed to check for pending interrupt */ -#define TICK_SIZE 10000 - -/* This is always executed with interrupts disabled. */ -u32 atari_gettimeoffset(void) +static u64 atari_read_clk(struct clocksource *cs) { - u32 ticks, offset = 0; - - /* read MFP timer C current value */ - ticks = st_mfp.tim_dt_c; - /* The probability of underflow is less than 2% */ - if (ticks > INT_TICKS - INT_TICKS / 50) - /* Check for pending timer interrupt */ - if (st_mfp.int_pn_b & (1 << 5)) - offset = TICK_SIZE; - - ticks = INT_TICKS - ticks; - ticks = ticks * 10000L / INT_TICKS; - - return (ticks + offset) * 1000; + unsigned long flags; + u8 count; + u32 ticks; + + local_irq_save(flags); + /* Ensure that the count is monotonically decreasing, even though + * the result may briefly stop changing after counter wrap-around. 
+ */ + count = min(st_mfp.tim_dt_c, last_timer_count); + last_timer_count = count; + + ticks = INT_TICKS - count; + ticks += clk_total; + local_irq_restore(flags); + + return ticks; } diff --git a/arch/m68k/bvme6000/config.c b/arch/m68k/bvme6000/config.c index 143ee9fa3893..8ebaabc931cd 100644 --- a/arch/m68k/bvme6000/config.c +++ b/arch/m68k/bvme6000/config.c @@ -18,6 +18,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> +#include <linux/clocksource.h> #include <linux/console.h> #include <linux/linkage.h> #include <linux/init.h> @@ -39,16 +40,10 @@ static void bvme6000_get_model(char *model); extern void bvme6000_sched_init(irq_handler_t handler); -extern u32 bvme6000_gettimeoffset(void); extern int bvme6000_hwclk (int, struct rtc_time *); extern void bvme6000_reset (void); void bvme6000_set_vectors (void); -/* Save tick handler routine pointer, will point to xtime_update() in - * kernel/timer/timekeeping.c, called via bvme6000_process_int() */ - -static irq_handler_t tick_handler; - int __init bvme6000_parse_bootinfo(const struct bi_record *bi) { @@ -110,7 +105,6 @@ void __init config_bvme6000(void) mach_max_dma_address = 0xffffffff; mach_sched_init = bvme6000_sched_init; mach_init_IRQ = bvme6000_init_IRQ; - arch_gettimeoffset = bvme6000_gettimeoffset; mach_hwclk = bvme6000_hwclk; mach_reset = bvme6000_reset; mach_get_model = bvme6000_get_model; @@ -154,15 +148,38 @@ irqreturn_t bvme6000_abort_int (int irq, void *dev_id) return IRQ_HANDLED; } +static u64 bvme6000_read_clk(struct clocksource *cs); + +static struct clocksource bvme6000_clk = { + .name = "rtc", + .rating = 250, + .read = bvme6000_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total, clk_offset; + +#define RTC_TIMER_CLOCK_FREQ 8000000 +#define RTC_TIMER_CYCLES (RTC_TIMER_CLOCK_FREQ / HZ) +#define RTC_TIMER_COUNT ((RTC_TIMER_CYCLES / 2) - 1) static irqreturn_t bvme6000_timer_int (int irq, void *dev_id) { + irq_handler_t timer_routine = dev_id; + unsigned long flags; volatile RtcPtr_t rtc = (RtcPtr_t)BVME_RTC_BASE; - unsigned char msr = rtc->msr & 0xc0; + unsigned char msr; + local_irq_save(flags); + msr = rtc->msr & 0xc0; rtc->msr = msr | 0x20; /* Ack the interrupt */ + clk_total += RTC_TIMER_CYCLES; + clk_offset = 0; + timer_routine(0, NULL); + local_irq_restore(flags); - return tick_handler(irq, dev_id); + return IRQ_HANDLED; } /* @@ -181,14 +198,13 @@ void bvme6000_sched_init (irq_handler_t timer_routine) rtc->msr = 0; /* Ensure timer registers accessible */ - tick_handler = timer_routine; - if (request_irq(BVME_IRQ_RTC, bvme6000_timer_int, 0, - "timer", bvme6000_timer_int)) + if (request_irq(BVME_IRQ_RTC, bvme6000_timer_int, IRQF_TIMER, "timer", + timer_routine)) panic ("Couldn't register timer int"); rtc->t1cr_omr = 0x04; /* Mode 2, ext clk */ - rtc->t1msb = 39999 >> 8; - rtc->t1lsb = 39999 & 0xff; + rtc->t1msb = RTC_TIMER_COUNT >> 8; + rtc->t1lsb = RTC_TIMER_COUNT & 0xff; rtc->irr_icr1 &= 0xef; /* Route timer 1 to INTR pin */ rtc->msr = 0x40; /* Access int.cntrl, etc */ rtc->pfr_icr0 = 0x80; /* Just timer 1 ints enabled */ @@ -200,14 +216,14 @@ void bvme6000_sched_init (irq_handler_t timer_routine) rtc->msr = msr; + clocksource_register_hz(&bvme6000_clk, RTC_TIMER_CLOCK_FREQ); + if (request_irq(BVME_IRQ_ABORT, bvme6000_abort_int, 0, "abort", bvme6000_abort_int)) panic ("Couldn't register abort int"); } -/* This is always executed with interrupts disabled. 
*/ - /* * NOTE: Don't accept any readings within 5us of rollover, as * the T1INT bit may be a little slow getting set. There is also @@ -215,14 +231,18 @@ void bvme6000_sched_init (irq_handler_t timer_routine) * results... */ -u32 bvme6000_gettimeoffset(void) +static u64 bvme6000_read_clk(struct clocksource *cs) { + unsigned long flags; volatile RtcPtr_t rtc = (RtcPtr_t)BVME_RTC_BASE; volatile PitRegsPtr pit = (PitRegsPtr)BVME_PIT_BASE; - unsigned char msr = rtc->msr & 0xc0; + unsigned char msr, msb; unsigned char t1int, t1op; u32 v = 800000, ov; + local_irq_save(flags); + + msr = rtc->msr & 0xc0; rtc->msr = 0; /* Ensure timer registers accessible */ do { @@ -230,22 +250,25 @@ u32 bvme6000_gettimeoffset(void) t1int = rtc->msr & 0x20; t1op = pit->pcdr & 0x04; rtc->t1cr_omr |= 0x40; /* Latch timer1 */ - v = rtc->t1msb << 8; /* Read timer1 */ - v |= rtc->t1lsb; /* Read timer1 */ + msb = rtc->t1msb; /* Read timer1 */ + v = (msb << 8) | rtc->t1lsb; /* Read timer1 */ } while (t1int != (rtc->msr & 0x20) || t1op != (pit->pcdr & 0x04) || abs(ov-v) > 80 || - v > 39960); + v > RTC_TIMER_COUNT - (RTC_TIMER_COUNT / 100)); - v = 39999 - v; + v = RTC_TIMER_COUNT - v; if (!t1op) /* If in second half cycle.. */ - v += 40000; - v /= 8; /* Convert ticks to microseconds */ - if (t1int) - v += 10000; /* Int pending, + 10ms */ + v += RTC_TIMER_CYCLES / 2; + if (msb > 0 && t1int) + clk_offset = RTC_TIMER_CYCLES; rtc->msr = msr; - return v * 1000; + v += clk_offset + clk_total; + + local_irq_restore(flags); + + return v; } /* diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 525421ae277d..fea392cfcf1b 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -56,6 +56,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -210,9 +211,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -234,9 +232,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -313,7 +308,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -460,12 +454,12 @@ CONFIG_RTC_DRV_RP5C01=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -573,9 +567,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -640,6 +636,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -649,4 +646,5 @@ 
CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/apollo_defconfig b/arch/m68k/configs/apollo_defconfig index db0e654a88d5..2474d267460e 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -52,6 +52,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -206,9 +207,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -230,9 +228,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -309,7 +304,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -420,12 +414,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -533,9 +527,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -600,6 +596,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -609,4 +606,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index 1451168eb789..0fc7d2992fe0 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -59,6 +59,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -213,9 +214,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -237,9 +235,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -316,7 +311,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -442,12 +436,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is 
not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -555,9 +549,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -622,6 +618,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -631,4 +628,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index b0d3609f5bb3..699df9fdf866 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -49,6 +49,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -203,9 +204,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -227,9 +225,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -306,7 +301,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -413,12 +407,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -526,9 +520,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -593,6 +589,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -602,4 +599,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 4ed7c151347c..b50802255324 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -51,6 +51,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -205,9 +206,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m 
-CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -229,9 +227,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -308,7 +303,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -422,12 +416,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -535,9 +529,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -602,6 +598,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -611,4 +608,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 0dc544e1ce1f..04e7d70f6030 100644 --- a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -50,6 +50,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -204,9 +205,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -228,9 +226,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -310,7 +305,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -444,12 +438,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -557,9 +551,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -624,6 +620,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m 
CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -633,4 +630,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 5a7b7b0d6e72..5e1cc4c17852 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -70,6 +70,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -224,9 +225,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -248,9 +246,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -330,7 +325,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -526,12 +520,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -639,9 +633,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -706,6 +702,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -715,4 +712,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 71eb9be1803b..170ac8792c2d 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -48,6 +48,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -202,9 +203,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -226,9 +224,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -305,7 +300,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -412,12 +406,12 @@ CONFIG_RTC_DRV_GENERIC=m # 
CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -525,9 +519,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -592,6 +588,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -601,4 +598,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index ea2ebd4241c0..d865592a423e 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -49,6 +49,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -203,9 +204,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -227,9 +225,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -306,7 +301,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -413,12 +407,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -526,9 +520,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -593,6 +589,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -602,4 +599,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index cef6dc47c725..034a9de90484 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -50,6 +50,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -204,9 +205,6 @@ CONFIG_NFT_FIB_IPV4=m 
CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -228,9 +226,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -307,7 +302,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -431,12 +425,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -544,9 +538,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -611,6 +607,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -620,4 +617,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 69f2282dc4e9..49be0f9fcd8d 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -46,6 +46,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -200,9 +201,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -224,9 +222,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -303,7 +298,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -415,12 +409,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -528,9 +522,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -595,6 +591,7 @@ 
CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -604,3 +601,4 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index e91267e868b2..a71acf4a6004 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -46,6 +46,7 @@ CONFIG_TLS=m CONFIG_XFRM_MIGRATE=y CONFIG_NET_KEY=y CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=m CONFIG_INET=y CONFIG_IP_PNP=y CONFIG_IP_PNP_DHCP=y @@ -200,9 +201,6 @@ CONFIG_NFT_FIB_IPV4=m CONFIG_NF_TABLES_ARP=y CONFIG_NF_FLOW_TABLE_IPV4=m CONFIG_NF_LOG_ARP=m -CONFIG_NFT_CHAIN_NAT_IPV4=m -CONFIG_NFT_MASQ_IPV4=m -CONFIG_NFT_REDIR_IPV4=m CONFIG_IP_NF_IPTABLES=m CONFIG_IP_NF_MATCH_AH=m CONFIG_IP_NF_MATCH_ECN=m @@ -224,9 +222,6 @@ CONFIG_IP_NF_ARPTABLES=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m CONFIG_NFT_CHAIN_ROUTE_IPV6=m -CONFIG_NFT_CHAIN_NAT_IPV6=m -CONFIG_NFT_MASQ_IPV6=m -CONFIG_NFT_REDIR_IPV6=m CONFIG_NFT_DUP_IPV6=m CONFIG_NFT_FIB_IPV6=m CONFIG_NF_FLOW_TABLE_IPV6=m @@ -303,7 +298,6 @@ CONFIG_AF_KCM=m # CONFIG_WIRELESS is not set CONFIG_PSAMPLE=m CONFIG_NET_IFE=m -CONFIG_NET_DEVLINK=m # CONFIG_UEVENT_HELPER is not set CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -414,12 +408,12 @@ CONFIG_RTC_DRV_GENERIC=m # CONFIG_VIRTIO_MENU is not set # CONFIG_IOMMU_SUPPORT is not set CONFIG_DAX=m +# CONFIG_VALIDATE_FS_PARSER is not set CONFIG_EXT4_FS=y CONFIG_REISERFS_FS=m CONFIG_JFS_FS=m CONFIG_OCFS2_FS=m # CONFIG_OCFS2_DEBUG_MASKLOG is not set -CONFIG_FS_ENCRYPTION=m CONFIG_FANOTIFY=y CONFIG_QUOTA_NETLINK_INTERFACE=y # CONFIG_PRINT_QUOTA_WARNING is not set @@ -527,9 +521,11 @@ CONFIG_CRYPTO_AEGIS256=m CONFIG_CRYPTO_MORUS640=m CONFIG_CRYPTO_MORUS1280=m CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTS=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_OFB=m CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m CONFIG_CRYPTO_KEYWRAP=m CONFIG_CRYPTO_ADIANTUM=m CONFIG_CRYPTO_XCBC=m @@ -594,6 +590,7 @@ CONFIG_TEST_OVERFLOW=m CONFIG_TEST_RHASHTABLE=m CONFIG_TEST_HASH=m CONFIG_TEST_IDA=m +CONFIG_TEST_VMALLOC=m CONFIG_TEST_USER_COPY=m CONFIG_TEST_BPF=m CONFIG_FIND_BIT_BENCHMARK=m @@ -603,4 +600,5 @@ CONFIG_TEST_UDELAY=m CONFIG_TEST_STATIC_KEYS=m CONFIG_TEST_KMOD=m CONFIG_TEST_MEMCAT_P=m +CONFIG_TEST_STACKINIT=m CONFIG_EARLY_PRINTK=y diff --git a/arch/m68k/hp300/config.c b/arch/m68k/hp300/config.c index a19bcd23f80b..a161d44fd20b 100644 --- a/arch/m68k/hp300/config.c +++ b/arch/m68k/hp300/config.c @@ -254,7 +254,6 @@ void __init config_hp300(void) mach_sched_init = hp300_sched_init; mach_init_IRQ = hp300_init_IRQ; mach_get_model = hp300_get_model; - arch_gettimeoffset = hp300_gettimeoffset; mach_hwclk = hp300_hwclk; mach_get_ss = hp300_get_ss; mach_reset = hp300_reset; diff --git a/arch/m68k/hp300/time.c b/arch/m68k/hp300/time.c index 289d928a46cb..bfee13e1d0fe 100644 --- a/arch/m68k/hp300/time.c +++ b/arch/m68k/hp300/time.c @@ -8,6 +8,7 @@ */ #include <asm/ptrace.h> +#include <linux/clocksource.h> #include <linux/types.h> #include <linux/init.h> #include <linux/sched.h> @@ -19,6 +20,18 @@ #include <asm/traps.h> #include <asm/blinken.h> +static u64 hp300_read_clk(struct clocksource *cs); + +static struct clocksource hp300_clk = { + .name = "timer", + .rating = 250, + .read = hp300_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total, clk_offset; + /* 
Clock hardware definitions */ #define CLOCKBASE 0xf05f8000 @@ -28,39 +41,61 @@ #define CLKCR3 CLKCR1 #define CLKSR CLKCR2 #define CLKMSB1 0x5 +#define CLKLSB1 0x7 #define CLKMSB2 0x9 #define CLKMSB3 0xD +#define CLKSR_INT1 BIT(0) + /* This is for machines which generate the exact clock. */ -#define USECS_PER_JIFFY (1000000/HZ) -#define INTVAL ((10000 / 4) - 1) +#define HP300_TIMER_CLOCK_FREQ 250000 +#define HP300_TIMER_CYCLES (HP300_TIMER_CLOCK_FREQ / HZ) +#define INTVAL (HP300_TIMER_CYCLES - 1) static irqreturn_t hp300_tick(int irq, void *dev_id) { + irq_handler_t timer_routine = dev_id; + unsigned long flags; unsigned long tmp; - irq_handler_t vector = dev_id; + + local_irq_save(flags); in_8(CLOCKBASE + CLKSR); asm volatile ("movpw %1@(5),%0" : "=d" (tmp) : "a" (CLOCKBASE)); + clk_total += INTVAL; + clk_offset = 0; + timer_routine(0, NULL); + local_irq_restore(flags); + /* Turn off the network and SCSI leds */ blinken_leds(0, 0xe0); - return vector(irq, NULL); + return IRQ_HANDLED; } -u32 hp300_gettimeoffset(void) +static u64 hp300_read_clk(struct clocksource *cs) { - /* Read current timer 1 value */ - unsigned char lsb, msb1, msb2; - unsigned short ticks; - - msb1 = in_8(CLOCKBASE + 5); - lsb = in_8(CLOCKBASE + 7); - msb2 = in_8(CLOCKBASE + 5); - if (msb1 != msb2) - /* A carry happened while we were reading. Read it again */ - lsb = in_8(CLOCKBASE + 7); - ticks = INTVAL - ((msb2 << 8) | lsb); - return ((USECS_PER_JIFFY * ticks) / INTVAL) * 1000; + unsigned long flags; + unsigned char lsb, msb, msb_new; + u32 ticks; + + local_irq_save(flags); + /* Read current timer 1 value */ + msb = in_8(CLOCKBASE + CLKMSB1); +again: + if ((in_8(CLOCKBASE + CLKSR) & CLKSR_INT1) && msb > 0) + clk_offset = INTVAL; + lsb = in_8(CLOCKBASE + CLKLSB1); + msb_new = in_8(CLOCKBASE + CLKMSB1); + if (msb_new != msb) { + msb = msb_new; + goto again; + } + + ticks = INTVAL - ((msb << 8) | lsb); + ticks += clk_offset + clk_total; + local_irq_restore(flags); + + return ticks; } void __init hp300_sched_init(irq_handler_t vector) @@ -70,9 +105,11 @@ void __init hp300_sched_init(irq_handler_t vector) asm volatile(" movpw %0,%1@(5)" : : "d" (INTVAL), "a" (CLOCKBASE)); - if (request_irq(IRQ_AUTO_6, hp300_tick, 0, "timer tick", vector)) + if (request_irq(IRQ_AUTO_6, hp300_tick, IRQF_TIMER, "timer tick", vector)) pr_err("Couldn't register timer interrupt\n"); out_8(CLOCKBASE + CLKCR2, 0x1); /* select CR1 */ out_8(CLOCKBASE + CLKCR1, 0x40); /* enable irq */ + + clocksource_register_hz(&hp300_clk, HP300_TIMER_CLOCK_FREQ); } diff --git a/arch/m68k/hp300/time.h b/arch/m68k/hp300/time.h index f5583ec4033d..1d77b55cc72a 100644 --- a/arch/m68k/hp300/time.h +++ b/arch/m68k/hp300/time.h @@ -1,2 +1 @@ extern void hp300_sched_init(irq_handler_t vector); -extern u32 hp300_gettimeoffset(void); diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 2c359d9e80f6..0ddae4a74adb 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -18,6 +18,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += percpu.h generic-y += preempt.h generic-y += sections.h diff --git a/arch/m68k/include/asm/io_mm.h b/arch/m68k/include/asm/io_mm.h index 782b78f8a048..6c03ca5bc436 100644 --- a/arch/m68k/include/asm/io_mm.h +++ b/arch/m68k/include/asm/io_mm.h @@ -377,8 +377,6 @@ static inline void isa_delay(void) #define writesw(port, buf, nr) raw_outsw((port), (u16 *)(buf), (nr)) #define writesl(port, buf, nr) raw_outsl((port), 
(u32 *)(buf), (nr)) -#define mmiowb() - #ifndef CONFIG_SUN3 #define IO_SPACE_LIMIT 0xffff #else diff --git a/arch/m68k/include/asm/mvme147hw.h b/arch/m68k/include/asm/mvme147hw.h index 9c7ff67c5ffd..257b29184af9 100644 --- a/arch/m68k/include/asm/mvme147hw.h +++ b/arch/m68k/include/asm/mvme147hw.h @@ -66,7 +66,7 @@ struct pcc_regs { #define PCC_INT_ENAB 0x08 #define PCC_TIMER_INT_CLR 0x80 -#define PCC_TIMER_PRELOAD 63936l +#define PCC_TIMER_CLR_OVF 0x04 #define PCC_LEVEL_ABORT 0x07 #define PCC_LEVEL_SERIAL 0x04 diff --git a/arch/m68k/include/asm/tlb.h b/arch/m68k/include/asm/tlb.h index b4b9efb6f963..3c81f6adfc8b 100644 --- a/arch/m68k/include/asm/tlb.h +++ b/arch/m68k/include/asm/tlb.h @@ -2,20 +2,6 @@ #ifndef _M68K_TLB_H #define _M68K_TLB_H -/* - * m68k doesn't need any special per-pte or - * per-vma handling.. - */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. because we flush the whole mm when it - * fills up. - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #endif /* _M68K_TLB_H */ diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 125c14178979..df4ec3ec71d1 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -423,3 +423,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index cd9317d53276..11be08f4f750 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -54,8 +54,6 @@ struct mac_booter_data mac_bi_data; /* The phys. video addr. - might be bogus on some machines */ static unsigned long mac_orig_videoaddr; -/* Mac specific timer functions */ -extern u32 mac_gettimeoffset(void); extern int mac_hwclk(int, struct rtc_time *); extern void iop_preinit(void); extern void iop_init(void); @@ -155,7 +153,6 @@ void __init config_mac(void) mach_sched_init = mac_sched_init; mach_init_IRQ = mac_init_IRQ; mach_get_model = mac_get_model; - arch_gettimeoffset = mac_gettimeoffset; mach_hwclk = mac_hwclk; mach_reset = mac_reset; mach_halt = mac_poweroff; diff --git a/arch/m68k/mac/via.c b/arch/m68k/mac/via.c index 0b0289459173..3c2cfcb74982 100644 --- a/arch/m68k/mac/via.c +++ b/arch/m68k/mac/via.c @@ -23,6 +23,7 @@ * */ +#include <linux/clocksource.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> @@ -55,16 +56,6 @@ static __u8 rbv_clear; static int gIER,gIFR,gBufA,gBufB; /* - * Timer defs. - */ - -#define TICK_SIZE 10000 -#define MAC_CLOCK_TICK (783300/HZ) /* ticks per HZ */ -#define MAC_CLOCK_LOW (MAC_CLOCK_TICK&0xFF) -#define MAC_CLOCK_HIGH (MAC_CLOCK_TICK>>8) - - -/* * On Macs with a genuine VIA chip there is no way to mask an individual slot * interrupt. This limitation also seems to apply to VIA clone logic cores in * Quadra-like ASICs. (RBV and OSS machines don't have this limitation.) 
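All of the m68k conversions in this series follow one pattern: the per-platform *_gettimeoffset() helpers behind the old arch_gettimeoffset hook are replaced by a registered struct clocksource. The tick handler accumulates elapsed cycles in clk_total, and the read callback returns clk_total plus the cycles counted off the hardware timer since the last tick, with clk_offset bridging the window where the counter has already wrapped but the tick interrupt has not yet run. Below is a minimal sketch of that pattern, assuming a down-counter that reloads every jiffy; foo_*, FOO_TIMER_IRQ, read_hw_counter() and tick_interrupt_pending() are illustrative stand-ins, not symbols from these patches.

#include <linux/kernel.h>
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/interrupt.h>

#define FOO_TIMER_CLOCK_FREQ	250000				/* assumed input clock (Hz) */
#define FOO_TIMER_CYCLES	(FOO_TIMER_CLOCK_FREQ / HZ)	/* cycles per jiffy */
#define FOO_TIMER_IRQ		1				/* hypothetical IRQ number */

/* Hypothetical hardware accessors, stubbed so the sketch is self-contained */
static inline u32 read_hw_counter(void)	{ return 0; }	/* cycles since last reload */
static inline bool tick_interrupt_pending(void)	{ return false; }

static u32 clk_total, clk_offset;	/* cycles accounted by the tick handler */

static u64 foo_read_clk(struct clocksource *cs)
{
	unsigned long flags;
	u32 ticks;

	local_irq_save(flags);
	ticks = read_hw_counter();
	/* counter wrapped but the tick IRQ hasn't updated clk_total yet */
	if (tick_interrupt_pending())
		clk_offset = FOO_TIMER_CYCLES;
	ticks += clk_offset + clk_total;
	local_irq_restore(flags);

	return ticks;
}

static struct clocksource foo_clk = {
	.name	= "foo",
	.rating	= 250,
	.read	= foo_read_clk,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static irqreturn_t foo_timer_handler(int irq, void *dev_id)
{
	irq_handler_t timer_routine = dev_id;

	clk_total += FOO_TIMER_CYCLES;	/* one full timer period has elapsed */
	clk_offset = 0;			/* any pending wrap is now in clk_total */
	timer_routine(0, NULL);		/* invoke the generic m68k tick handler */

	return IRQ_HANDLED;
}

static void __init foo_sched_init(irq_handler_t timer_routine)
{
	if (request_irq(FOO_TIMER_IRQ, foo_timer_handler, IRQF_TIMER, "timer",
			timer_routine))
		pr_err("Couldn't register timer interrupt\n");

	clocksource_register_hz(&foo_clk, FOO_TIMER_CLOCK_FREQ);
}

Passing the generic tick routine as dev_id, as the patches above do, removes the static tick_handler pointers several of these boards used to carry; the read callback masks interrupts so clk_total, clk_offset and the hardware counter are sampled consistently, and the clocksource core handles 32-bit wrap-around via .mask.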
@@ -272,22 +263,6 @@ void __init via_init(void) } /* - * Start the 100 Hz clock - */ - -void __init via_init_clock(irq_handler_t func) -{ - via1[vACR] |= 0x40; - via1[vT1LL] = MAC_CLOCK_LOW; - via1[vT1LH] = MAC_CLOCK_HIGH; - via1[vT1CL] = MAC_CLOCK_LOW; - via1[vT1CH] = MAC_CLOCK_HIGH; - - if (request_irq(IRQ_MAC_TIMER_1, func, 0, "timer", func)) - pr_err("Couldn't register %s interrupt\n", "timer"); -} - -/* * Debugging dump, used in various places to see what's going on. */ @@ -315,29 +290,6 @@ void via_debug_dump(void) } /* - * This is always executed with interrupts disabled. - * - * TBI: get time offset between scheduling timer ticks - */ - -u32 mac_gettimeoffset(void) -{ - unsigned long ticks, offset = 0; - - /* read VIA1 timer 2 current value */ - ticks = via1[vT1CL] | (via1[vT1CH] << 8); - /* The probability of underflow is less than 2% */ - if (ticks > MAC_CLOCK_TICK - MAC_CLOCK_TICK / 50) - /* Check for pending timer interrupt in VIA1 IFR */ - if (via1[vIFR] & 0x40) offset = TICK_SIZE; - - ticks = MAC_CLOCK_TICK - ticks; - ticks = ticks * 10000L / MAC_CLOCK_TICK; - - return (ticks + offset) * 1000; -} - -/* * Flush the L2 cache on Macs that have it by flipping * the system into 24-bit mode for an instant. */ @@ -440,6 +392,8 @@ void via_nubus_irq_shutdown(int irq) * via6522.c :-), disable/pending masks added. */ +#define VIA_TIMER_1_INT BIT(6) + void via1_irq(struct irq_desc *desc) { int irq_num; @@ -449,6 +403,21 @@ void via1_irq(struct irq_desc *desc) if (!events) return; + irq_num = IRQ_MAC_TIMER_1; + irq_bit = VIA_TIMER_1_INT; + if (events & irq_bit) { + unsigned long flags; + + local_irq_save(flags); + via1[vIFR] = irq_bit; + generic_handle_irq(irq_num); + local_irq_restore(flags); + + events &= ~irq_bit; + if (!events) + return; + } + irq_num = VIA1_SOURCE_BASE; irq_bit = 1; do { @@ -605,3 +574,82 @@ int via2_scsi_drq_pending(void) return via2[gIFR] & (1 << IRQ_IDX(IRQ_MAC_SCSIDRQ)); } EXPORT_SYMBOL(via2_scsi_drq_pending); + +/* timer and clock source */ + +#define VIA_CLOCK_FREQ 783360 /* VIA "phase 2" clock in Hz */ +#define VIA_TIMER_CYCLES (VIA_CLOCK_FREQ / HZ) /* clock cycles per jiffy */ + +#define VIA_TC (VIA_TIMER_CYCLES - 2) /* including 0 and -1 */ +#define VIA_TC_LOW (VIA_TC & 0xFF) +#define VIA_TC_HIGH (VIA_TC >> 8) + +static u64 mac_read_clk(struct clocksource *cs); + +static struct clocksource mac_clk = { + .name = "via1", + .rating = 250, + .read = mac_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total, clk_offset; + +static irqreturn_t via_timer_handler(int irq, void *dev_id) +{ + irq_handler_t timer_routine = dev_id; + + clk_total += VIA_TIMER_CYCLES; + clk_offset = 0; + timer_routine(0, NULL); + + return IRQ_HANDLED; +} + +void __init via_init_clock(irq_handler_t timer_routine) +{ + if (request_irq(IRQ_MAC_TIMER_1, via_timer_handler, IRQF_TIMER, "timer", + timer_routine)) { + pr_err("Couldn't register %s interrupt\n", "timer"); + return; + } + + via1[vT1LL] = VIA_TC_LOW; + via1[vT1LH] = VIA_TC_HIGH; + via1[vT1CL] = VIA_TC_LOW; + via1[vT1CH] = VIA_TC_HIGH; + via1[vACR] |= 0x40; + + clocksource_register_hz(&mac_clk, VIA_CLOCK_FREQ); +} + +static u64 mac_read_clk(struct clocksource *cs) +{ + unsigned long flags; + u8 count_high; + u16 count; + u32 ticks; + + /* + * Timer counter wrap-around is detected with the timer interrupt flag + * but reading the counter low byte (vT1CL) would reset the flag. + * Also, accessing both counter registers is essentially a data race. 
+ * These problems are avoided by ignoring the low byte. Clock accuracy + * is 256 times worse (error can reach 0.327 ms) but CPU overhead is + * reduced by avoiding slow VIA register accesses. + */ + + local_irq_save(flags); + count_high = via1[vT1CH]; + if (count_high == 0xFF) + count_high = 0; + if (count_high > 0 && (via1[vIFR] & VIA_TIMER_1_INT)) + clk_offset = VIA_TIMER_CYCLES; + count = count_high << 8; + ticks = VIA_TIMER_CYCLES - count; + ticks += clk_offset + clk_total; + local_irq_restore(flags); + + return ticks; +} diff --git a/arch/m68k/mvme147/config.c b/arch/m68k/mvme147/config.c index adea549d240e..545a1fe0e119 100644 --- a/arch/m68k/mvme147/config.c +++ b/arch/m68k/mvme147/config.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/tty.h> +#include <linux/clocksource.h> #include <linux/console.h> #include <linux/linkage.h> #include <linux/init.h> @@ -38,18 +39,12 @@ static void mvme147_get_model(char *model); extern void mvme147_sched_init(irq_handler_t handler); -extern u32 mvme147_gettimeoffset(void); extern int mvme147_hwclk (int, struct rtc_time *); extern void mvme147_reset (void); static int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to xtime_update() in - * kernel/time/timekeeping.c, called via mvme147_process_int() */ - -irq_handler_t tick_handler; - int __init mvme147_parse_bootinfo(const struct bi_record *bi) { @@ -89,7 +84,6 @@ void __init config_mvme147(void) mach_max_dma_address = 0x01000000; mach_sched_init = mvme147_sched_init; mach_init_IRQ = mvme147_init_IRQ; - arch_gettimeoffset = mvme147_gettimeoffset; mach_hwclk = mvme147_hwclk; mach_reset = mvme147_reset; mach_get_model = mvme147_get_model; @@ -99,45 +93,76 @@ void __init config_mvme147(void) vme_brdtype = VME_TYPE_MVME147; } +static u64 mvme147_read_clk(struct clocksource *cs); + +static struct clocksource mvme147_clk = { + .name = "pcc", + .rating = 250, + .read = mvme147_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total; + +#define PCC_TIMER_CLOCK_FREQ 160000 +#define PCC_TIMER_CYCLES (PCC_TIMER_CLOCK_FREQ / HZ) +#define PCC_TIMER_PRELOAD (0x10000 - PCC_TIMER_CYCLES) /* Using pcc tick timer 1 */ static irqreturn_t mvme147_timer_int (int irq, void *dev_id) { + irq_handler_t timer_routine = dev_id; + unsigned long flags; + + local_irq_save(flags); m147_pcc->t1_int_cntrl = PCC_TIMER_INT_CLR; - m147_pcc->t1_int_cntrl = PCC_INT_ENAB|PCC_LEVEL_TIMER1; - return tick_handler(irq, dev_id); + m147_pcc->t1_cntrl = PCC_TIMER_CLR_OVF; + clk_total += PCC_TIMER_CYCLES; + timer_routine(0, NULL); + local_irq_restore(flags); + + return IRQ_HANDLED; } void mvme147_sched_init (irq_handler_t timer_routine) { - tick_handler = timer_routine; - if (request_irq(PCC_IRQ_TIMER1, mvme147_timer_int, 0, "timer 1", NULL)) + if (request_irq(PCC_IRQ_TIMER1, mvme147_timer_int, IRQF_TIMER, + "timer 1", timer_routine)) pr_err("Couldn't register timer interrupt\n"); /* Init the clock with a value */ - /* our clock goes off every 6.25us */ + /* The clock counter increments until 0xFFFF then reloads */ m147_pcc->t1_preload = PCC_TIMER_PRELOAD; m147_pcc->t1_cntrl = 0x0; /* clear timer */ m147_pcc->t1_cntrl = 0x3; /* start timer */ m147_pcc->t1_int_cntrl = PCC_TIMER_INT_CLR; /* clear pending ints */ m147_pcc->t1_int_cntrl = PCC_INT_ENAB|PCC_LEVEL_TIMER1; + + clocksource_register_hz(&mvme147_clk, PCC_TIMER_CLOCK_FREQ); } -/* This is always executed with interrupts disabled. 
*/ -/* XXX There are race hazards in this code XXX */ -u32 mvme147_gettimeoffset(void) +static u64 mvme147_read_clk(struct clocksource *cs) { - volatile unsigned short *cp = (volatile unsigned short *)0xfffe1012; - unsigned short n; - - n = *cp; - while (n != *cp) - n = *cp; - - n -= PCC_TIMER_PRELOAD; - return ((unsigned long)n * 25 / 4) * 1000; + unsigned long flags; + u8 overflow, tmp; + u16 count; + u32 ticks; + + local_irq_save(flags); + tmp = m147_pcc->t1_cntrl >> 4; + count = m147_pcc->t1_count; + overflow = m147_pcc->t1_cntrl >> 4; + if (overflow != tmp) + count = m147_pcc->t1_count; + count -= PCC_TIMER_PRELOAD; + ticks = count + overflow * PCC_TIMER_CYCLES; + ticks += clk_total; + local_irq_restore(flags); + + return ticks; } static int bcd2int (unsigned char b) diff --git a/arch/m68k/mvme16x/config.c b/arch/m68k/mvme16x/config.c index 6ee36a5b528d..9bc2da69f80c 100644 --- a/arch/m68k/mvme16x/config.c +++ b/arch/m68k/mvme16x/config.c @@ -19,6 +19,7 @@ #include <linux/mm.h> #include <linux/seq_file.h> #include <linux/tty.h> +#include <linux/clocksource.h> #include <linux/console.h> #include <linux/linkage.h> #include <linux/init.h> @@ -44,17 +45,11 @@ static MK48T08ptr_t volatile rtc = (MK48T08ptr_t)MVME_RTC_BASE; static void mvme16x_get_model(char *model); extern void mvme16x_sched_init(irq_handler_t handler); -extern u32 mvme16x_gettimeoffset(void); extern int mvme16x_hwclk (int, struct rtc_time *); extern void mvme16x_reset (void); int bcd2int (unsigned char b); -/* Save tick handler routine pointer, will point to xtime_update() in - * kernel/time/timekeeping.c, called via mvme16x_process_int() */ - -static irq_handler_t tick_handler; - unsigned short mvme16x_config; EXPORT_SYMBOL(mvme16x_config); @@ -120,11 +115,11 @@ static void __init mvme16x_init_IRQ (void) m68k_setup_user_interrupt(VEC_USER, 192); } -#define pcc2chip ((volatile u_char *)0xfff42000) -#define PccSCCMICR 0x1d -#define PccSCCTICR 0x1e -#define PccSCCRICR 0x1f -#define PccTPIACKR 0x25 +#define PCC2CHIP (0xfff42000) +#define PCCSCCMICR (PCC2CHIP + 0x1d) +#define PCCSCCTICR (PCC2CHIP + 0x1e) +#define PCCSCCRICR (PCC2CHIP + 0x1f) +#define PCCTPIACKR (PCC2CHIP + 0x25) #ifdef CONFIG_EARLY_PRINTK @@ -232,10 +227,10 @@ void mvme16x_cons_write(struct console *co, const char *str, unsigned count) base_addr[CyIER] = CyTxMpty; while (1) { - if (pcc2chip[PccSCCTICR] & 0x20) + if (in_8(PCCSCCTICR) & 0x20) { /* We have a Tx int. 
Acknowledge it */ - sink = pcc2chip[PccTPIACKR]; + sink = in_8(PCCTPIACKR); if ((base_addr[CyLICR] >> 2) == port) { if (i == count) { /* Last char of string is now output */ @@ -277,7 +272,6 @@ void __init config_mvme16x(void) mach_max_dma_address = 0xffffffff; mach_sched_init = mvme16x_sched_init; mach_init_IRQ = mvme16x_init_IRQ; - arch_gettimeoffset = mvme16x_gettimeoffset; mach_hwclk = mvme16x_hwclk; mach_reset = mvme16x_reset; mach_get_model = mvme16x_get_model; @@ -350,10 +344,46 @@ static irqreturn_t mvme16x_abort_int (int irq, void *dev_id) return IRQ_HANDLED; } +static u64 mvme16x_read_clk(struct clocksource *cs); + +static struct clocksource mvme16x_clk = { + .name = "pcc", + .rating = 250, + .read = mvme16x_read_clk, + .mask = CLOCKSOURCE_MASK(32), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, +}; + +static u32 clk_total; + +#define PCC_TIMER_CLOCK_FREQ 1000000 +#define PCC_TIMER_CYCLES (PCC_TIMER_CLOCK_FREQ / HZ) + +#define PCCTCMP1 (PCC2CHIP + 0x04) +#define PCCTCNT1 (PCC2CHIP + 0x08) +#define PCCTOVR1 (PCC2CHIP + 0x17) +#define PCCTIC1 (PCC2CHIP + 0x1b) + +#define PCCTOVR1_TIC_EN 0x01 +#define PCCTOVR1_COC_EN 0x02 +#define PCCTOVR1_OVR_CLR 0x04 + +#define PCCTIC1_INT_CLR 0x08 +#define PCCTIC1_INT_EN 0x10 + static irqreturn_t mvme16x_timer_int (int irq, void *dev_id) { - *(volatile unsigned char *)0xfff4201b |= 8; - return tick_handler(irq, dev_id); + irq_handler_t timer_routine = dev_id; + unsigned long flags; + + local_irq_save(flags); + out_8(PCCTIC1, in_8(PCCTIC1) | PCCTIC1_INT_CLR); + out_8(PCCTOVR1, PCCTOVR1_OVR_CLR); + clk_total += PCC_TIMER_CYCLES; + timer_routine(0, NULL); + local_irq_restore(flags); + + return IRQ_HANDLED; } void mvme16x_sched_init (irq_handler_t timer_routine) @@ -361,16 +391,17 @@ void mvme16x_sched_init (irq_handler_t timer_routine) uint16_t brdno = be16_to_cpu(mvme_bdid.brdno); int irq; - tick_handler = timer_routine; /* Using PCCchip2 or MC2 chip tick timer 1 */ - *(volatile unsigned long *)0xfff42008 = 0; - *(volatile unsigned long *)0xfff42004 = 10000; /* 10ms */ - *(volatile unsigned char *)0xfff42017 |= 3; - *(volatile unsigned char *)0xfff4201b = 0x16; - if (request_irq(MVME16x_IRQ_TIMER, mvme16x_timer_int, 0, - "timer", mvme16x_timer_int)) + out_be32(PCCTCNT1, 0); + out_be32(PCCTCMP1, PCC_TIMER_CYCLES); + out_8(PCCTOVR1, in_8(PCCTOVR1) | PCCTOVR1_TIC_EN | PCCTOVR1_COC_EN); + out_8(PCCTIC1, PCCTIC1_INT_EN | 6); + if (request_irq(MVME16x_IRQ_TIMER, mvme16x_timer_int, IRQF_TIMER, "timer", + timer_routine)) panic ("Couldn't register timer int"); + clocksource_register_hz(&mvme16x_clk, PCC_TIMER_CLOCK_FREQ); + if (brdno == 0x0162 || brdno == 0x172) irq = MVME162_IRQ_ABORT; else @@ -380,11 +411,23 @@ void mvme16x_sched_init (irq_handler_t timer_routine) panic ("Couldn't register abort int"); } - -/* This is always executed with interrupts disabled. 
*/ -u32 mvme16x_gettimeoffset(void) +static u64 mvme16x_read_clk(struct clocksource *cs) { - return (*(volatile u32 *)0xfff42008) * 1000; + unsigned long flags; + u8 overflow, tmp; + u32 ticks; + + local_irq_save(flags); + tmp = in_8(PCCTOVR1) >> 4; + ticks = in_be32(PCCTCNT1); + overflow = in_8(PCCTOVR1) >> 4; + if (overflow != tmp) + ticks = in_be32(PCCTCNT1); + ticks += overflow * PCC_TIMER_CYCLES; + ticks += clk_total; + local_irq_restore(flags); + + return ticks; } int bcd2int (unsigned char b) diff --git a/arch/m68k/q40/config.c b/arch/m68k/q40/config.c index 96810d91da2b..e63eb5f06999 100644 --- a/arch/m68k/q40/config.c +++ b/arch/m68k/q40/config.c @@ -40,7 +40,6 @@ extern void q40_init_IRQ(void); static void q40_get_model(char *model); extern void q40_sched_init(irq_handler_t handler); -static u32 q40_gettimeoffset(void); static int q40_hwclk(int, struct rtc_time *); static unsigned int q40_get_ss(void); static int q40_get_rtc_pll(struct rtc_pll_info *pll); @@ -169,7 +168,6 @@ void __init config_q40(void) mach_sched_init = q40_sched_init; mach_init_IRQ = q40_init_IRQ; - arch_gettimeoffset = q40_gettimeoffset; mach_hwclk = q40_hwclk; mach_get_ss = q40_get_ss; mach_get_rtc_pll = q40_get_rtc_pll; @@ -201,13 +199,6 @@ int __init q40_parse_bootinfo(const struct bi_record *rec) return 1; } - -static u32 q40_gettimeoffset(void) -{ - return 5000 * (ql_ticks != 0) * 1000; -} - - /* * Looks like op is non-zero for setting the clock, and zero for * reading the clock. diff --git a/arch/m68k/q40/q40ints.c b/arch/m68k/q40/q40ints.c index 3e7603202977..1c696906c159 100644 --- a/arch/m68k/q40/q40ints.c +++ b/arch/m68k/q40/q40ints.c @@ -127,10 +127,10 @@ void q40_mksound(unsigned int hz, unsigned int ticks) sound_ticks = ticks << 1; } -static irq_handler_t q40_timer_routine; - -static irqreturn_t q40_timer_int (int irq, void * dev) +static irqreturn_t q40_timer_int(int irq, void *dev_id) { + irq_handler_t timer_routine = dev_id; + ql_ticks = ql_ticks ? 0 : 1; if (sound_ticks) { unsigned char sval=(sound_ticks & 1) ? 
128-SVOL : 128+SVOL; @@ -139,8 +139,13 @@ static irqreturn_t q40_timer_int (int irq, void * dev) *DAC_RIGHT=sval; } - if (!ql_ticks) - q40_timer_routine(irq, dev); + if (!ql_ticks) { + unsigned long flags; + + local_irq_save(flags); + timer_routine(0, NULL); + local_irq_restore(flags); + } return IRQ_HANDLED; } @@ -148,11 +153,9 @@ void q40_sched_init (irq_handler_t timer_routine) { int timer_irq; - q40_timer_routine = timer_routine; timer_irq = Q40_IRQ_FRAME; - if (request_irq(timer_irq, q40_timer_int, 0, - "timer", q40_timer_int)) + if (request_irq(timer_irq, q40_timer_int, 0, "timer", timer_routine)) panic("Couldn't register timer int"); master_outb(-1, FRAME_CLEAR_REG); diff --git a/arch/m68k/sun3/config.c b/arch/m68k/sun3/config.c index 542c4404861c..229ea37dfe1b 100644 --- a/arch/m68k/sun3/config.c +++ b/arch/m68k/sun3/config.c @@ -37,7 +37,6 @@ char sun3_reserved_pmeg[SUN3_PMEGS_NUM]; -extern u32 sun3_gettimeoffset(void); static void sun3_sched_init(irq_handler_t handler); extern void sun3_get_model (char* model); extern int sun3_hwclk(int set, struct rtc_time *t); @@ -138,7 +137,6 @@ void __init config_sun3(void) mach_sched_init = sun3_sched_init; mach_init_IRQ = sun3_init_IRQ; mach_reset = sun3_reboot; - arch_gettimeoffset = sun3_gettimeoffset; mach_get_model = sun3_get_model; mach_hwclk = sun3_hwclk; mach_halt = sun3_halt; diff --git a/arch/m68k/sun3/intersil.c b/arch/m68k/sun3/intersil.c index d911070af02a..8fc74864de81 100644 --- a/arch/m68k/sun3/intersil.c +++ b/arch/m68k/sun3/intersil.c @@ -22,13 +22,6 @@ #define STOP_VAL (INTERSIL_STOP | INTERSIL_INT_ENABLE | INTERSIL_24H_MODE) #define START_VAL (INTERSIL_RUN | INTERSIL_INT_ENABLE | INTERSIL_24H_MODE) -/* does this need to be implemented? */ -u32 sun3_gettimeoffset(void) -{ - return 1000; -} - - /* get/set hwclock */ int sun3_hwclk(int set, struct rtc_time *t) diff --git a/arch/m68k/sun3/sun3ints.c b/arch/m68k/sun3/sun3ints.c index 6bbca30c9188..a5824abb4a39 100644 --- a/arch/m68k/sun3/sun3ints.c +++ b/arch/m68k/sun3/sun3ints.c @@ -61,8 +61,10 @@ static irqreturn_t sun3_int7(int irq, void *dev_id) static irqreturn_t sun3_int5(int irq, void *dev_id) { + unsigned long flags; unsigned int cnt; + local_irq_save(flags); #ifdef CONFIG_SUN3 intersil_clear(); #endif @@ -76,6 +78,7 @@ static irqreturn_t sun3_int5(int irq, void *dev_id) cnt = kstat_irqs_cpu(irq, 0); if (!(cnt % 20)) sun3_leds(led_pattern[cnt % 160 / 20]); + local_irq_restore(flags); return IRQ_HANDLED; } diff --git a/arch/m68k/sun3x/config.c b/arch/m68k/sun3x/config.c index 33d3a1c6fba0..03ce7f9facfe 100644 --- a/arch/m68k/sun3x/config.c +++ b/arch/m68k/sun3x/config.c @@ -49,7 +49,6 @@ void __init config_sun3x(void) mach_sched_init = sun3x_sched_init; mach_init_IRQ = sun3_init_IRQ; - arch_gettimeoffset = sun3x_gettimeoffset; mach_reset = sun3x_reboot; mach_hwclk = sun3x_hwclk; diff --git a/arch/m68k/sun3x/time.c b/arch/m68k/sun3x/time.c index 047e2bcee3d7..9163294b0fb6 100644 --- a/arch/m68k/sun3x/time.c +++ b/arch/m68k/sun3x/time.c @@ -73,22 +73,21 @@ int sun3x_hwclk(int set, struct rtc_time *t) return 0; } -/* Not much we can do here */ -u32 sun3x_gettimeoffset(void) -{ - return 0L; -} #if 0 -static void sun3x_timer_tick(int irq, void *dev_id, struct pt_regs *regs) +static irqreturn_t sun3x_timer_tick(int irq, void *dev_id) { - void (*vector)(int, void *, struct pt_regs *) = dev_id; + irq_handler_t timer_routine = dev_id; + unsigned long flags; - /* Clear the pending interrupt - pulse the enable line low */ - disable_irq(5); - enable_irq(5); + local_irq_save(flags); 
+ /* Clear the pending interrupt - pulse the enable line low */ + disable_irq(5); + enable_irq(5); + timer_routine(0, NULL); + local_irq_restore(flags); - vector(irq, NULL, regs); + return IRQ_HANDLED; } #endif diff --git a/arch/m68k/sun3x/time.h b/arch/m68k/sun3x/time.h index 496f406412ad..86ce78bb3c28 100644 --- a/arch/m68k/sun3x/time.h +++ b/arch/m68k/sun3x/time.h @@ -3,7 +3,6 @@ #define SUN3X_TIME_H extern int sun3x_hwclk(int set, struct rtc_time *t); -u32 sun3x_gettimeoffset(void); void sun3x_sched_init(irq_handler_t vector); struct mostek_dt { diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index a51b965b3b82..adb179f519f9 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -41,6 +41,7 @@ config MICROBLAZE select TRACING_SUPPORT select VIRT_TO_BUS select CPU_NO_EFFICIENT_FFS + select MMU_GATHER_NO_RANGE if MMU # Endianness selection choice @@ -58,15 +59,9 @@ config CPU_LITTLE_ENDIAN endchoice -config RWSEM_GENERIC_SPINLOCK - def_bool y - config ZONE_DMA def_bool y -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 def_bool n diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 1a8285c3f693..17a8d0a62038 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -23,6 +23,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/microblaze/include/asm/tlb.h b/arch/microblaze/include/asm/tlb.h index 99b6ded54849..628a78ee0a72 100644 --- a/arch/microblaze/include/asm/tlb.h +++ b/arch/microblaze/include/asm/tlb.h @@ -11,16 +11,7 @@ #ifndef _ASM_MICROBLAZE_TLB_H #define _ASM_MICROBLAZE_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <linux/pagemap.h> - -#ifdef CONFIG_MMU -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) -#endif - #include <asm-generic/tlb.h> #endif /* _ASM_MICROBLAZE_TLB_H */ diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 8ee3a8c18498..4964947732af 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -429,3 +429,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index c2ce1e42b888..8fe54fda31dc 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -75,7 +75,7 @@ static void __iomem *__ioremap(phys_addr_t addr, unsigned long size, p >= memory_start && p < virt_to_phys(high_memory) && !(p >= __virt_to_phys((phys_addr_t)__bss_stop) && p < __virt_to_phys((phys_addr_t)__bss_stop))) { - pr_warn("__ioremap(): phys addr "PTE_FMT" is RAM lr %pf\n", + pr_warn("__ioremap(): phys addr "PTE_FMT" is RAM lr %ps\n", (unsigned long)p, __builtin_return_address(0)); return NULL; } diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4a5f5b0ee9a9..b9c48b27162d 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1037,13 +1037,6 @@ source "arch/mips/paravirt/Kconfig" 
endmenu -config RWSEM_GENERIC_SPINLOCK - bool - default y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_HWEIGHT bool default y diff --git a/arch/mips/ath79/setup.c b/arch/mips/ath79/setup.c index 4a70c5de8c92..25a57895a3a3 100644 --- a/arch/mips/ath79/setup.c +++ b/arch/mips/ath79/setup.c @@ -210,12 +210,6 @@ const char *get_system_type(void) return ath79_sys_type; } -int get_c0_perfcount_int(void) -{ - return ATH79_MISC_IRQ(5); -} -EXPORT_SYMBOL_GPL(get_c0_perfcount_int); - unsigned int get_c0_compare_int(void) { return CP0_LEGACY_COMPARE_IRQ; diff --git a/arch/mips/configs/generic/board-ocelot.config b/arch/mips/configs/generic/board-ocelot.config index f607888d2483..184eb65a6ba7 100644 --- a/arch/mips/configs/generic/board-ocelot.config +++ b/arch/mips/configs/generic/board-ocelot.config @@ -1,6 +1,10 @@ # require CONFIG_CPU_MIPS32_R2=y CONFIG_LEGACY_BOARD_OCELOT=y +CONFIG_FIT_IMAGE_FDT_OCELOT=y + +CONFIG_BRIDGE=y +CONFIG_GENERIC_PHY=y CONFIG_MTD=y CONFIG_MTD_CMDLINE_PARTS=y @@ -19,6 +23,8 @@ CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y CONFIG_NETDEVICES=y +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_DSA=y CONFIG_MSCC_OCELOT_SWITCH=y CONFIG_MSCC_OCELOT_SWITCH_OCELOT=y CONFIG_MDIO_MSCC_MIIM=y @@ -35,6 +41,8 @@ CONFIG_SPI_DESIGNWARE=y CONFIG_SPI_DW_MMIO=y CONFIG_SPI_SPIDEV=y +CONFIG_PINCTRL_OCELOT=y + CONFIG_GPIO_SYSFS=y CONFIG_POWER_RESET=y diff --git a/arch/mips/include/asm/io.h b/arch/mips/include/asm/io.h index 845fbbc7a2e3..29997e42480e 100644 --- a/arch/mips/include/asm/io.h +++ b/arch/mips/include/asm/io.h @@ -102,9 +102,6 @@ static inline void set_io_port_base(unsigned long base) #define iobarrier_w() wmb() #define iobarrier_sync() iob() -/* Some callers use this older API instead. */ -#define mmiowb() iobarrier_w() - /* * virt_to_phys - map virtual addresses to physical * @address: address to remap diff --git a/arch/mips/include/asm/mmiowb.h b/arch/mips/include/asm/mmiowb.h new file mode 100644 index 000000000000..a40824e3ef8e --- /dev/null +++ b/arch/mips/include/asm/mmiowb.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_MMIOWB_H +#define _ASM_MMIOWB_H + +#include <asm/io.h> + +#define mmiowb() iobarrier_w() + +#include <asm-generic/mmiowb.h> + +#endif /* _ASM_MMIOWB_H */ diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h index ee81297d9117..8a88eb265516 100644 --- a/arch/mips/include/asm/spinlock.h +++ b/arch/mips/include/asm/spinlock.h @@ -11,6 +11,21 @@ #include <asm/processor.h> #include <asm/qrwlock.h> + +#include <asm-generic/qspinlock_types.h> + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + */ +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + /* This could be optimised with ARCH_HAS_MMIOWB */ + mmiowb(); + smp_store_release(&lock->locked, 0); +} + #include <asm/qspinlock.h> #endif /* _ASM_SPINLOCK_H */ diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h index b6823b9e94da..90f3ad76d9e0 100644 --- a/arch/mips/include/asm/tlb.h +++ b/arch/mips/include/asm/tlb.h @@ -5,23 +5,6 @@ #include <asm/cpu-features.h> #include <asm/mipsregs.h> -/* - * MIPS doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for area to be unmapped. 
- */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) - -/* - * .. because we flush the whole mm when it fills up. - */ -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #define _UNIQUE_ENTRYHI(base, idx) \ (((base) + ((idx) << (PAGE_SHIFT + 1))) | \ (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0)) diff --git a/arch/mips/kernel/kgdb.c b/arch/mips/kernel/kgdb.c index 6e574c02e4c3..ea781b29f7f1 100644 --- a/arch/mips/kernel/kgdb.c +++ b/arch/mips/kernel/kgdb.c @@ -33,6 +33,7 @@ #include <asm/processor.h> #include <asm/sigcontext.h> #include <linux/uaccess.h> +#include <asm/irq_regs.h> static struct hard_trap_info { unsigned char tt; /* Trap type code for MIPS R3xxx and R4xxx */ @@ -214,7 +215,7 @@ void kgdb_call_nmi_hook(void *ignored) old_fs = get_fs(); set_fs(KERNEL_DS); - kgdb_nmicallback(raw_smp_processor_id(), NULL); + kgdb_nmicallback(raw_smp_processor_id(), get_irq_regs()); set_fs(old_fs); } diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index f158c5894a9a..feb2653490df 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -125,7 +125,7 @@ trace_a_syscall: subu t1, v0, __NR_O32_Linux move a1, v0 bnez t1, 1f /* __NR_syscall at offset 0 */ - lw a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ + ld a1, PT_R4(sp) /* Arg1 for __NR_syscall case */ .set pop 1: jal syscall_trace_enter diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 15f4117900ee..9392dfe33f97 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -362,3 +362,7 @@ 421 n32 rt_sigtimedwait_time64 compat_sys_rt_sigtimedwait_time64 422 n32 futex_time64 sys_futex 423 n32 sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 n32 pidfd_send_signal sys_pidfd_send_signal +425 n32 io_uring_setup sys_io_uring_setup +426 n32 io_uring_enter sys_io_uring_enter +427 n32 io_uring_register sys_io_uring_register diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index c85502e67b44..cd0c8aa21fba 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -338,3 +338,7 @@ 327 n64 rseq sys_rseq 328 n64 io_pgetevents sys_io_pgetevents # 329 through 423 are reserved to sync up with other architectures +424 n64 pidfd_send_signal sys_pidfd_send_signal +425 n64 io_uring_setup sys_io_uring_setup +426 n64 io_uring_enter sys_io_uring_enter +427 n64 io_uring_register sys_io_uring_register diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 2e063d0f837e..e849e8ffe4a2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -411,3 +411,7 @@ 421 o32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 o32 futex_time64 sys_futex sys_futex 423 o32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 o32 pidfd_send_signal sys_pidfd_send_signal +425 o32 io_uring_setup sys_io_uring_setup +426 o32 io_uring_enter sys_io_uring_enter +427 o32 io_uring_register sys_io_uring_register diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 0effd3cba9a7..98bf0c222b5f 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ 
-186,8 +186,9 @@ enum which_ebpf_reg { * separate frame pointer, so BPF_REG_10 relative accesses are * adjusted to be $sp relative. */ -int ebpf_to_mips_reg(struct jit_ctx *ctx, const struct bpf_insn *insn, - enum which_ebpf_reg w) +static int ebpf_to_mips_reg(struct jit_ctx *ctx, + const struct bpf_insn *insn, + enum which_ebpf_reg w) { int ebpf_reg = (w == src_reg || w == src_reg_no_fp) ? insn->src_reg : insn->dst_reg; diff --git a/arch/mips/sgi-ip27/ip27-irq.c b/arch/mips/sgi-ip27/ip27-irq.c index 710a59764b01..a32f843cdbe0 100644 --- a/arch/mips/sgi-ip27/ip27-irq.c +++ b/arch/mips/sgi-ip27/ip27-irq.c @@ -118,7 +118,6 @@ static void shutdown_bridge_irq(struct irq_data *d) { struct hub_irq_data *hd = irq_data_get_irq_chip_data(d); struct bridge_controller *bc; - int pin = hd->pin; if (!hd) return; @@ -126,7 +125,7 @@ static void shutdown_bridge_irq(struct irq_data *d) disable_hub_irq(d); bc = hd->bc; - bridge_clr(bc, b_int_enable, (1 << pin)); + bridge_clr(bc, b_int_enable, (1 << hd->pin)); bridge_read(bc, b_wid_tflush); } diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index addb7f5f5264..55559ca0efe4 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -60,9 +60,6 @@ config GENERIC_LOCKBREAK def_bool y depends on PREEMPT -config RWSEM_GENERIC_SPINLOCK - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool y diff --git a/arch/nds32/include/asm/Kbuild b/arch/nds32/include/asm/Kbuild index 64ceff7ab99b..688b6ed26227 100644 --- a/arch/nds32/include/asm/Kbuild +++ b/arch/nds32/include/asm/Kbuild @@ -31,6 +31,7 @@ generic-y += limits.h generic-y += local.h generic-y += local64.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += parport.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/nds32/include/asm/io.h b/arch/nds32/include/asm/io.h index 71cd226d6863..5ef8ae5ba833 100644 --- a/arch/nds32/include/asm/io.h +++ b/arch/nds32/include/asm/io.h @@ -55,8 +55,6 @@ static inline u32 __raw_readl(const volatile void __iomem *addr) #define __iormb() rmb() #define __iowmb() wmb() -#define mmiowb() __asm__ __volatile__ ("msync all" : : : "memory"); - /* * {read,write}{b,w,l,q}_relaxed() are like the regular version, but * are not guaranteed to provide ordering against spinlocks or memory diff --git a/arch/nds32/include/asm/tlb.h b/arch/nds32/include/asm/tlb.h index b35ae5eae3ab..d5ae571c8d30 100644 --- a/arch/nds32/include/asm/tlb.h +++ b/arch/nds32/include/asm/tlb.h @@ -4,22 +4,6 @@ #ifndef __ASMNDS32_TLB_H #define __ASMNDS32_TLB_H -#define tlb_start_vma(tlb,vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb,vma) \ - do { \ - if(!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, addr) do { } while (0) - -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) diff --git a/arch/nds32/include/asm/tlbflush.h b/arch/nds32/include/asm/tlbflush.h index 9b411f401903..38ee769b18d8 100644 --- a/arch/nds32/include/asm/tlbflush.h +++ b/arch/nds32/include/asm/tlbflush.h @@ -42,6 +42,5 @@ void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long addr); void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t * pte); -void tlb_migrate_finish(struct mm_struct *mm); #endif diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index 4ef15a61b7bc..ea37394ff3ea 100644 --- a/arch/nios2/Kconfig +++ 
b/arch/nios2/Kconfig @@ -24,6 +24,7 @@ config NIOS2 select USB_ARCH_HAS_HCD if USB_SUPPORT select CPU_NO_EFFICIENT_FFS select ARCH_DISCARD_MEMBLOCK + select MMU_GATHER_NO_RANGE if MMU config GENERIC_CSUM def_bool y @@ -40,9 +41,6 @@ config NO_IOPORT_MAP config FPU def_bool n -config RWSEM_GENERIC_SPINLOCK - def_bool y - config TRACE_IRQFLAGS_SUPPORT def_bool n diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 88a667d12aaa..d7ef3512504a 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -27,6 +27,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/nios2/include/asm/tlb.h b/arch/nios2/include/asm/tlb.h index d3bc648e08b5..f9f2e27e32dd 100644 --- a/arch/nios2/include/asm/tlb.h +++ b/arch/nios2/include/asm/tlb.h @@ -11,22 +11,12 @@ #ifndef _ASM_NIOS2_TLB_H #define _ASM_NIOS2_TLB_H -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - extern void set_mmu_pid(unsigned long pid); /* - * NiosII doesn't need any special per-pte or per-vma handling, except - * we need to flush cache for the area to be unmapped. + * NIOS32 does have flush_tlb_range(), but it lacks a limit and fallback to + * full mm invalidation. So use flush_tlb_mm() for everything. */ -#define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while (0) - -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) #include <linux/pagemap.h> #include <asm-generic/tlb.h> diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index a5e361fbb75a..7cfb20555b10 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -36,6 +36,7 @@ config OPENRISC select OMPIC if SMP select ARCH_WANT_FRAME_POINTERS select GENERIC_IRQ_MULTI_HANDLER + select MMU_GATHER_NO_RANGE if MMU config CPU_BIG_ENDIAN def_bool y @@ -43,12 +44,6 @@ config CPU_BIG_ENDIAN config MMU def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - def_bool n - config GENERIC_HWEIGHT def_bool y diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index 22aa97136c01..1919cc5e0f11 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -24,6 +24,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/openrisc/include/asm/tlb.h b/arch/openrisc/include/asm/tlb.h index fa4376a4515d..92d8a4209884 100644 --- a/arch/openrisc/include/asm/tlb.h +++ b/arch/openrisc/include/asm/tlb.h @@ -20,14 +20,10 @@ #define __ASM_OPENRISC_TLB_H__ /* - * or32 doesn't need any special per-pte or - * per-vma handling.. + * OpenRISC doesn't have an efficient flush_tlb_range() so use flush_tlb_mm() + * for everything. 
*/ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) #include <linux/pagemap.h> #include <asm-generic/tlb.h> diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index c8e621296092..f1ed8ddfe486 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -75,12 +75,6 @@ config GENERIC_LOCKBREAK default y depends on SMP && PREEMPT -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool default n diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index 9bcd0c903dbb..b8c7db777144 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += percpu.h generic-y += preempt.h generic-y += seccomp.h diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index 30a8315d5c07..93d37010b375 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -229,8 +229,6 @@ static inline void writeq(unsigned long long q, volatile void __iomem *addr) #define writel_relaxed(l, addr) writel(l, addr) #define writeq_relaxed(q, addr) writeq(q, addr) -#define mmiowb() do { } while (0) - void memset_io(volatile void __iomem *addr, unsigned char val, int count); void memcpy_fromio(void *dst, const volatile void __iomem *src, int count); void memcpy_toio(volatile void __iomem *dst, const void *src, int count); diff --git a/arch/parisc/include/asm/tlb.h b/arch/parisc/include/asm/tlb.h index 0c881e74d8a6..8c0446b04c9e 100644 --- a/arch/parisc/include/asm/tlb.h +++ b/arch/parisc/include/asm/tlb.h @@ -2,24 +2,6 @@ #ifndef _PARISC_TLB_H #define _PARISC_TLB_H -#define tlb_flush(tlb) \ -do { if ((tlb)->fullmm) \ - flush_tlb_mm((tlb)->mm);\ -} while (0) - -#define tlb_start_vma(tlb, vma) \ -do { if (!(tlb)->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define tlb_end_vma(tlb, vma) \ -do { if (!(tlb)->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, address) \ - do { } while (0) - #include <asm-generic/tlb.h> #define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) diff --git a/arch/parisc/kernel/stacktrace.c b/arch/parisc/kernel/stacktrace.c index ec5835e83a7a..6f0b9c8d8052 100644 --- a/arch/parisc/kernel/stacktrace.c +++ b/arch/parisc/kernel/stacktrace.c @@ -29,22 +29,17 @@ static void dump_trace(struct task_struct *task, struct stack_trace *trace) } } - /* * Save stack-backtrace addresses into a stack_trace buffer. 
*/ void save_stack_trace(struct stack_trace *trace) { dump_trace(current, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index b26766c6647d..fe8ca623add8 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -420,3 +420,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2d0be82c3061..fa7219ffeadc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -103,13 +103,6 @@ config LOCKDEP_SUPPORT bool default y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - bool - default y - config GENERIC_LOCKBREAK bool default y @@ -132,6 +125,7 @@ config PPC select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_KCOV + select ARCH_HAS_MMIOWB if PPC64 select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 select ARCH_HAS_PTE_SPECIAL @@ -218,6 +212,8 @@ config PPC select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE + select HAVE_MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if PPC_BOOK3S_64 && CPU_LITTLE_ENDIAN select HAVE_SYSCALL_TRACEPOINTS @@ -318,6 +314,10 @@ config ARCH_SUSPEND_POSSIBLE (PPC_85xx && !PPC_E500MC) || PPC_86xx || PPC_PSERIES \ || 44x || 40x +config ARCH_SUSPEND_NONZERO_CPU + def_bool y + depends on PPC_POWERNV || PPC_PSERIES + config PPC_DCR_NATIVE bool diff --git a/arch/powerpc/configs/skiroot_defconfig b/arch/powerpc/configs/skiroot_defconfig index 5ba131c30f6b..1bcd468ab422 100644 --- a/arch/powerpc/configs/skiroot_defconfig +++ b/arch/powerpc/configs/skiroot_defconfig @@ -266,6 +266,7 @@ CONFIG_UDF_FS=m CONFIG_MSDOS_FS=m CONFIG_VFAT_FS=m CONFIG_PROC_KCORE=y +CONFIG_HUGETLBFS=y # CONFIG_MISC_FILESYSTEMS is not set # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_NLS=y diff --git a/arch/powerpc/crypto/crc32c-vpmsum_glue.c b/arch/powerpc/crypto/crc32c-vpmsum_glue.c index fd1d6c83f0c0..c4fa242dd652 100644 --- a/arch/powerpc/crypto/crc32c-vpmsum_glue.c +++ b/arch/powerpc/crypto/crc32c-vpmsum_glue.c @@ -1,10 +1,12 @@ #include <linux/crc32.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/cpufeature.h> +#include <asm/simd.h> #include <asm/switch_to.h> #define CHKSUM_BLOCK_SIZE 1 @@ -22,7 +24,7 @@ static u32 crc32c_vpmsum(u32 crc, unsigned char const *p, size_t len) unsigned int prealign; unsigned int tail; - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt()) + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable()) return __crc32c_le(crc, 
p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { diff --git a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c index 02ea277863d1..e27ff16573b5 100644 --- a/arch/powerpc/crypto/crct10dif-vpmsum_glue.c +++ b/arch/powerpc/crypto/crct10dif-vpmsum_glue.c @@ -12,11 +12,13 @@ #include <linux/crc-t10dif.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> #include <linux/kernel.h> #include <linux/cpufeature.h> +#include <asm/simd.h> #include <asm/switch_to.h> #define VMX_ALIGN 16 @@ -32,7 +34,7 @@ static u16 crct10dif_vpmsum(u16 crci, unsigned char const *p, size_t len) unsigned int tail; u32 crc = crci; - if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || in_interrupt()) + if (len < (VECTOR_BREAKPOINT + VMX_ALIGN) || !crypto_simd_usable()) return crc_t10dif_generic(crc, p, len); if ((unsigned long)p & VMX_ALIGN_MASK) { diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index a0c132bedfae..b9f6e72bf4e5 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -8,6 +8,6 @@ generic-y += irq_regs.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += preempt.h -generic-y += rwsem.h generic-y += vtime.h generic-y += msi.h +generic-y += simd.h diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 4b73847e9b95..1fad67b46409 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -34,14 +34,11 @@ extern struct pci_dev *isa_bridge_pcidev; #include <asm/byteorder.h> #include <asm/synch.h> #include <asm/delay.h> +#include <asm/mmiowb.h> #include <asm/mmu.h> #include <asm/ppc_asm.h> #include <asm/pgtable.h> -#ifdef CONFIG_PPC64 -#include <asm/paca.h> -#endif - #define SIO_CONFIG_RA 0x398 #define SIO_CONFIG_RD 0x399 @@ -107,12 +104,6 @@ extern bool isa_io_special; * */ -#ifdef CONFIG_PPC64 -#define IO_SET_SYNC_FLAG() do { local_paca->io_sync = 1; } while(0) -#else -#define IO_SET_SYNC_FLAG() -#endif - #define DEF_MMIO_IN_X(name, size, insn) \ static inline u##size name(const volatile u##size __iomem *addr) \ { \ @@ -127,7 +118,7 @@ static inline void name(volatile u##size __iomem *addr, u##size val) \ { \ __asm__ __volatile__("sync;"#insn" %1,%y0" \ : "=Z" (*addr) : "r" (val) : "memory"); \ - IO_SET_SYNC_FLAG(); \ + mmiowb_set_pending(); \ } #define DEF_MMIO_IN_D(name, size, insn) \ @@ -144,7 +135,7 @@ static inline void name(volatile u##size __iomem *addr, u##size val) \ { \ __asm__ __volatile__("sync;"#insn"%U0%X0 %1,%0" \ : "=m" (*addr) : "r" (val) : "memory"); \ - IO_SET_SYNC_FLAG(); \ + mmiowb_set_pending(); \ } DEF_MMIO_IN_D(in_8, 8, lbz); @@ -652,24 +643,6 @@ static inline void name at \ #include <asm-generic/iomap.h> -#ifdef CONFIG_PPC32 -#define mmiowb() -#else -/* - * Enforce synchronisation of stores vs. 
spin_unlock - * (this does it explicitly, though our implementation of spin_unlock - * does it implicitely too) - */ -static inline void mmiowb(void) -{ - unsigned long tmp; - - __asm__ __volatile__("sync; li %0,0; stb %0,%1(13)" - : "=&r" (tmp) : "i" (offsetof(struct paca_struct, io_sync)) - : "memory"); -} -#endif /* !CONFIG_PPC32 */ - static inline void iosync(void) { __asm__ __volatile__ ("sync" : : : "memory"); diff --git a/arch/powerpc/include/asm/mmiowb.h b/arch/powerpc/include/asm/mmiowb.h new file mode 100644 index 000000000000..74a00127eb20 --- /dev/null +++ b/arch/powerpc/include/asm/mmiowb.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_MMIOWB_H +#define _ASM_POWERPC_MMIOWB_H + +#ifdef CONFIG_MMIOWB + +#include <linux/compiler.h> +#include <asm/barrier.h> +#include <asm/paca.h> + +#define arch_mmiowb_state() (&local_paca->mmiowb_state) +#define mmiowb() mb() + +#endif /* CONFIG_MMIOWB */ + +#include <asm-generic/mmiowb.h> + +#endif /* _ASM_POWERPC_MMIOWB_H */ diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 598cdcdd1355..8ddd4a91bdc1 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -352,7 +352,7 @@ static inline bool strict_kernel_rwx_enabled(void) #if defined(CONFIG_SPARSEMEM_VMEMMAP) && defined(CONFIG_SPARSEMEM_EXTREME) && \ defined (CONFIG_PPC_64K_PAGES) #define MAX_PHYSMEM_BITS 51 -#elif defined(CONFIG_SPARSEMEM) +#elif defined(CONFIG_PPC64) #define MAX_PHYSMEM_BITS 46 #endif diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index e843bc5d1a0f..134e912d403f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -34,6 +34,8 @@ #include <asm/cpuidle.h> #include <asm/atomic.h> +#include <asm-generic/mmiowb_types.h> + register struct paca_struct *local_paca asm("r13"); #if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_SMP) @@ -171,7 +173,6 @@ struct paca_struct { u16 trap_save; /* Used when bad stack is encountered */ u8 irq_soft_mask; /* mask for irq soft masking */ u8 irq_happened; /* irq happened while soft-disabled */ - u8 io_sync; /* writel() needs spin_unlock sync */ u8 irq_work_pending; /* IRQ_WORK interrupt while soft-disable */ u8 nap_state_lost; /* NV GPR values lost in power7_idle */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -264,6 +265,9 @@ struct paca_struct { #ifdef CONFIG_STACKPROTECTOR unsigned long canary; #endif +#ifdef CONFIG_MMIOWB + struct mmiowb_state mmiowb_state; +#endif } ____cacheline_aligned; extern void copy_mm_to_paca(struct mm_struct *mm); diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index 685c72310f5d..15b39c407c4e 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -39,19 +39,6 @@ #define LOCK_TOKEN 1 #endif -#if defined(CONFIG_PPC64) && defined(CONFIG_SMP) -#define CLEAR_IO_SYNC (get_paca()->io_sync = 0) -#define SYNC_IO do { \ - if (unlikely(get_paca()->io_sync)) { \ - mb(); \ - get_paca()->io_sync = 0; \ - } \ - } while (0) -#else -#define CLEAR_IO_SYNC -#define SYNC_IO -#endif - #ifdef CONFIG_PPC_PSERIES #define vcpu_is_preempted vcpu_is_preempted static inline bool vcpu_is_preempted(int cpu) @@ -99,7 +86,6 @@ static inline unsigned long __arch_spin_trylock(arch_spinlock_t *lock) static inline int arch_spin_trylock(arch_spinlock_t *lock) { - CLEAR_IO_SYNC; return __arch_spin_trylock(lock) == 0; } @@ -130,7 +116,6 @@ extern void __rw_yield(arch_rwlock_t *lock); static inline void 
arch_spin_lock(arch_spinlock_t *lock) { - CLEAR_IO_SYNC; while (1) { if (likely(__arch_spin_trylock(lock) == 0)) break; @@ -148,7 +133,6 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) { unsigned long flags_dis; - CLEAR_IO_SYNC; while (1) { if (likely(__arch_spin_trylock(lock) == 0)) break; @@ -167,7 +151,6 @@ void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags) static inline void arch_spin_unlock(arch_spinlock_t *lock) { - SYNC_IO; __asm__ __volatile__("# arch_spin_unlock\n\t" PPC_RELEASE_BARRIER: : :"memory"); lock->slock = 0; diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index e24c67d5ba75..34fba1ce27f7 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -27,8 +27,8 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +#define tlb_flush tlb_flush extern void tlb_flush(struct mmu_gather *tlb); /* Get the generic bits... */ @@ -46,22 +46,6 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ - if (!tlb->page_size) - tlb->page_size = page_size; - else if (tlb->page_size != page_size) { - if (!tlb->fullmm) - tlb_flush_mmu(tlb); - /* - * update the page size after flush for the new - * mmu_gather. - */ - tlb->page_size = page_size; - } -} - #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index a5b8fbae56a0..9481a117e242 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -656,11 +656,17 @@ EXC_COMMON_BEGIN(data_access_slb_common) ld r4,PACA_EXSLB+EX_DAR(r13) std r4,_DAR(r1) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_MMU_FTR_SECTION + /* HPT case, do SLB fault */ bl do_slb_fault cmpdi r3,0 bne- 1f b fast_exception_return 1: /* Error case */ +MMU_FTR_SECTION_ELSE + /* Radix case, access is outside page table range */ + li r3,-EFAULT +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) @@ -705,11 +711,17 @@ EXC_COMMON_BEGIN(instruction_access_slb_common) EXCEPTION_PROLOG_COMMON(0x480, PACA_EXSLB) ld r4,_NIP(r1) addi r3,r1,STACK_FRAME_OVERHEAD +BEGIN_MMU_FTR_SECTION + /* HPT case, do SLB fault */ bl do_slb_fault cmpdi r3,0 bne- 1f b fast_exception_return 1: /* Error case */ +MMU_FTR_SECTION_ELSE + /* Radix case, access is outside page table range */ + li r3,-EFAULT +ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) std r3,RESULT(r1) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) diff --git a/arch/powerpc/kernel/head_32.S b/arch/powerpc/kernel/head_32.S index 48051c8977c5..e25b615e9f9e 100644 --- a/arch/powerpc/kernel/head_32.S +++ b/arch/powerpc/kernel/head_32.S @@ -851,10 +851,6 @@ __secondary_start: tophys(r4,r2) addi r4,r4,THREAD /* phys address of our thread_struct */ mtspr SPRN_SPRG_THREAD,r4 -#ifdef CONFIG_PPC_RTAS - li r3,0 - stw r3, RTAS_SP(r4) /* 0 => not in RTAS */ -#endif lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l mtspr SPRN_SPRG_PGDIR, r4 @@ -941,10 +937,6 @@ start_here: tophys(r4,r2) addi r4,r4,THREAD /* init task's THREAD */ mtspr SPRN_SPRG_THREAD,r4 -#ifdef CONFIG_PPC_RTAS - li r3,0 - stw r3, RTAS_SP(r4) /* 
0 => not in RTAS */ -#endif lis r4, (swapper_pg_dir - PAGE_OFFSET)@h ori r4, r4, (swapper_pg_dir - PAGE_OFFSET)@l mtspr SPRN_SPRG_PGDIR, r4 diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index b33bafb8fcea..70568ccbd9fd 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -57,7 +57,7 @@ void setup_barrier_nospec(void) enable = security_ftr_enabled(SEC_FTR_FAVOUR_SECURITY) && security_ftr_enabled(SEC_FTR_BNDS_CHK_SPEC_BAR); - if (!no_nospec) + if (!no_nospec && !cpu_mitigations_off()) enable_barrier_nospec(enable); } @@ -116,7 +116,7 @@ static int __init handle_nospectre_v2(char *p) early_param("nospectre_v2", handle_nospectre_v2); void setup_spectre_v2(void) { - if (no_spectrev2) + if (no_spectrev2 || cpu_mitigations_off()) do_btb_flush_fixups(); else btb_flush_enabled = true; @@ -300,7 +300,7 @@ void setup_stf_barrier(void) stf_enabled_flush_types = type; - if (!no_stf_barrier) + if (!no_stf_barrier && !cpu_mitigations_off()) stf_barrier_enable(enable); } diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index ba404dd9ce1d..4f49e1a3594c 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -932,7 +932,7 @@ void setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - if (!no_rfi_flush) + if (!no_rfi_flush && !cpu_mitigations_off()) rfi_flush_enable(enable); } diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index b18abb0c3dae..00f5a63c8d9a 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -505,3 +505,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/powerpc/kernel/vdso32/gettimeofday.S b/arch/powerpc/kernel/vdso32/gettimeofday.S index 1e0bc5955a40..afd516b572f8 100644 --- a/arch/powerpc/kernel/vdso32/gettimeofday.S +++ b/arch/powerpc/kernel/vdso32/gettimeofday.S @@ -98,7 +98,7 @@ V_FUNCTION_BEGIN(__kernel_clock_gettime) * can be used, r7 contains NSEC_PER_SEC. */ - lwz r5,WTOM_CLOCK_SEC(r9) + lwz r5,(WTOM_CLOCK_SEC+LOPART)(r9) lwz r6,WTOM_CLOCK_NSEC(r9) /* We now have our offset in r5,r6. 
We create a fake dependency diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index f02b04973710..f100e331e69b 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -543,14 +543,14 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_validate(stt, tce); if (ret != H_SUCCESS) - return ret; + goto unlock_exit; dir = iommu_tce_direction(tce); - idx = srcu_read_lock(&vcpu->kvm->srcu); - if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) { ret = H_PARAMETER; goto unlock_exit; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 06964350b97a..b2b29d4f9842 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3423,7 +3423,9 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2); vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3); - mtspr(SPRN_PSSCR, host_psscr); + /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */ + mtspr(SPRN_PSSCR, host_psscr | + (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); mtspr(SPRN_HFSCR, host_hfscr); mtspr(SPRN_CIABR, host_ciabr); mtspr(SPRN_DAWR, host_dawr); diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index e7a9c4f6bfca..8330f135294f 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -95,28 +95,15 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, unsigned long entries, unsigned long dev_hpa, struct mm_iommu_table_group_mem_t **pmem) { - struct mm_iommu_table_group_mem_t *mem; - long i, ret, locked_entries = 0; + struct mm_iommu_table_group_mem_t *mem, *mem2; + long i, ret, locked_entries = 0, pinned = 0; unsigned int pageshift; - - mutex_lock(&mem_list_mutex); - - list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, - next) { - /* Overlap? 
*/ - if ((mem->ua < (ua + (entries << PAGE_SHIFT))) && - (ua < (mem->ua + - (mem->entries << PAGE_SHIFT)))) { - ret = -EINVAL; - goto unlock_exit; - } - - } + unsigned long entry, chunk; if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) { ret = mm_iommu_adjust_locked_vm(mm, entries, true); if (ret) - goto unlock_exit; + return ret; locked_entries = entries; } @@ -148,17 +135,27 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } down_read(&mm->mmap_sem); - ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL); + chunk = (1UL << (PAGE_SHIFT + MAX_ORDER - 1)) / + sizeof(struct vm_area_struct *); + chunk = min(chunk, entries); + for (entry = 0; entry < entries; entry += chunk) { + unsigned long n = min(entries - entry, chunk); + + ret = get_user_pages_longterm(ua + (entry << PAGE_SHIFT), n, + FOLL_WRITE, mem->hpages + entry, NULL); + if (ret == n) { + pinned += n; + continue; + } + if (ret > 0) + pinned += ret; + break; + } up_read(&mm->mmap_sem); - if (ret != entries) { - /* free the reference taken */ - for (i = 0; i < ret; i++) - put_page(mem->hpages[i]); - - vfree(mem->hpas); - kfree(mem); - ret = -EFAULT; - goto unlock_exit; + if (pinned != entries) { + if (!ret) + ret = -EFAULT; + goto free_exit; } pageshift = PAGE_SHIFT; @@ -183,21 +180,43 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua, } good_exit: - ret = 0; atomic64_set(&mem->mapped, 1); mem->used = 1; mem->ua = ua; mem->entries = entries; - *pmem = mem; - list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); + mutex_lock(&mem_list_mutex); -unlock_exit: - if (locked_entries && ret) - mm_iommu_adjust_locked_vm(mm, locked_entries, false); + list_for_each_entry_rcu(mem2, &mm->context.iommu_group_mem_list, next) { + /* Overlap? */ + if ((mem2->ua < (ua + (entries << PAGE_SHIFT))) && + (ua < (mem2->ua + + (mem2->entries << PAGE_SHIFT)))) { + ret = -EINVAL; + mutex_unlock(&mem_list_mutex); + goto free_exit; + } + } + + list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list); mutex_unlock(&mem_list_mutex); + *pmem = mem; + + return 0; + +free_exit: + /* free the reference taken */ + for (i = 0; i < pinned; i++) + put_page(mem->hpages[i]); + + vfree(mem->hpas); + kfree(mem); + +unlock_exit: + mm_iommu_adjust_locked_vm(mm, locked_entries, false); + return ret; } @@ -266,7 +285,7 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem) long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) { long ret = 0; - unsigned long entries, dev_hpa; + unsigned long unlock_entries = 0; mutex_lock(&mem_list_mutex); @@ -287,17 +306,17 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem) goto unlock_exit; } + if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) + unlock_entries = mem->entries; + /* @mapped became 0 so now mappings are disabled, release the region */ - entries = mem->entries; - dev_hpa = mem->dev_hpa; mm_iommu_release(mem); - if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) - mm_iommu_adjust_locked_vm(mm, entries, false); - unlock_exit: mutex_unlock(&mem_list_mutex); + mm_iommu_adjust_locked_vm(mm, unlock_entries, false); + return ret; } EXPORT_SYMBOL_GPL(mm_iommu_put); diff --git a/arch/powerpc/mm/ppc_mmu_32.c b/arch/powerpc/mm/ppc_mmu_32.c index f29d2f118b44..5d9c3ff728c9 100644 --- a/arch/powerpc/mm/ppc_mmu_32.c +++ b/arch/powerpc/mm/ppc_mmu_32.c @@ -98,10 +98,20 @@ static int find_free_bat(void) return -1; } +/* + * This function calculates the size of the larger block usable to map the + * beginning of an 
area based on the start address and size of that area: + * - max block size is 8M on 601 and 256 on other 6xx. + * - base address must be aligned to the block size. So the maximum block size + * is identified by the lowest bit set to 1 in the base address (for instance + * if base is 0x16000000, max size is 0x02000000). + * - block size has to be a power of two. This is calculated by finding the + * highest bit set to 1. + */ static unsigned int block_size(unsigned long base, unsigned long top) { unsigned int max_size = (cpu_has_feature(CPU_FTR_601) ? 8 : 256) << 20; - unsigned int base_shift = (fls(base) - 1) & 31; + unsigned int base_shift = (ffs(base) - 1) & 31; unsigned int block_shift = (fls(top - base) - 1) & 31; return min3(max_size, 1U << base_shift, 1U << block_shift); @@ -157,7 +167,7 @@ static unsigned long __init __mmu_mapin_ram(unsigned long base, unsigned long to unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) { - int done; + unsigned long done; unsigned long border = (unsigned long)__init_begin - PAGE_OFFSET; if (__map_without_bats) { @@ -169,10 +179,10 @@ unsigned long __init mmu_mapin_ram(unsigned long base, unsigned long top) return __mmu_mapin_ram(base, top); done = __mmu_mapin_ram(base, border); - if (done != border - base) + if (done != border) return done; - return done + __mmu_mapin_ram(border, top); + return __mmu_mapin_ram(border, top); } void mmu_mark_initmem_nx(void) diff --git a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c index 17cf249b18ee..3cb2f07ce8eb 100644 --- a/arch/powerpc/platforms/52xx/mpc52xx_gpt.c +++ b/arch/powerpc/platforms/52xx/mpc52xx_gpt.c @@ -628,7 +628,7 @@ static int mpc52xx_wdt_open(struct inode *inode, struct file *file) } file->private_data = mpc52xx_gpt_wdt; - return nonseekable_open(inode, file); + return stream_open(inode, file); } static int mpc52xx_wdt_release(struct inode *inode, struct file *file) diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 842b2c7e156a..50cd09b4e05d 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -324,7 +324,7 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK config PPC_RADIX_MMU bool "Radix MMU Support" - depends on PPC_BOOK3S_64 + depends on PPC_BOOK3S_64 && HUGETLB_PAGE select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA default y help diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 48c2477e7e2a..bfb9ca99ac05 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -588,7 +588,7 @@ static int spufs_pipe_open(struct inode *inode, struct file *file) struct spufs_inode_info *i = SPUFS_I(inode); file->private_data = i->i_ctx; - return nonseekable_open(inode, file); + return stream_open(inode, file); } /* diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index db329d4bf1c3..c1a75216050a 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -71,17 +71,11 @@ spufs_alloc_inode(struct super_block *sb) return &ei->vfs_inode; } -static void spufs_i_callback(struct rcu_head *head) +static void spufs_free_inode(struct inode *inode) { - struct inode *inode = container_of(head, struct inode, i_rcu); kmem_cache_free(spufs_inode_cache, SPUFS_I(inode)); } -static void spufs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, 
spufs_i_callback); -} - static void spufs_init_once(void *p) { @@ -739,7 +733,7 @@ spufs_fill_super(struct super_block *sb, void *data, int silent) struct spufs_sb_info *info; static const struct super_operations s_ops = { .alloc_inode = spufs_alloc_inode, - .destroy_inode = spufs_destroy_inode, + .free_inode = spufs_free_inode, .statfs = simple_statfs, .evict_inode = spufs_evict_inode, .show_options = spufs_show_options, diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index a0f44f992360..13c6a47e6150 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2429,7 +2429,10 @@ static void dump_one_paca(int cpu) DUMP(p, trap_save, "%#-*x"); DUMP(p, irq_soft_mask, "%#-*x"); DUMP(p, irq_happened, "%#-*x"); - DUMP(p, io_sync, "%#-*x"); +#ifdef CONFIG_MMIOWB + DUMP(p, mmiowb_state.nesting_count, "%#-*x"); + DUMP(p, mmiowb_state.mmiowb_pending, "%#-*x"); +#endif DUMP(p, irq_work_pending, "%#-*x"); DUMP(p, nap_state_lost, "%#-*x"); DUMP(p, sprg_vdso, "%#-*llx"); diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index eb56c82d8aa1..e66745decea1 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -48,6 +48,7 @@ config RISCV select RISCV_TIMER select GENERIC_IRQ_MULTI_HANDLER select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_MMIOWB select HAVE_EBPF_JIT if 64BIT config MMU @@ -69,9 +70,6 @@ config STACKTRACE_SUPPORT config TRACE_IRQFLAGS_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/riscv/configs/rv32_defconfig b/arch/riscv/configs/rv32_defconfig new file mode 100644 index 000000000000..1a911ed8e772 --- /dev/null +++ b/arch/riscv/configs/rv32_defconfig @@ -0,0 +1,84 @@ +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_CGROUPS=y +CONFIG_CGROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +CONFIG_CGROUP_BPF=y +CONFIG_NAMESPACES=y +CONFIG_USER_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EXPERT=y +CONFIG_BPF_SYSCALL=y +CONFIG_ARCH_RV32I=y +CONFIG_SMP=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NETLINK_DIAG=y +CONFIG_PCI=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_DEVTMPFS=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_VIRTIO_NET=y +CONFIG_MACB=y +CONFIG_E1000E=y +CONFIG_R8169=y +CONFIG_MICROSEMI_PHY=y +CONFIG_INPUT_MOUSEDEV=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_EARLYCON_RISCV_SBI=y +CONFIG_HVC_RISCV_SBI=y +# CONFIG_PTP_1588_CLOCK is not set +CONFIG_DRM=y +CONFIG_DRM_RADEON=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_PLATFORM=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_UAS=y +CONFIG_VIRTIO_MMIO=y +CONFIG_SIFIVE_PLIC=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y +CONFIG_CRYPTO_USER_API_HASH=y +CONFIG_CRYPTO_DEV_VIRTIO=y +CONFIG_PRINTK_TIME=y +# CONFIG_RCU_TRACE is not set diff 
--git a/arch/riscv/include/asm/io.h b/arch/riscv/include/asm/io.h index 1d9c1376dc64..744fd92e77bc 100644 --- a/arch/riscv/include/asm/io.h +++ b/arch/riscv/include/asm/io.h @@ -20,6 +20,7 @@ #define _ASM_RISCV_IO_H #include <linux/types.h> +#include <asm/mmiowb.h> extern void __iomem *ioremap(phys_addr_t offset, unsigned long size); @@ -100,18 +101,6 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) #endif /* - * FIXME: I'm flip-flopping on whether or not we should keep this or enforce - * the ordering with I/O on spinlocks like PowerPC does. The worry is that - * drivers won't get this correct, but I also don't want to introduce a fence - * into the lock code that otherwise only uses AMOs (and is essentially defined - * by the ISA to be correct). For now I'm leaving this here: "o,w" is - * sufficient to ensure that all writes to the device have completed before the - * write to the spinlock is allowed to commit. I surmised this from reading - * "ACQUIRES VS I/O ACCESSES" in memory-barriers.txt. - */ -#define mmiowb() __asm__ __volatile__ ("fence o,w" : : : "memory"); - -/* * Unordered I/O memory access primitives. These are even more relaxed than * the relaxed versions, as they don't even order accesses between successive * operations to the I/O regions. @@ -165,7 +154,7 @@ static inline u64 __raw_readq(const volatile void __iomem *addr) #define __io_br() do {} while (0) #define __io_ar(v) __asm__ __volatile__ ("fence i,r" : : : "memory"); #define __io_bw() __asm__ __volatile__ ("fence w,o" : : : "memory"); -#define __io_aw() do {} while (0) +#define __io_aw() mmiowb_set_pending() #define readb(c) ({ u8 __v; __io_br(); __v = readb_cpu(c); __io_ar(__v); __v; }) #define readw(c) ({ u16 __v; __io_br(); __v = readw_cpu(c); __io_ar(__v); __v; }) diff --git a/arch/riscv/include/asm/mmiowb.h b/arch/riscv/include/asm/mmiowb.h new file mode 100644 index 000000000000..5d7e3a2b4e3b --- /dev/null +++ b/arch/riscv/include/asm/mmiowb.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _ASM_RISCV_MMIOWB_H +#define _ASM_RISCV_MMIOWB_H + +/* + * "o,w" is sufficient to ensure that all writes to the device have completed + * before the write to the spinlock is allowed to commit. 
+ */ +#define mmiowb() __asm__ __volatile__ ("fence o,w" : : : "memory"); + +#include <asm-generic/mmiowb.h> + +#endif /* ASM_RISCV_MMIOWB_H */ diff --git a/arch/riscv/include/asm/tlb.h b/arch/riscv/include/asm/tlb.h index 439dc7072e05..1ad8d093c58b 100644 --- a/arch/riscv/include/asm/tlb.h +++ b/arch/riscv/include/asm/tlb.h @@ -18,6 +18,7 @@ struct mmu_gather; static void tlb_flush(struct mmu_gather *tlb); +#define tlb_flush tlb_flush #include <asm-generic/tlb.h> static inline void tlb_flush(struct mmu_gather *tlb) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index a4b1d94371a0..4d403274c2e8 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -169,8 +169,6 @@ static bool save_trace(unsigned long pc, void *arg) void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { walk_stackframe(tsk, NULL, save_trace, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 5fd8c922e1c2..bc7b77e34d09 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -121,6 +121,14 @@ void __init setup_bootmem(void) */ memblock_reserve(reg->base, vmlinux_end - reg->base); mem_size = min(reg->size, (phys_addr_t)-PAGE_OFFSET); + + /* + * Remove memblock from the end of usable area to the + * end of region + */ + if (reg->base + mem_size < end) + memblock_remove(reg->base + mem_size, + end - reg->base - mem_size); } } BUG_ON(mem_size == 0); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b6e3d0653002..07485582d027 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -14,12 +14,6 @@ config LOCKDEP_SUPPORT config STACKTRACE_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - bool - -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config ARCH_HAS_ILOG2_U32 def_bool n @@ -149,6 +143,7 @@ config S390 select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX select HAVE_GCC_PLUGINS + select HAVE_GENERIC_GUP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 @@ -164,11 +159,13 @@ config S390 select HAVE_PERF_USER_STACK_DUMP select HAVE_MEMBLOCK_NODE_MAP select HAVE_MEMBLOCK_PHYS_MAP + select HAVE_MMU_GATHER_NO_GATHER select HAVE_MOD_ARCH_SPECIFIC select HAVE_NOP_MCOUNT select HAVE_OPROFILE select HAVE_PCI select HAVE_PERF_EVENTS + select HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS @@ -188,7 +185,6 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select ARCH_HAS_SCALED_CPUTIME - select VIRT_TO_BUS select HAVE_NMI @@ -240,6 +236,7 @@ choice config MARCH_Z900 bool "IBM zSeries model z800 and z900" + depends on !CC_IS_CLANG select HAVE_MARCH_Z900_FEATURES help Select this to enable optimizations for model z800/z900 (2064 and @@ -248,6 +245,7 @@ config MARCH_Z900 config MARCH_Z990 bool "IBM zSeries model z890 and z990" + depends on !CC_IS_CLANG select HAVE_MARCH_Z990_FEATURES help Select this to enable optimizations for model z890/z990 (2084 and @@ -256,6 +254,7 @@ config MARCH_Z990 config MARCH_Z9_109 bool "IBM System z9" + depends on !CC_IS_CLANG select HAVE_MARCH_Z9_109_FEATURES help Select this to enable optimizations for IBM System z9 (2094 and @@ -347,12 +346,15 @@ config TUNE_DEFAULT config TUNE_Z900 bool "IBM zSeries model z800 and z900" + depends on !CC_IS_CLANG config TUNE_Z990 bool "IBM zSeries model z890 and z990" + depends on !CC_IS_CLANG config TUNE_Z9_109 bool "IBM 
System z9" + depends on !CC_IS_CLANG config TUNE_Z10 bool "IBM System z10" @@ -388,6 +390,9 @@ config COMPAT (and some other stuff like libraries and such) is needed for executing 31 bit applications. It is safe to say "Y". +config COMPAT_VDSO + def_bool COMPAT && !CC_IS_CLANG + config SYSVIPC_COMPAT def_bool y if COMPAT && SYSVIPC @@ -549,6 +554,17 @@ config ARCH_HAS_KEXEC_PURGATORY def_bool y depends on KEXEC_FILE +config KEXEC_VERIFY_SIG + bool "Verify kernel signature during kexec_file_load() syscall" + depends on KEXEC_FILE && SYSTEM_DATA_VERIFICATION + help + This option makes kernel signature verification mandatory for + the kexec_file_load() syscall. + + In addition to that option, you need to enable signature + verification for the corresponding kernel image type being + loaded in order for this to work. + config ARCH_RANDOM def_bool y prompt "s390 architectural random number generation API" @@ -609,6 +625,29 @@ config EXPOLINE_FULL endchoice +config RELOCATABLE + bool "Build a relocatable kernel" + select MODULE_REL_CRCS if MODVERSIONS + default y + help + This builds a kernel image that retains relocation information + so it can be loaded at an arbitrary address. + The kernel is linked as a position-independent executable (PIE) + and contains dynamic relocations which are processed early in the + bootup process. + The relocations make the kernel image about 15% larger (compressed + 10%), but are discarded at runtime. + +config RANDOMIZE_BASE + bool "Randomize the address of the kernel image (KASLR)" + depends on RELOCATABLE + default y + help + In support of Kernel Address Space Layout Randomization (KASLR), + this randomizes the address at which the kernel image is loaded, + as a security feature that deters exploit attempts relying on + knowledge of the location of kernel internals. + endmenu menu "Memory setup" @@ -837,6 +876,17 @@ config HAVE_PNETID menu "Virtualization" +config PROTECTED_VIRTUALIZATION_GUEST + def_bool n + prompt "Protected virtualization guest support" + help + Select this option, if you want to be able to run this + kernel as a protected virtualization KVM guest. + Protected virtualization capable machines have a mini hypervisor + located at machine level (an ultravisor). With help of the + Ultravisor, KVM will be able to run "protected" VMs, special + VMs whose memory and management data are unavailable to KVM. 
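To make the RELOCATABLE help text above concrete: the PIE-linked image carries its .rela.dyn entries into the decompressor (see the rela_dyn_start/rela_dyn_end/dynsym_start fields added to struct vmlinux_info in arch/s390/boot/compressed/decompressor.h below), and early boot walks them once, adding the chosen load offset to every patched word. What follows is a hedged, self-contained sketch of that walk for the R_390_RELATIVE case only — the function name is ours, and the real code also dispatches other relocation types and reports failures:

	#include <elf.h>

	static void handle_relocs(unsigned long offset,
				  unsigned long rela_dyn_start,
				  unsigned long rela_dyn_end)
	{
		Elf64_Rela *rela = (Elf64_Rela *)rela_dyn_start;
		Elf64_Rela *end  = (Elf64_Rela *)rela_dyn_end;

		for (; rela < end; rela++) {
			if (ELF64_R_TYPE(rela->r_info) != R_390_RELATIVE)
				continue;	/* other types elided in this sketch */
			/* r_offset/r_addend are link-time addresses; shift both
			 * by the displacement the image was loaded at. */
			*(Elf64_Addr *)(rela->r_offset + offset) =
				rela->r_addend + offset;
		}
	}

With RANDOMIZE_BASE, the load offset is presumably just chosen at random (via the get_random_base() helper declared in boot.h below) before a walk of this kind runs.
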
+ config PFAULT def_bool y prompt "Pseudo page fault support" diff --git a/arch/s390/Makefile b/arch/s390/Makefile index e21053e5e0da..df1d6a150f30 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -16,10 +16,14 @@ KBUILD_AFLAGS_MODULE += -fPIC KBUILD_CFLAGS_MODULE += -fPIC KBUILD_AFLAGS += -m64 KBUILD_CFLAGS += -m64 +ifeq ($(CONFIG_RELOCATABLE),y) +KBUILD_CFLAGS += -fPIE +LDFLAGS_vmlinux := -pie +endif aflags_dwarf := -Wa,-gdwarf-2 -KBUILD_AFLAGS_DECOMPRESSOR := -m64 -D__ASSEMBLY__ +KBUILD_AFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -D__ASSEMBLY__ KBUILD_AFLAGS_DECOMPRESSOR += $(if $(CONFIG_DEBUG_INFO),$(aflags_dwarf)) -KBUILD_CFLAGS_DECOMPRESSOR := -m64 -O2 +KBUILD_CFLAGS_DECOMPRESSOR := $(CLANG_FLAGS) -m64 -O2 KBUILD_CFLAGS_DECOMPRESSOR += -DDISABLE_BRANCH_PROFILING -D__NO_FORTIFY KBUILD_CFLAGS_DECOMPRESSOR += -fno-delete-null-pointer-checks -msoft-float KBUILD_CFLAGS_DECOMPRESSOR += -fno-asynchronous-unwind-tables @@ -111,7 +115,7 @@ endif cfi := $(call as-instr,.cfi_startproc\n.cfi_val_offset 15$(comma)-160\n.cfi_endproc,-DCONFIG_AS_CFI_VAL_OFFSET=1) KBUILD_CFLAGS += -mbackchain -msoft-float $(cflags-y) -KBUILD_CFLAGS += -pipe -fno-strength-reduce -Wno-sign-compare +KBUILD_CFLAGS += -pipe -Wno-sign-compare KBUILD_CFLAGS += -fno-asynchronous-unwind-tables $(cfi) KBUILD_AFLAGS += $(aflags-y) $(cfi) export KBUILD_AFLAGS_DECOMPRESSOR diff --git a/arch/s390/boot/Makefile b/arch/s390/boot/Makefile index c844eaf24ed7..c51496bbac19 100644 --- a/arch/s390/boot/Makefile +++ b/arch/s390/boot/Makefile @@ -12,25 +12,35 @@ KBUILD_AFLAGS := $(KBUILD_AFLAGS_DECOMPRESSOR) KBUILD_CFLAGS := $(KBUILD_CFLAGS_DECOMPRESSOR) # -# Use -march=z900 for als.c to be able to print an error +# Use minimum architecture for als.c to be able to print an error # message if the kernel is started on a machine which is too old # -ifneq ($(CC_FLAGS_MARCH),-march=z900) +ifndef CONFIG_CC_IS_CLANG +CC_FLAGS_MARCH_MINIMUM := -march=z900 +else +CC_FLAGS_MARCH_MINIMUM := -march=z10 +endif + +ifneq ($(CC_FLAGS_MARCH),$(CC_FLAGS_MARCH_MINIMUM)) AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) -AFLAGS_head.o += -march=z900 +AFLAGS_head.o += $(CC_FLAGS_MARCH_MINIMUM) AFLAGS_REMOVE_mem.o += $(CC_FLAGS_MARCH) -AFLAGS_mem.o += -march=z900 +AFLAGS_mem.o += $(CC_FLAGS_MARCH_MINIMUM) CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) -CFLAGS_als.o += -march=z900 +CFLAGS_als.o += $(CC_FLAGS_MARCH_MINIMUM) CFLAGS_REMOVE_sclp_early_core.o += $(CC_FLAGS_MARCH) -CFLAGS_sclp_early_core.o += -march=z900 +CFLAGS_sclp_early_core.o += $(CC_FLAGS_MARCH_MINIMUM) endif CFLAGS_sclp_early_core.o += -I$(srctree)/drivers/s390/char -obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o string.o ebcdic.o -obj-y += sclp_early_core.o mem.o ipl_vmparm.o cmdline.o ctype.o -targets := bzImage startup.a section_cmp.boot.data $(obj-y) +obj-y := head.o als.o startup.o mem_detect.o ipl_parm.o ipl_report.o +obj-y += string.o ebcdic.o sclp_early_core.o mem.o ipl_vmparm.o cmdline.o +obj-y += ctype.o text_dma.o +obj-$(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) += uv.o +obj-$(CONFIG_RELOCATABLE) += machine_kexec_reloc.o +obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +targets := bzImage startup.a section_cmp.boot.data section_cmp.boot.preserved.data $(obj-y) subdir- := compressed OBJECTS := $(addprefix $(obj)/,$(obj-y)) @@ -48,7 +58,8 @@ define cmd_section_cmp touch $@ endef -$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data FORCE +OBJCOPYFLAGS_bzImage := --pad-to $$(readelf -s $(obj)/compressed/vmlinux | awk '/\<_end\>/ {print or(strtonum("0x"$$2),4095)+1}') 
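The new OBJCOPYFLAGS_bzImage line is worth a gloss: readelf -s prints the value of the _end symbol in hex, and the awk expression or(strtonum("0x"$$2),4095)+1 (the doubled $$ is only Makefile escaping) rounds that address up to the next 4 KiB boundary above it, so objcopy pads the image out to whole pages. A standalone check of the arithmetic — the helper name and addresses are made up for illustration:

	#include <assert.h>
	#include <stdint.h>

	/* Same rounding the awk program does: set the low 12 bits, add 1. */
	static uint64_t pad_to(uint64_t end)
	{
		return (end | 4095) + 1;
	}

	int main(void)
	{
		assert(pad_to(0x10001) == 0x11000); /* mid-page: rounds up      */
		assert(pad_to(0x10fff) == 0x11000); /* last byte of the page    */
		assert(pad_to(0x11000) == 0x12000); /* aligned: grows full page */
		return 0;
	}
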
+$(obj)/bzImage: $(obj)/compressed/vmlinux $(obj)/section_cmp.boot.data $(obj)/section_cmp.boot.preserved.data FORCE $(call if_changed,objcopy) $(obj)/section_cmp%: vmlinux $(obj)/compressed/vmlinux FORCE diff --git a/arch/s390/boot/als.c b/arch/s390/boot/als.c index f902215e9cd9..ff6801d401c4 100644 --- a/arch/s390/boot/als.c +++ b/arch/s390/boot/als.c @@ -99,7 +99,7 @@ static void facility_mismatch(void) print_machine_type(); print_missing_facilities(); sclp_early_printk("See Principles of Operations for facility bits\n"); - disabled_wait(0x8badcccc); + disabled_wait(); } void verify_facilities(void) diff --git a/arch/s390/boot/boot.h b/arch/s390/boot/boot.h index 82bc06346e05..ad57c2205a71 100644 --- a/arch/s390/boot/boot.h +++ b/arch/s390/boot/boot.h @@ -9,5 +9,10 @@ void setup_boot_command_line(void); void parse_boot_command_line(void); void setup_memory_end(void); void print_missing_facilities(void); +unsigned long get_random_base(unsigned long safe_addr); + +extern int kaslr_enabled; + +unsigned long read_ipl_report(unsigned long safe_offset); #endif /* BOOT_BOOT_H */ diff --git a/arch/s390/boot/compressed/decompressor.h b/arch/s390/boot/compressed/decompressor.h index e1c1f2ec60f4..c15eb7114d83 100644 --- a/arch/s390/boot/compressed/decompressor.h +++ b/arch/s390/boot/compressed/decompressor.h @@ -17,6 +17,11 @@ struct vmlinux_info { unsigned long bss_size; /* uncompressed image .bss size */ unsigned long bootdata_off; unsigned long bootdata_size; + unsigned long bootdata_preserved_off; + unsigned long bootdata_preserved_size; + unsigned long dynsym_start; + unsigned long rela_dyn_start; + unsigned long rela_dyn_end; }; extern char _vmlinux_info[]; diff --git a/arch/s390/boot/compressed/vmlinux.lds.S b/arch/s390/boot/compressed/vmlinux.lds.S index 7efc3938f595..112b8d9f1e4c 100644 --- a/arch/s390/boot/compressed/vmlinux.lds.S +++ b/arch/s390/boot/compressed/vmlinux.lds.S @@ -33,7 +33,29 @@ SECTIONS *(.data.*) _edata = . ; } + /* + * .dma section for code, data, ex_table that need to stay below 2 GB, + * even when the kernel is relocated above 2 GB. + */ + _sdma = .; + .dma.text : { + . = ALIGN(PAGE_SIZE); + _stext_dma = .; + *(.dma.text) + . = ALIGN(PAGE_SIZE); + _etext_dma = .; + } + .
= ALIGN(16); + .dma.ex_table : { + _start_dma_ex_table = .; + KEEP(*(.dma.ex_table)) + _stop_dma_ex_table = .; + } + .dma.data : { *(.dma.data) } + _edma = .; + BOOT_DATA + BOOT_DATA_PRESERVED /* * uncompressed image info used by the decompressor it should match diff --git a/arch/s390/boot/head.S b/arch/s390/boot/head.S index ce2cbbc41742..028aab03a9e7 100644 --- a/arch/s390/boot/head.S +++ b/arch/s390/boot/head.S @@ -305,7 +305,7 @@ ENTRY(startup_kdump) xc 0x300(256),0x300 xc 0xe00(256),0xe00 xc 0xf00(256),0xf00 - lctlg %c0,%c15,0x200(%r0) # initialize control registers + lctlg %c0,%c15,.Lctl-.LPG0(%r13) # load control registers stcke __LC_BOOT_CLOCK mvc __LC_LAST_UPDATE_CLOCK(8),__LC_BOOT_CLOCK+1 spt 6f-.LPG0(%r13) @@ -319,20 +319,54 @@ ENTRY(startup_kdump) .align 8 6: .long 0x7fffffff,0xffffffff +.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space + .quad 0 # cr1: primary space segment table + .quad .Lduct # cr2: dispatchable unit control table + .quad 0 # cr3: instruction authorization + .quad 0xffff # cr4: instruction authorization + .quad .Lduct # cr5: primary-aste origin + .quad 0 # cr6: I/O interrupts + .quad 0 # cr7: secondary space segment table + .quad 0 # cr8: access registers translation + .quad 0 # cr9: tracing off + .quad 0 # cr10: tracing off + .quad 0 # cr11: tracing off + .quad 0 # cr12: tracing off + .quad 0 # cr13: home space segment table + .quad 0xc0000000 # cr14: machine check handling off + .quad .Llinkage_stack # cr15: linkage stack operations + + .section .dma.data,"aw",@progbits +.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 + .long 0,0,0,0,0,0,0,0 +.Llinkage_stack: + .long 0,0,0x89000000,0,0,0,0x8a000000,0 + .align 64 +.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 + .align 128 +.Lduald:.rept 8 + .long 0x80000000,0,0,0 # invalid access-list entries + .endr + .previous + #include "head_kdump.S" # # params at 10400 (setup.h) +# Must be kept in sync with struct parmarea in setup.h # .org PARMAREA - .long 0,0 # IPL_DEVICE - .long 0,0 # INITRD_START - .long 0,0 # INITRD_SIZE - .long 0,0 # OLDMEM_BASE - .long 0,0 # OLDMEM_SIZE + .quad 0 # IPL_DEVICE + .quad 0 # INITRD_START + .quad 0 # INITRD_SIZE + .quad 0 # OLDMEM_BASE + .quad 0 # OLDMEM_SIZE .org COMMAND_LINE .byte "root=/dev/ram0 ro" .byte 0 - .org 0x11000 + .org EARLY_SCCB_OFFSET + .fill 4096 + + .org HEAD_END diff --git a/arch/s390/boot/ipl_parm.c b/arch/s390/boot/ipl_parm.c index 36beb56de021..3c49bde8aa5e 100644 --- a/arch/s390/boot/ipl_parm.c +++ b/arch/s390/boot/ipl_parm.c @@ -7,16 +7,19 @@ #include <asm/sections.h> #include <asm/boot_data.h> #include <asm/facility.h> +#include <asm/uv.h> #include "boot.h" char __bootdata(early_command_line)[COMMAND_LINE_SIZE]; -struct ipl_parameter_block __bootdata(early_ipl_block); -int __bootdata(early_ipl_block_valid); +struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_block_valid); unsigned long __bootdata(memory_end); int __bootdata(memory_end_set); int __bootdata(noexec_disabled); +int kaslr_enabled __section(.data); + static inline int __diag308(unsigned long subcode, void *addr) { register unsigned long _addr asm("0") = (unsigned long)addr; @@ -45,13 +48,15 @@ void store_ipl_parmblock(void) { int rc; - rc = __diag308(DIAG308_STORE, &early_ipl_block); + uv_set_shared(__pa(&ipl_block)); + rc = __diag308(DIAG308_STORE, &ipl_block); + uv_remove_shared(__pa(&ipl_block)); if (rc == DIAG308_RC_OK && - early_ipl_block.hdr.version <= IPL_MAX_SUPPORTED_VERSION) - early_ipl_block_valid = 1; + ipl_block.hdr.version <= 
IPL_MAX_SUPPORTED_VERSION) + ipl_block_valid = 1; } -static size_t scpdata_length(const char *buf, size_t count) +static size_t scpdata_length(const u8 *buf, size_t count) { while (count) { if (buf[count - 1] != '\0' && buf[count - 1] != ' ') @@ -68,26 +73,26 @@ static size_t ipl_block_get_ascii_scpdata(char *dest, size_t size, size_t i; int has_lowercase; - count = min(size - 1, scpdata_length(ipb->ipl_info.fcp.scp_data, - ipb->ipl_info.fcp.scp_data_len)); + count = min(size - 1, scpdata_length(ipb->fcp.scp_data, + ipb->fcp.scp_data_len)); if (!count) goto out; has_lowercase = 0; for (i = 0; i < count; i++) { - if (!isascii(ipb->ipl_info.fcp.scp_data[i])) { + if (!isascii(ipb->fcp.scp_data[i])) { count = 0; goto out; } - if (!has_lowercase && islower(ipb->ipl_info.fcp.scp_data[i])) + if (!has_lowercase && islower(ipb->fcp.scp_data[i])) has_lowercase = 1; } if (has_lowercase) - memcpy(dest, ipb->ipl_info.fcp.scp_data, count); + memcpy(dest, ipb->fcp.scp_data, count); else for (i = 0; i < count; i++) - dest[i] = tolower(ipb->ipl_info.fcp.scp_data[i]); + dest[i] = tolower(ipb->fcp.scp_data[i]); out: dest[count] = '\0'; return count; @@ -103,14 +108,14 @@ static void append_ipl_block_parm(void) delim = early_command_line + len; /* '\0' character position */ parm = early_command_line + len + 1; /* append right after '\0' */ - switch (early_ipl_block.hdr.pbt) { - case DIAG308_IPL_TYPE_CCW: + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: rc = ipl_block_get_ascii_vmparm( - parm, COMMAND_LINE_SIZE - len - 1, &early_ipl_block); + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; - case DIAG308_IPL_TYPE_FCP: + case IPL_PBT_FCP: rc = ipl_block_get_ascii_scpdata( - parm, COMMAND_LINE_SIZE - len - 1, &early_ipl_block); + parm, COMMAND_LINE_SIZE - len - 1, &ipl_block); break; } if (rc) { @@ -141,7 +146,7 @@ void setup_boot_command_line(void) strcpy(early_command_line, strim(COMMAND_LINE)); /* append IPL PARM data to the boot command line */ - if (early_ipl_block_valid) + if (!is_prot_virt_guest() && ipl_block_valid) append_ipl_block_parm(); } @@ -211,6 +216,7 @@ void parse_boot_command_line(void) char *args; int rc; + kaslr_enabled = IS_ENABLED(CONFIG_RANDOMIZE_BASE); args = strcpy(command_line_buf, early_command_line); while (*args) { args = next_arg(args, ¶m, &val); @@ -228,15 +234,21 @@ void parse_boot_command_line(void) if (!strcmp(param, "facilities")) modify_fac_list(val); + + if (!strcmp(param, "nokaslr")) + kaslr_enabled = 0; } } void setup_memory_end(void) { #ifdef CONFIG_CRASH_DUMP - if (!OLDMEM_BASE && early_ipl_block_valid && - early_ipl_block.hdr.pbt == DIAG308_IPL_TYPE_FCP && - early_ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) { + if (OLDMEM_BASE) { + kaslr_enabled = 0; + } else if (ipl_block_valid && + ipl_block.pb0_hdr.pbt == IPL_PBT_FCP && + ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) { + kaslr_enabled = 0; if (!sclp_early_get_hsa_size(&memory_end) && memory_end) memory_end_set = 1; } diff --git a/arch/s390/boot/ipl_report.c b/arch/s390/boot/ipl_report.c new file mode 100644 index 000000000000..0b4965573656 --- /dev/null +++ b/arch/s390/boot/ipl_report.c @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/init.h> +#include <linux/ctype.h> +#include <asm/ebcdic.h> +#include <asm/sclp.h> +#include <asm/sections.h> +#include <asm/boot_data.h> +#include <uapi/asm/ipl.h> +#include "boot.h" + +int __bootdata_preserved(ipl_secure_flag); + +unsigned long __bootdata_preserved(ipl_cert_list_addr); +unsigned long __bootdata_preserved(ipl_cert_list_size); + 
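find_bootdata_space() below searches for a free area by repeatedly testing a candidate range against the initrd, all components, and all certificates, using a half-open interval overlap test (the intersects() helper that follows). A standalone sketch of that predicate, with illustrative names:

#include <assert.h>
#include <stdbool.h>

/*
 * Two half-open ranges [a0, a0 + s0) and [a1, a1 + s1) overlap iff
 * each range starts before the other one ends.
 */
static bool ranges_overlap(unsigned long a0, unsigned long s0,
			   unsigned long a1, unsigned long s1)
{
	return a0 + s0 > a1 && a1 + s1 > a0;
}

int main(void)
{
	assert(ranges_overlap(0x1000, 0x100, 0x10ff, 0x10));	/* one byte shared */
	assert(!ranges_overlap(0x1000, 0x100, 0x1100, 0x10));	/* merely adjacent */
	return 0;
}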
+unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); + +#define for_each_rb_entry(entry, rb) \ + for (entry = rb->entries; \ + (void *) entry + sizeof(*entry) <= (void *) rb + rb->len; \ + entry++) + +static inline bool intersects(unsigned long addr0, unsigned long size0, + unsigned long addr1, unsigned long size1) +{ + return addr0 + size0 > addr1 && addr1 + size1 > addr0; +} + +static unsigned long find_bootdata_space(struct ipl_rb_components *comps, + struct ipl_rb_certificates *certs, + unsigned long safe_addr) +{ + struct ipl_rb_certificate_entry *cert; + struct ipl_rb_component_entry *comp; + size_t size; + + /* + * Find the length for the IPL report boot data + */ + early_ipl_comp_list_size = 0; + for_each_rb_entry(comp, comps) + early_ipl_comp_list_size += sizeof(*comp); + ipl_cert_list_size = 0; + for_each_rb_entry(cert, certs) + ipl_cert_list_size += sizeof(unsigned int) + cert->len; + size = ipl_cert_list_size + early_ipl_comp_list_size; + + /* + * Start from safe_addr to find a free memory area large + * enough for the IPL report boot data. This area is used + * for ipl_cert_list_addr/ipl_cert_list_size and + * early_ipl_comp_list_addr/early_ipl_comp_list_size. It must + * not overlap with any component or any certificate. + */ +repeat: + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && + intersects(INITRD_START, INITRD_SIZE, safe_addr, size)) + safe_addr = INITRD_START + INITRD_SIZE; + for_each_rb_entry(comp, comps) + if (intersects(safe_addr, size, comp->addr, comp->len)) { + safe_addr = comp->addr + comp->len; + goto repeat; + } + for_each_rb_entry(cert, certs) + if (intersects(safe_addr, size, cert->addr, cert->len)) { + safe_addr = cert->addr + cert->len; + goto repeat; + } + early_ipl_comp_list_addr = safe_addr; + ipl_cert_list_addr = safe_addr + early_ipl_comp_list_size; + + return safe_addr + size; +} + +static void copy_components_bootdata(struct ipl_rb_components *comps) +{ + struct ipl_rb_component_entry *comp, *ptr; + + ptr = (struct ipl_rb_component_entry *) early_ipl_comp_list_addr; + for_each_rb_entry(comp, comps) + memcpy(ptr++, comp, sizeof(*ptr)); +} + +static void copy_certificates_bootdata(struct ipl_rb_certificates *certs) +{ + struct ipl_rb_certificate_entry *cert; + void *ptr; + + ptr = (void *) ipl_cert_list_addr; + for_each_rb_entry(cert, certs) { + *(unsigned int *) ptr = cert->len; + ptr += sizeof(unsigned int); + memcpy(ptr, (void *) cert->addr, cert->len); + ptr += cert->len; + } +} + +unsigned long read_ipl_report(unsigned long safe_addr) +{ + struct ipl_rb_certificates *certs; + struct ipl_rb_components *comps; + struct ipl_pl_hdr *pl_hdr; + struct ipl_rl_hdr *rl_hdr; + struct ipl_rb_hdr *rb_hdr; + unsigned long tmp; + void *rl_end; + + /* + * Check if there is an IPL report by looking at the copy + * of the IPL parameter information block. + */ + if (!ipl_block_valid || + !(ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR)) + return safe_addr; + ipl_secure_flag = !!(ipl_block.hdr.flags & IPL_PL_FLAG_SIPL); + /* + * There is an IPL report; to find it, load the pointer to the + * IPL parameter information block from lowcore and skip past + * the IPL parameter list, then align the address to a double + * word boundary. 
+ */ + tmp = (unsigned long) S390_lowcore.ipl_parmblock_ptr; + pl_hdr = (struct ipl_pl_hdr *) tmp; + tmp = (tmp + pl_hdr->len + 7) & -8UL; + rl_hdr = (struct ipl_rl_hdr *) tmp; + /* Walk through the IPL report blocks in the IPL Report list */ + certs = NULL; + comps = NULL; + rl_end = (void *) rl_hdr + rl_hdr->len; + rb_hdr = (void *) rl_hdr + sizeof(*rl_hdr); + while ((void *) rb_hdr + sizeof(*rb_hdr) < rl_end && + (void *) rb_hdr + rb_hdr->len <= rl_end) { + + switch (rb_hdr->rbt) { + case IPL_RBT_CERTIFICATES: + certs = (struct ipl_rb_certificates *) rb_hdr; + break; + case IPL_RBT_COMPONENTS: + comps = (struct ipl_rb_components *) rb_hdr; + break; + default: + break; + } + + rb_hdr = (void *) rb_hdr + rb_hdr->len; + } + + /* + * With either the component list or the certificate list + * missing, the kernel will stay ignorant of secure IPL. + */ + if (!comps || !certs) + return safe_addr; + + /* + * Copy component and certificate list to a safe area + * where the decompressed kernel can find them. + */ + safe_addr = find_bootdata_space(comps, certs, safe_addr); + copy_components_bootdata(comps); + copy_certificates_bootdata(certs); + + return safe_addr; +} diff --git a/arch/s390/boot/kaslr.c b/arch/s390/boot/kaslr.c new file mode 100644 index 000000000000..3bdd8132e56b --- /dev/null +++ b/arch/s390/boot/kaslr.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corp. 2019 + */ +#include <asm/mem_detect.h> +#include <asm/cpacf.h> +#include <asm/timex.h> +#include <asm/sclp.h> +#include "compressed/decompressor.h" + +#define PRNG_MODE_TDES 1 +#define PRNG_MODE_SHA512 2 +#define PRNG_MODE_TRNG 3 + +struct prno_parm { + u32 res; + u32 reseed_counter; + u64 stream_bytes; + u8 V[112]; + u8 C[112]; +}; + +struct prng_parm { + u8 parm_block[32]; + u32 reseed_counter; + u64 byte_counter; +}; + +static int check_prng(void) +{ + if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) { + sclp_early_printk("KASLR disabled: CPU has no PRNG\n"); + return 0; + } + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + return PRNG_MODE_TRNG; + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_SHA512_DRNG_GEN)) + return PRNG_MODE_SHA512; + else + return PRNG_MODE_TDES; +} + +static unsigned long get_random(unsigned long limit) +{ + struct prng_parm prng = { + /* initial parameter block for tdes mode, copied from libica */ + .parm_block = { + 0x0F, 0x2B, 0x8E, 0x63, 0x8C, 0x8E, 0xD2, 0x52, + 0x64, 0xB7, 0xA0, 0x7B, 0x75, 0x28, 0xB8, 0xF4, + 0x75, 0x5F, 0xD2, 0xA6, 0x8D, 0x97, 0x11, 0xFF, + 0x49, 0xD8, 0x23, 0xF3, 0x7E, 0x21, 0xEC, 0xA0 + }, + }; + unsigned long seed, random; + struct prno_parm prno; + __u64 entropy[4]; + int mode, i; + + mode = check_prng(); + seed = get_tod_clock_fast(); + switch (mode) { + case PRNG_MODE_TRNG: + cpacf_trng(NULL, 0, (u8 *) &random, sizeof(random)); + break; + case PRNG_MODE_SHA512: + cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, &prno, NULL, 0, + (u8 *) &seed, sizeof(seed)); + cpacf_prno(CPACF_PRNO_SHA512_DRNG_GEN, &prno, (u8 *) &random, + sizeof(random), NULL, 0); + break; + case PRNG_MODE_TDES: + /* add entropy */ + *(unsigned long *) prng.parm_block ^= seed; + for (i = 0; i < 16; i++) { + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, + (char *) entropy, (char *) entropy, + sizeof(entropy)); + memcpy(prng.parm_block, entropy, sizeof(entropy)); + } + random = seed; + cpacf_kmc(CPACF_KMC_PRNG, prng.parm_block, (u8 *) &random, + (u8 *) &random, sizeof(random)); + break; + default: + random = 0; + } + return random % limit; +} + +unsigned long get_random_base(unsigned 
long safe_addr) +{ + unsigned long base, start, end, kernel_size; + unsigned long block_sum, offset; + int i; + + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE) { + if (safe_addr < INITRD_START + INITRD_SIZE) + safe_addr = INITRD_START + INITRD_SIZE; + } + safe_addr = ALIGN(safe_addr, THREAD_SIZE); + + kernel_size = vmlinux.image_size + vmlinux.bss_size; + block_sum = 0; + for_each_mem_detect_block(i, &start, &end) { + if (memory_end_set) { + if (start >= memory_end) + break; + if (end > memory_end) + end = memory_end; + } + if (end - start < kernel_size) + continue; + block_sum += end - start - kernel_size; + } + if (!block_sum) { + sclp_early_printk("KASLR disabled: not enough memory\n"); + return 0; + } + + base = get_random(block_sum); + if (base == 0) + return 0; + if (base < safe_addr) + base = safe_addr; + block_sum = offset = 0; + for_each_mem_detect_block(i, &start, &end) { + if (memory_end_set) { + if (start >= memory_end) + break; + if (end > memory_end) + end = memory_end; + } + if (end - start < kernel_size) + continue; + block_sum += end - start - kernel_size; + if (base <= block_sum) { + base = start + base - offset; + base = ALIGN_DOWN(base, THREAD_SIZE); + break; + } + offset = block_sum; + } + return base; +} diff --git a/arch/s390/boot/machine_kexec_reloc.c b/arch/s390/boot/machine_kexec_reloc.c new file mode 100644 index 000000000000..b7a5d0f72097 --- /dev/null +++ b/arch/s390/boot/machine_kexec_reloc.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "../kernel/machine_kexec_reloc.c" diff --git a/arch/s390/boot/mem_detect.c b/arch/s390/boot/mem_detect.c index 4cb771ba13fa..5d316fe40480 100644 --- a/arch/s390/boot/mem_detect.c +++ b/arch/s390/boot/mem_detect.c @@ -25,7 +25,7 @@ static void *mem_detect_alloc_extended(void) { unsigned long offset = ALIGN(mem_safe_offset(), sizeof(u64)); - if (IS_ENABLED(BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && + if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && INITRD_START && INITRD_SIZE && INITRD_START < offset + ENTRIES_EXTENDED_MAX) offset = ALIGN(INITRD_START + INITRD_SIZE, sizeof(u64)); diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index bdfc5549a299..7b0d05414618 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -1,11 +1,55 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/string.h> +#include <linux/elf.h> +#include <asm/sections.h> #include <asm/setup.h> +#include <asm/kexec.h> #include <asm/sclp.h> +#include <asm/diag.h> +#include <asm/uv.h> #include "compressed/decompressor.h" #include "boot.h" extern char __boot_data_start[], __boot_data_end[]; +extern char __boot_data_preserved_start[], __boot_data_preserved_end[]; +unsigned long __bootdata_preserved(__kaslr_offset); + +/* + * Some code and data needs to stay below 2 GB, even when the kernel would be + * relocated above 2 GB, because it has to use 31 bit addresses. + * Such code and data is part of the .dma section, and its location is passed + * over to the decompressed / relocated kernel via the .boot.preserved.data + * section. 
+ */ +extern char _sdma[], _edma[]; +extern char _stext_dma[], _etext_dma[]; +extern struct exception_table_entry _start_dma_ex_table[]; +extern struct exception_table_entry _stop_dma_ex_table[]; +unsigned long __bootdata_preserved(__sdma) = __pa(&_sdma); +unsigned long __bootdata_preserved(__edma) = __pa(&_edma); +unsigned long __bootdata_preserved(__stext_dma) = __pa(&_stext_dma); +unsigned long __bootdata_preserved(__etext_dma) = __pa(&_etext_dma); +struct exception_table_entry * + __bootdata_preserved(__start_dma_ex_table) = _start_dma_ex_table; +struct exception_table_entry * + __bootdata_preserved(__stop_dma_ex_table) = _stop_dma_ex_table; + +int _diag210_dma(struct diag210 *addr); +int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode); +int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode); +void _diag0c_dma(struct hypfs_diag0c_entry *entry); +void _diag308_reset_dma(void); +struct diag_ops __bootdata_preserved(diag_dma_ops) = { + .diag210 = _diag210_dma, + .diag26c = _diag26c_dma, + .diag14 = _diag14_dma, + .diag0c = _diag0c_dma, + .diag308_reset = _diag308_reset_dma +}; +static struct diag210 _diag210_tmp_dma __section(".dma.data"); +struct diag210 *__bootdata_preserved(__diag210_tmp_dma) = &_diag210_tmp_dma; +void _swsusp_reset_dma(void); +unsigned long __bootdata_preserved(__swsusp_reset_dma) = __pa(_swsusp_reset_dma); void error(char *x) { @@ -13,7 +57,7 @@ void error(char *x) sclp_early_printk(x); sclp_early_printk("\n\n -- System halted"); - disabled_wait(0xdeadbeef); + disabled_wait(); } #ifdef CONFIG_KERNEL_UNCOMPRESSED @@ -23,19 +67,16 @@ unsigned long mem_safe_offset(void) } #endif -static void rescue_initrd(void) +static void rescue_initrd(unsigned long addr) { - unsigned long min_initrd_addr; - if (!IS_ENABLED(CONFIG_BLK_DEV_INITRD)) return; if (!INITRD_START || !INITRD_SIZE) return; - min_initrd_addr = mem_safe_offset(); - if (min_initrd_addr <= INITRD_START) + if (addr <= INITRD_START) return; - memmove((void *)min_initrd_addr, (void *)INITRD_START, INITRD_SIZE); - INITRD_START = min_initrd_addr; + memmove((void *)addr, (void *)INITRD_START, INITRD_SIZE); + INITRD_START = addr; } static void copy_bootdata(void) @@ -43,23 +84,81 @@ static void copy_bootdata(void) if (__boot_data_end - __boot_data_start != vmlinux.bootdata_size) error(".boot.data section size mismatch"); memcpy((void *)vmlinux.bootdata_off, __boot_data_start, vmlinux.bootdata_size); + if (__boot_data_preserved_end - __boot_data_preserved_start != vmlinux.bootdata_preserved_size) + error(".boot.preserved.data section size mismatch"); + memcpy((void *)vmlinux.bootdata_preserved_off, __boot_data_preserved_start, vmlinux.bootdata_preserved_size); +} + +static void handle_relocs(unsigned long offset) +{ + Elf64_Rela *rela_start, *rela_end, *rela; + int r_type, r_sym, rc; + Elf64_Addr loc, val; + Elf64_Sym *dynsym; + + rela_start = (Elf64_Rela *) vmlinux.rela_dyn_start; + rela_end = (Elf64_Rela *) vmlinux.rela_dyn_end; + dynsym = (Elf64_Sym *) vmlinux.dynsym_start; + for (rela = rela_start; rela < rela_end; rela++) { + loc = rela->r_offset + offset; + val = rela->r_addend + offset; + r_sym = ELF64_R_SYM(rela->r_info); + if (r_sym) + val += dynsym[r_sym].st_value; + r_type = ELF64_R_TYPE(rela->r_info); + rc = arch_kexec_do_relocs(r_type, (void *) loc, val, 0); + if (rc) + error("Unknown relocation type"); + } } void startup_kernel(void) { + unsigned long random_lma; + unsigned long safe_addr; void *img; - rescue_initrd(); - sclp_early_read_info(); store_ipl_parmblock(); + 
safe_addr = mem_safe_offset(); + safe_addr = read_ipl_report(safe_addr); + uv_query_info(); + rescue_initrd(safe_addr); + sclp_early_read_info(); setup_boot_command_line(); parse_boot_command_line(); setup_memory_end(); detect_memory(); + + random_lma = __kaslr_offset = 0; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE) && kaslr_enabled) { + random_lma = get_random_base(safe_addr); + if (random_lma) { + __kaslr_offset = random_lma - vmlinux.default_lma; + img = (void *)vmlinux.default_lma; + vmlinux.default_lma += __kaslr_offset; + vmlinux.entry += __kaslr_offset; + vmlinux.bootdata_off += __kaslr_offset; + vmlinux.bootdata_preserved_off += __kaslr_offset; + vmlinux.rela_dyn_start += __kaslr_offset; + vmlinux.rela_dyn_end += __kaslr_offset; + vmlinux.dynsym_start += __kaslr_offset; + } + } + if (!IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) { img = decompress_kernel(); memmove((void *)vmlinux.default_lma, img, vmlinux.image_size); - } + } else if (__kaslr_offset) memcpy((void *)vmlinux.default_lma, img, vmlinux.image_size); + copy_bootdata(); + if (IS_ENABLED(CONFIG_RELOCATABLE)) + handle_relocs(__kaslr_offset); + + if (__kaslr_offset) { + /* Clear non-relocated kernel */ + if (IS_ENABLED(CONFIG_KERNEL_UNCOMPRESSED)) + memset(img, 0, vmlinux.image_size); + } vmlinux.entry(); } diff --git a/arch/s390/boot/text_dma.S b/arch/s390/boot/text_dma.S new file mode 100644 index 000000000000..9715715c4c28 --- /dev/null +++ b/arch/s390/boot/text_dma.S @@ -0,0 +1,184 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code that needs to run below 2 GB. + * + * Copyright IBM Corp. 2019 + */ + +#include <linux/linkage.h> +#include <asm/errno.h> +#include <asm/sigp.h> + +#ifdef CC_USING_EXPOLINE + .pushsection .dma.text.__s390_indirect_jump_r14,"axG" +__dma__s390_indirect_jump_r14: + larl %r1,0f + ex 0,0(%r1) + j . +0: br %r14 + .popsection +#endif + + .section .dma.text,"ax" +/* + * Simplified version of expoline thunk. The normal thunks cannot be used here, + * because they might be more than 2 GB away, and not reachable by the relative + * branch. No comdat, exrl, etc. optimizations used here, because it only + * affects a few functions that are not performance-relevant. 
+ */ + .macro BR_EX_DMA_r14 +#ifdef CC_USING_EXPOLINE + jg __dma__s390_indirect_jump_r14 +#else + br %r14 +#endif + .endm + +/* + * int _diag14_dma(unsigned long rx, unsigned long ry1, unsigned long subcode) + */ +ENTRY(_diag14_dma) + lgr %r1,%r2 + lgr %r2,%r3 + lgr %r3,%r4 + lhi %r5,-EIO + sam31 + diag %r1,%r2,0x14 +.Ldiag14_ex: + ipm %r5 + srl %r5,28 +.Ldiag14_fault: + sam64 + lgfr %r2,%r5 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag14_ex, .Ldiag14_fault) +ENDPROC(_diag14_dma) + +/* + * int _diag210_dma(struct diag210 *addr) + */ +ENTRY(_diag210_dma) + lgr %r1,%r2 + lhi %r2,-1 + sam31 + diag %r1,%r0,0x210 +.Ldiag210_ex: + ipm %r2 + srl %r2,28 +.Ldiag210_fault: + sam64 + lgfr %r2,%r2 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag210_ex, .Ldiag210_fault) +ENDPROC(_diag210_dma) + +/* + * int _diag26c_dma(void *req, void *resp, enum diag26c_sc subcode) + */ +ENTRY(_diag26c_dma) + lghi %r5,-EOPNOTSUPP + sam31 + diag %r2,%r4,0x26c +.Ldiag26c_ex: + sam64 + lgfr %r2,%r5 + BR_EX_DMA_r14 + EX_TABLE_DMA(.Ldiag26c_ex, .Ldiag26c_ex) +ENDPROC(_diag26c_dma) + +/* + * void _diag0c_dma(struct hypfs_diag0c_entry *entry) + */ +ENTRY(_diag0c_dma) + sam31 + diag %r2,%r2,0x0c + sam64 + BR_EX_DMA_r14 +ENDPROC(_diag0c_dma) + +/* + * void _swsusp_reset_dma(void) + */ +ENTRY(_swsusp_reset_dma) + larl %r1,restart_entry + larl %r2,.Lrestart_diag308_psw + og %r1,0(%r2) + stg %r1,0(%r0) + lghi %r0,0 + diag %r0,%r0,0x308 +restart_entry: + lhi %r1,1 + sigp %r1,%r0,SIGP_SET_ARCHITECTURE + sam64 + BR_EX_DMA_r14 +ENDPROC(_swsusp_reset_dma) + +/* + * void _diag308_reset_dma(void) + * + * Calls diag 308 subcode 1 and continues execution + */ +ENTRY(_diag308_reset_dma) + larl %r4,.Lctlregs # Save control registers + stctg %c0,%c15,0(%r4) + lg %r2,0(%r4) # Disable lowcore protection + nilh %r2,0xefff + larl %r4,.Lctlreg0 + stg %r2,0(%r4) + lctlg %c0,%c0,0(%r4) + larl %r4,.Lfpctl # Floating point control register + stfpc 0(%r4) + larl %r4,.Lprefix # Save prefix register + stpx 0(%r4) + larl %r4,.Lprefix_zero # Set prefix register to 0 + spx 0(%r4) + larl %r4,.Lcontinue_psw # Save PSW flags + epsw %r2,%r3 + stm %r2,%r3,0(%r4) + larl %r4,restart_part2 # Setup restart PSW at absolute 0 + larl %r3,.Lrestart_diag308_psw + og %r4,0(%r3) # Save PSW + lghi %r3,0 + sturg %r4,%r3 # Use sturg, because of large pages + lghi %r1,1 + lghi %r0,0 + diag %r0,%r1,0x308 +restart_part2: + lhi %r0,0 # Load r0 with zero + lhi %r1,2 # Use mode 2 = ESAME (dump) + sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode + sam64 # Switch to 64 bit addressing mode + larl %r4,.Lctlregs # Restore control registers + lctlg %c0,%c15,0(%r4) + larl %r4,.Lfpctl # Restore floating point ctl register + lfpc 0(%r4) + larl %r4,.Lprefix # Restore prefix register + spx 0(%r4) + larl %r4,.Lcontinue_psw # Restore PSW flags + lpswe 0(%r4) +.Lcontinue: + BR_EX_DMA_r14 +ENDPROC(_diag308_reset_dma) + + .section .dma.data,"aw",@progbits +.align 8 +.Lrestart_diag308_psw: + .long 0x00080000,0x80000000 + +.align 8 +.Lcontinue_psw: + .quad 0,.Lcontinue + +.align 8 +.Lctlreg0: + .quad 0 +.Lctlregs: + .rept 16 + .quad 0 + .endr +.Lfpctl: + .long 0 +.Lprefix: + .long 0 +.Lprefix_zero: + .long 0 diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c new file mode 100644 index 000000000000..ed007f4a6444 --- /dev/null +++ b/arch/s390/boot/uv.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <asm/uv.h> +#include <asm/facility.h> +#include <asm/sections.h> + +int __bootdata_preserved(prot_virt_guest); + +void uv_query_info(void) +{ + struct uv_cb_qui uvcb = { + .header.cmd = 
UVC_CMD_QUI, + .header.len = sizeof(uvcb) + }; + + if (!test_facility(158)) + return; + + if (uv_call(0, (uint64_t)&uvcb)) + return; + + if (test_bit_inv(BIT_UVC_CMD_SET_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list) && + test_bit_inv(BIT_UVC_CMD_REMOVE_SHARED_ACCESS, (unsigned long *)uvcb.inst_calls_list)) + prot_virt_guest = 1; +} diff --git a/arch/s390/configs/debug_defconfig b/arch/s390/configs/debug_defconfig index 9824c7bad9d4..b0920b35f87b 100644 --- a/arch/s390/configs/debug_defconfig +++ b/arch/s390/configs/debug_defconfig @@ -64,6 +64,7 @@ CONFIG_NUMA=y CONFIG_PREEMPT=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 4fcbe5792744..09aa5cb14873 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -65,6 +65,7 @@ CONFIG_NR_CPUS=512 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_EXPOLINE=y CONFIG_EXPOLINE_AUTO=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/crypto/crc32be-vx.S b/arch/s390/crypto/crc32be-vx.S index 2bf01ba44107..0099044e2c86 100644 --- a/arch/s390/crypto/crc32be-vx.S +++ b/arch/s390/crypto/crc32be-vx.S @@ -207,5 +207,6 @@ ENTRY(crc32_be_vgfm_16) .Ldone: VLGVF %r2,%v2,3 BR_EX %r14 +ENDPROC(crc32_be_vgfm_16) .previous diff --git a/arch/s390/crypto/crc32le-vx.S b/arch/s390/crypto/crc32le-vx.S index 7d6f568bd3ad..71caf0f4ec08 100644 --- a/arch/s390/crypto/crc32le-vx.S +++ b/arch/s390/crypto/crc32le-vx.S @@ -105,13 +105,14 @@ ENTRY(crc32_le_vgfm_16) larl %r5,.Lconstants_CRC_32_LE j crc32_le_vgfm_generic +ENDPROC(crc32_le_vgfm_16) ENTRY(crc32c_le_vgfm_16) larl %r5,.Lconstants_CRC_32C_LE j crc32_le_vgfm_generic +ENDPROC(crc32c_le_vgfm_16) - -crc32_le_vgfm_generic: +ENTRY(crc32_le_vgfm_generic) /* Load CRC-32 constants */ VLM CONST_PERM_LE2BE,CONST_CRC_POLY,0,%r5 @@ -267,5 +268,6 @@ crc32_le_vgfm_generic: .Ldone: VLGVF %r2,%v2,2 BR_EX %r14 +ENDPROC(crc32_le_vgfm_generic) .previous diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index 0d15383d0ff1..1f9ab24dc048 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -224,24 +224,11 @@ static int des3_setkey(struct crypto_tfm *tfm, const u8 *key, unsigned int key_len) { struct s390_des_ctx *ctx = crypto_tfm_ctx(tfm); + int err; - if (!(crypto_memneq(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && - crypto_memneq(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], - DES_KEY_SIZE)) && - (tfm->crt_flags & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) { - tfm->crt_flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; - } - - /* in fips mode, ensure k1 != k2 and k2 != k3 and k1 != k3 */ - if (fips_enabled && - !(crypto_memneq(key, &key[DES_KEY_SIZE], DES_KEY_SIZE) && - crypto_memneq(&key[DES_KEY_SIZE], &key[DES_KEY_SIZE * 2], - DES_KEY_SIZE) && - crypto_memneq(key, &key[DES_KEY_SIZE * 2], DES_KEY_SIZE))) { - tfm->crt_flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; - } + err = __des3_verify_key(&tfm->crt_flags, key); + if (unlikely(err)) + return err; memcpy(ctx->key, key, key_len); return 0; diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index a97a1802cfb4..12cca467af7d 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -61,6 +61,7 @@ static unsigned int prng_reseed_limit; module_param_named(reseed_limit, prng_reseed_limit, int, 0); MODULE_PARM_DESC(prng_reseed_limit, "PRNG reseed limit"); +static bool 
trng_available; /* * Any one who considers arithmetical methods of producing random digits is, @@ -115,46 +116,68 @@ static const u8 initial_parm_block[32] __initconst = { /* * generate_entropy: - * This algorithm produces 64 bytes of entropy data based on 1024 - * individual stckf() invocations assuming that each stckf() value - * contributes 0.25 bits of entropy. So the caller gets 256 bit - * entropy per 64 byte or 4 bits entropy per byte. + * This function fills a given buffer with random bytes. The random bytes + * passed back are assumed to contain at least 50% entropy - meaning + * a 64-byte buffer has at least 64 * 8 / 2 = 256 bits of entropy. + * Within the function the entropy generation is done in chunks of 64 bytes. + * So the caller should also ask for buffer fill in multiples of 64 bytes. + * The generation of the entropy is based on the assumption that every stckf() + * invocation produces 0.5 bits of entropy. To accumulate 256 bits of entropy + * at least 512 stckf() values are needed. The entropy-relevant part of the + * stckf value is bit 51 (counting starts at the left with bit nr 0), so + * here we use the lower 4 bytes and XOR the values into 2k of buffer space. + * To be on the safe side, if there is ever a problem with stckf() the + * other half of the page buffer is filled with bytes from urandom via + * get_random_bytes(), so this function consumes 2k of urandom for each + * requested 64 bytes of output data. Finally the buffer page is condensed into + * a 64-byte value by hashing with SHA-512. */ static int generate_entropy(u8 *ebuf, size_t nbytes) { int n, ret = 0; - u8 *pg, *h, hash[64]; - - /* allocate 2 pages */ - pg = (u8 *) __get_free_pages(GFP_KERNEL, 1); + u8 *pg, pblock[80] = { + /* 8 x 64 bit init values */ + 0x6A, 0x09, 0xE6, 0x67, 0xF3, 0xBC, 0xC9, 0x08, + 0xBB, 0x67, 0xAE, 0x85, 0x84, 0xCA, 0xA7, 0x3B, + 0x3C, 0x6E, 0xF3, 0x72, 0xFE, 0x94, 0xF8, 0x2B, + 0xA5, 0x4F, 0xF5, 0x3A, 0x5F, 0x1D, 0x36, 0xF1, + 0x51, 0x0E, 0x52, 0x7F, 0xAD, 0xE6, 0x82, 0xD1, + 0x9B, 0x05, 0x68, 0x8C, 0x2B, 0x3E, 0x6C, 0x1F, + 0x1F, 0x83, 0xD9, 0xAB, 0xFB, 0x41, 0xBD, 0x6B, + 0x5B, 0xE0, 0xCD, 0x19, 0x13, 0x7E, 0x21, 0x79, + /* 128 bit counter total message bit length */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x00 }; + + /* allocate one page stckf buffer */ + pg = (u8 *) __get_free_page(GFP_KERNEL); if (!pg) { prng_errorflag = PRNG_GEN_ENTROPY_FAILED; return -ENOMEM; } + /* fill the ebuf in chunks of 64 bytes each */ while (nbytes) { - /* fill pages with urandom bytes */ - get_random_bytes(pg, 2*PAGE_SIZE); - /* exor pages with 1024 stckf values */ - for (n = 0; n < 2 * PAGE_SIZE / sizeof(u64); n++) { - u64 *p = ((u64 *)pg) + n; + /* fill lower 2k with urandom bytes */ + get_random_bytes(pg, PAGE_SIZE / 2); + /* XOR upper 2k with 512 stckf values, offset 4 bytes each */ + for (n = 0; n < 512; n++) { + int offset = (PAGE_SIZE / 2) + (n * 4) - 4; + u64 *p = (u64 *)(pg + offset); *p ^= get_tod_clock_fast(); } - n = (nbytes < sizeof(hash)) ? nbytes : sizeof(hash); - if (n < sizeof(hash)) - h = hash; - else - h = ebuf; - /* hash over the filled pages */ - cpacf_kimd(CPACF_KIMD_SHA_512, h, pg, 2*PAGE_SIZE); - if (n < sizeof(hash)) - memcpy(ebuf, hash, n); + /* hash over the filled page */ + cpacf_klmd(CPACF_KLMD_SHA_512, pblock, pg, PAGE_SIZE); + n = (nbytes < 64) ? 
nbytes : 64; + memcpy(ebuf, pblock, n); ret += n; ebuf += n; nbytes -= n; } - free_pages((unsigned long)pg, 1); + memzero_explicit(pblock, sizeof(pblock)); + memzero_explicit(pg, PAGE_SIZE); + free_page((unsigned long)pg); return ret; } @@ -344,8 +367,8 @@ static int __init prng_sha512_selftest(void) static int __init prng_sha512_instantiate(void) { - int ret, datalen; - u8 seed[64 + 32 + 16]; + int ret, datalen, seedlen; + u8 seed[128 + 16]; pr_debug("prng runs in SHA-512 mode " "with chunksize=%d and reseed_limit=%u\n", @@ -368,16 +391,36 @@ static int __init prng_sha512_instantiate(void) if (ret) goto outfree; - /* generate initial seed bytestring, with 256 + 128 bits entropy */ - ret = generate_entropy(seed, 64 + 32); - if (ret != 64 + 32) - goto outfree; - /* followed by 16 bytes of unique nonce */ - get_tod_clock_ext(seed + 64 + 32); + /* generate initial seed, we need at least 256 + 128 bits of entropy. */ + if (trng_available) { + /* + * Trng available, so use it. The trng works in chunks of + * 32 bytes and produces 100% entropy. So we pull 64 bytes + * which gives us 512 bits of entropy. + */ + seedlen = 2 * 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* + * No trng available, so use the generate_entropy() function. + * This function works in 64-byte chunks and produces + * 50% entropy. So we pull 2*64 bytes which gives us 512 bits + * of entropy. + */ + seedlen = 2 * 64; + ret = generate_entropy(seed, seedlen); + if (ret != seedlen) + goto outfree; + } + + /* append 16 bytes of unique nonce to the seed */ + get_tod_clock_ext(seed + seedlen); + seedlen += 16; - /* initial seed of the prno drng */ + /* now do the initial seed of the prno drng */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + &prng_data->prnows, NULL, 0, seed, seedlen); + memzero_explicit(seed, sizeof(seed)); /* if fips mode is enabled, generate a first block of random bytes for the FIPS 140-2 Conditional Self Test */ @@ -405,17 +448,26 @@ static void prng_sha512_deinstantiate(void) static int prng_sha512_reseed(void) { - int ret; + int ret, seedlen; u8 seed[64]; - /* fetch 256 bits of fresh entropy */ - ret = generate_entropy(seed, sizeof(seed)); - if (ret != sizeof(seed)) - return ret; + /* We need at least 256 bits of fresh entropy for reseeding */ + if (trng_available) { + /* trng produces 256 bits of entropy in 32 bytes */ + seedlen = 32; + cpacf_trng(NULL, 0, seed, seedlen); + } else { + /* generate_entropy() produces 256 bits of entropy in 64 bytes */ + seedlen = 64; + ret = generate_entropy(seed, seedlen); + if (ret != sizeof(seed)) + return ret; + } /* do a reseed of the prno drng with this bytestring */ cpacf_prno(CPACF_PRNO_SHA512_DRNG_SEED, - &prng_data->prnows, NULL, 0, seed, sizeof(seed)); + &prng_data->prnows, NULL, 0, seed, seedlen); + memzero_explicit(seed, sizeof(seed)); return 0; } @@ -592,6 +644,7 @@ static ssize_t prng_sha512_read(struct file *file, char __user *ubuf, ret = -EFAULT; break; } + memzero_explicit(p, n); ubuf += n; nbytes -= n; ret += n; @@ -773,6 +826,10 @@ static int __init prng_init(void) if (!cpacf_query_func(CPACF_KMC, CPACF_KMC_PRNG)) return -EOPNOTSUPP; + /* check if TRNG subfunction is available */ + if (cpacf_query_func(CPACF_PRNO, CPACF_PRNO_TRNG)) + trng_available = true; + /* choose prng mode */ if (prng_mode != PRNG_MODE_TDES) { /* check for MSA5 support for PRNO operations */ diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 4d58a92b5d97..c59b922cb6c5 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -39,6 
+39,7 @@ CONFIG_NR_CPUS=256 CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_KEXEC_FILE=y +CONFIG_KEXEC_VERIFY_SIG=y CONFIG_CRASH_DUMP=y CONFIG_HIBERNATION=y CONFIG_PM_DEBUG=y diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c index 72e3140fafb5..3235e4d82f2d 100644 --- a/arch/s390/hypfs/hypfs_diag0c.c +++ b/arch/s390/hypfs/hypfs_diag0c.c @@ -16,26 +16,12 @@ #define DBFS_D0C_HDR_VERSION 0 /* - * Execute diagnose 0c in 31 bit mode - */ -static void diag0c(struct hypfs_diag0c_entry *entry) -{ - diag_stat_inc(DIAG_STAT_X00C); - asm volatile ( - " sam31\n" - " diag %0,%0,0x0c\n" - " sam64\n" - : /* no output register */ - : "a" (entry) - : "memory"); -} - -/* * Get hypfs_diag0c_entry from CPU vector and store diag0c data */ static void diag0c_fn(void *data) { - diag0c(((void **) data)[smp_processor_id()]); + diag_stat_inc(DIAG_STAT_X00C); + diag_dma_ops.diag0c(((void **) data)[smp_processor_id()]); } /* diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 12d77cb11fe5..2531f673f099 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -20,7 +20,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h -generic-y += rwsem.h +generic-y += mmiowb.h generic-y += trace_clock.h generic-y += unaligned.h generic-y += word-at-a-time.h diff --git a/arch/s390/include/asm/airq.h b/arch/s390/include/asm/airq.h index fcf539efb32f..c10d2ee2dfda 100644 --- a/arch/s390/include/asm/airq.h +++ b/arch/s390/include/asm/airq.h @@ -14,7 +14,7 @@ struct airq_struct { struct hlist_node list; /* Handler queueing. */ - void (*handler)(struct airq_struct *); /* Thin-interrupt handler */ + void (*handler)(struct airq_struct *airq, bool floating); u8 *lsi_ptr; /* Local-Summary-Indicator pointer */ u8 lsi_mask; /* Local-Summary-Indicator mask */ u8 isc; /* Interrupt-subclass */ @@ -35,13 +35,15 @@ struct airq_iv { unsigned int *data; /* 32 bit value associated with each bit */ unsigned long bits; /* Number of bits in the vector */ unsigned long end; /* Number of highest allocated bit + 1 */ + unsigned long flags; /* Allocation flags */ spinlock_t lock; /* Lock to protect alloc & free */ }; -#define AIRQ_IV_ALLOC 1 /* Use an allocation bit mask */ -#define AIRQ_IV_BITLOCK 2 /* Allocate the lock bit mask */ -#define AIRQ_IV_PTR 4 /* Allocate the ptr array */ -#define AIRQ_IV_DATA 8 /* Allocate the data array */ +#define AIRQ_IV_ALLOC 1 /* Use an allocation bit mask */ +#define AIRQ_IV_BITLOCK 2 /* Allocate the lock bit mask */ +#define AIRQ_IV_PTR 4 /* Allocate the ptr array */ +#define AIRQ_IV_DATA 8 /* Allocate the data array */ +#define AIRQ_IV_CACHELINE 16 /* Cacheline alignment for the vector */ struct airq_iv *airq_iv_create(unsigned long bits, unsigned long flags); void airq_iv_release(struct airq_iv *iv); diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index d1f8a4d94cca..9900d655014c 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -73,7 +73,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __atomic64_or(mask, addr); + __atomic64_or(mask, (long *)addr); } static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) @@ -94,7 +94,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __atomic64_and(mask, addr); + __atomic64_and(mask, (long *)addr); } static inline void 
change_bit(unsigned long nr, volatile unsigned long *ptr) @@ -115,7 +115,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __atomic64_xor(mask, addr); + __atomic64_xor(mask, (long *)addr); } static inline int @@ -125,7 +125,7 @@ test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __atomic64_or_barrier(mask, addr); + old = __atomic64_or_barrier(mask, (long *)addr); return (old & mask) != 0; } @@ -136,7 +136,7 @@ test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = __atomic64_and_barrier(mask, addr); + old = __atomic64_and_barrier(mask, (long *)addr); return (old & ~mask) != 0; } @@ -147,7 +147,7 @@ test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __atomic64_xor_barrier(mask, addr); + old = __atomic64_xor_barrier(mask, (long *)addr); return (old & mask) != 0; } diff --git a/arch/s390/include/asm/boot_data.h b/arch/s390/include/asm/boot_data.h index 2d999ccb977a..f7eed27b3220 100644 --- a/arch/s390/include/asm/boot_data.h +++ b/arch/s390/include/asm/boot_data.h @@ -5,7 +5,14 @@ #include <asm/ipl.h> extern char early_command_line[COMMAND_LINE_SIZE]; -extern struct ipl_parameter_block early_ipl_block; -extern int early_ipl_block_valid; +extern struct ipl_parameter_block ipl_block; +extern int ipl_block_valid; +extern int ipl_secure_flag; + +extern unsigned long ipl_cert_list_addr; +extern unsigned long ipl_cert_list_size; + +extern unsigned long early_ipl_comp_list_addr; +extern unsigned long early_ipl_comp_list_size; #endif /* _ASM_S390_BOOT_DATA_H */ diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index 429f43a8a8e8..713fc9735ffb 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -15,7 +15,7 @@ ".section .rodata.str,\"aMS\",@progbits,1\n" \ "2: .asciz \""__FILE__"\"\n" \ ".previous\n" \ - ".section __bug_table,\"aw\"\n" \ + ".section __bug_table,\"awM\",@progbits,%2\n" \ "3: .long 1b-3b,2b-3b\n" \ " .short %0,%1\n" \ " .org 3b+%2\n" \ @@ -27,17 +27,17 @@ #else /* CONFIG_DEBUG_BUGVERBOSE */ -#define __EMIT_BUG(x) do { \ - asm volatile( \ - "0: j 0b+2\n" \ - "1:\n" \ - ".section __bug_table,\"aw\"\n" \ - "2: .long 1b-2b\n" \ - " .short %0\n" \ - " .org 2b+%1\n" \ - ".previous\n" \ - : : "i" (x), \ - "i" (sizeof(struct bug_entry))); \ +#define __EMIT_BUG(x) do { \ + asm volatile( \ + "0: j 0b+2\n" \ + "1:\n" \ + ".section __bug_table,\"awM\",@progbits,%1\n" \ + "2: .long 1b-2b\n" \ + " .short %0\n" \ + " .org 2b+%1\n" \ + ".previous\n" \ + : : "i" (x), \ + "i" (sizeof(struct bug_entry))); \ } while (0) #endif /* CONFIG_DEBUG_BUGVERBOSE */ diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h index 19562be22b7e..0036eab14391 100644 --- a/arch/s390/include/asm/diag.h +++ b/arch/s390/include/asm/diag.h @@ -308,4 +308,17 @@ union diag318_info { int diag204(unsigned long subcode, unsigned long size, void *addr); int diag224(void *ptr); int diag26c(void *req, void *resp, enum diag26c_sc subcode); + +struct hypfs_diag0c_entry; + +struct diag_ops { + int (*diag210)(struct diag210 *addr); + int (*diag26c)(void *req, void *resp, enum diag26c_sc subcode); + int (*diag14)(unsigned long rx, unsigned long ry1, unsigned long subcode); + void (*diag0c)(struct hypfs_diag0c_entry *entry); + void 
(*diag308_reset)(void); +}; + +extern struct diag_ops diag_dma_ops; +extern struct diag210 *__diag210_tmp_dma; #endif /* _ASM_S390_DIAG_H */ diff --git a/arch/s390/include/asm/ebcdic.h b/arch/s390/include/asm/ebcdic.h index 29441beb92e6..efb50fc6866c 100644 --- a/arch/s390/include/asm/ebcdic.h +++ b/arch/s390/include/asm/ebcdic.h @@ -20,7 +20,7 @@ extern __u8 _ebc_tolower[256]; /* EBCDIC -> lowercase */ extern __u8 _ebc_toupper[256]; /* EBCDIC -> uppercase */ static inline void -codepage_convert(const __u8 *codepage, volatile __u8 * addr, unsigned long nr) +codepage_convert(const __u8 *codepage, volatile char *addr, unsigned long nr) { if (nr-- <= 0) return; diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index f74639a05f0f..5775fc22f410 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -107,6 +107,10 @@ #define HWCAP_S390_VXRS_BCD 4096 #define HWCAP_S390_VXRS_EXT 8192 #define HWCAP_S390_GS 16384 +#define HWCAP_S390_VXRS_EXT2 32768 +#define HWCAP_S390_VXRS_PDE 65536 +#define HWCAP_S390_SORT 131072 +#define HWCAP_S390_DFLT 262144 /* Internal bits, not exposed via elf */ #define HWCAP_INT_SIE 1UL diff --git a/arch/s390/include/asm/extable.h b/arch/s390/include/asm/extable.h index 80a4e5a9cb46..ae27f756b409 100644 --- a/arch/s390/include/asm/extable.h +++ b/arch/s390/include/asm/extable.h @@ -19,6 +19,11 @@ struct exception_table_entry int insn, fixup; }; +extern struct exception_table_entry *__start_dma_ex_table; +extern struct exception_table_entry *__stop_dma_ex_table; + +const struct exception_table_entry *s390_search_extables(unsigned long addr); + static inline unsigned long extable_fixup(const struct exception_table_entry *x) { return (unsigned long)&x->fixup + x->fixup; diff --git a/arch/s390/include/asm/ftrace.h b/arch/s390/include/asm/ftrace.h index 5a3c95b11952..68d362f8d6c1 100644 --- a/arch/s390/include/asm/ftrace.h +++ b/arch/s390/include/asm/ftrace.h @@ -11,9 +11,16 @@ #define MCOUNT_RETURN_FIXUP 18 #endif +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifndef __ASSEMBLY__ +#ifdef CONFIG_CC_IS_CLANG +/* https://bugs.llvm.org/show_bug.cgi?id=41424 */ +#define ftrace_return_address(n) 0UL +#else #define ftrace_return_address(n) __builtin_return_address(n) +#endif void _mcount(void); void ftrace_caller(void); diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index f34d729347e4..ca421614722f 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -30,14 +30,8 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr); #define ioremap_wc ioremap_nocache #define ioremap_wt ioremap_nocache -static inline void __iomem *ioremap(unsigned long offset, unsigned long size) -{ - return (void __iomem *) offset; -} - -static inline void iounmap(volatile void __iomem *addr) -{ -} +void __iomem *ioremap(unsigned long offset, unsigned long size); +void iounmap(volatile void __iomem *addr); static inline void __iomem *ioport_map(unsigned long port, unsigned int nr) { @@ -57,14 +51,17 @@ static inline void ioport_unmap(void __iomem *p) * the corresponding device and create the mapping cookie. 
*/ #define pci_iomap pci_iomap +#define pci_iomap_range pci_iomap_range #define pci_iounmap pci_iounmap -#define pci_iomap_wc pci_iomap -#define pci_iomap_wc_range pci_iomap_range +#define pci_iomap_wc pci_iomap_wc +#define pci_iomap_wc_range pci_iomap_wc_range #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) #define memset_io(dst, val, count) zpci_memset_io(dst, val, count) +#define mmiowb() zpci_barrier() + #define __raw_readb zpci_read_u8 #define __raw_readw zpci_read_u16 #define __raw_readl zpci_read_u32 diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index a8389e2d2f03..084e71b7272a 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -12,74 +12,36 @@ #include <asm/types.h> #include <asm/cio.h> #include <asm/setup.h> +#include <uapi/asm/ipl.h> -#define NSS_NAME_SIZE 8 - -#define IPL_PARM_BLK_FCP_LEN (sizeof(struct ipl_list_hdr) + \ - sizeof(struct ipl_block_fcp)) - -#define IPL_PARM_BLK0_FCP_LEN (sizeof(struct ipl_block_fcp) + 16) +struct ipl_parameter_block { + struct ipl_pl_hdr hdr; + union { + struct ipl_pb_hdr pb0_hdr; + struct ipl_pb0_common common; + struct ipl_pb0_fcp fcp; + struct ipl_pb0_ccw ccw; + char raw[PAGE_SIZE - sizeof(struct ipl_pl_hdr)]; + }; +} __packed __aligned(PAGE_SIZE); -#define IPL_PARM_BLK_CCW_LEN (sizeof(struct ipl_list_hdr) + \ - sizeof(struct ipl_block_ccw)) +#define NSS_NAME_SIZE 8 -#define IPL_PARM_BLK0_CCW_LEN (sizeof(struct ipl_block_ccw) + 16) +#define IPL_BP_FCP_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_fcp)) +#define IPL_BP0_FCP_LEN (sizeof(struct ipl_pb0_fcp)) +#define IPL_BP_CCW_LEN (sizeof(struct ipl_pl_hdr) + \ + sizeof(struct ipl_pb0_ccw)) +#define IPL_BP0_CCW_LEN (sizeof(struct ipl_pb0_ccw)) #define IPL_MAX_SUPPORTED_VERSION (0) -struct ipl_list_hdr { - u32 len; - u8 reserved1[3]; - u8 version; - u32 blk0_len; - u8 pbt; - u8 flags; - u16 reserved2; - u8 loadparm[8]; -} __attribute__((packed)); - -struct ipl_block_fcp { - u8 reserved1[305-1]; - u8 opt; - u8 reserved2[3]; - u16 reserved3; - u16 devno; - u8 reserved4[4]; - u64 wwpn; - u64 lun; - u32 bootprog; - u8 reserved5[12]; - u64 br_lba; - u32 scp_data_len; - u8 reserved6[260]; - u8 scp_data[]; -} __attribute__((packed)); - -#define DIAG308_VMPARM_SIZE 64 -#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - (sizeof(struct ipl_list_hdr) + \ - offsetof(struct ipl_block_fcp, scp_data))) - -struct ipl_block_ccw { - u8 reserved1[84]; - u16 reserved2 : 13; - u8 ssid : 3; - u16 devno; - u8 vm_flags; - u8 reserved3[3]; - u32 vm_parm_len; - u8 nss_name[8]; - u8 vm_parm[DIAG308_VMPARM_SIZE]; - u8 reserved4[8]; -} __attribute__((packed)); +#define IPL_RB_CERT_UNKNOWN ((unsigned short)-1) -struct ipl_parameter_block { - struct ipl_list_hdr hdr; - union { - struct ipl_block_fcp fcp; - struct ipl_block_ccw ccw; - char raw[PAGE_SIZE - sizeof(struct ipl_list_hdr)]; - } ipl_info; -} __packed __aligned(PAGE_SIZE); +#define DIAG308_VMPARM_SIZE (64) +#define DIAG308_SCPDATA_OFFSET offsetof(struct ipl_parameter_block, \ + fcp.scp_data) +#define DIAG308_SCPDATA_SIZE (PAGE_SIZE - DIAG308_SCPDATA_OFFSET) struct save_area; struct save_area * __init save_area_alloc(bool is_boot_cpu); @@ -88,7 +50,6 @@ void __init save_area_add_regs(struct save_area *, void *regs); void __init save_area_add_vxrs(struct save_area *, __vector128 *vxrs); extern void s390_reset_system(void); -extern void ipl_store_parameters(void); extern size_t ipl_block_get_ascii_vmparm(char *dest, 
size_t size, const struct ipl_parameter_block *ipb); @@ -122,6 +83,33 @@ extern struct ipl_info ipl_info; extern void setup_ipl(void); extern void set_os_info_reipl_block(void); +struct ipl_report { + struct ipl_parameter_block *ipib; + struct list_head components; + struct list_head certificates; + size_t size; +}; + +struct ipl_report_component { + struct list_head list; + struct ipl_rb_component_entry entry; +}; + +struct ipl_report_certificate { + struct list_head list; + struct ipl_rb_certificate_entry entry; + void *key; +}; + +struct kexec_buf; +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib); +void *ipl_report_finish(struct ipl_report *report); +int ipl_report_free(struct ipl_report *report); +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert); +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len); + /* * DIAG 308 support */ @@ -133,32 +121,12 @@ enum diag308_subcode { DIAG308_STORE = 6, }; -enum diag308_ipl_type { - DIAG308_IPL_TYPE_FCP = 0, - DIAG308_IPL_TYPE_CCW = 2, -}; - -enum diag308_opt { - DIAG308_IPL_OPT_IPL = 0x10, - DIAG308_IPL_OPT_DUMP = 0x20, -}; - -enum diag308_flags { - DIAG308_FLAGS_LP_VALID = 0x80, -}; - -enum diag308_vm_flags { - DIAG308_VM_FLAGS_NSS_VALID = 0x80, - DIAG308_VM_FLAGS_VP_VALID = 0x40, -}; - enum diag308_rc { DIAG308_RC_OK = 0x0001, DIAG308_RC_NOCONFIG = 0x0102, }; extern int diag308(unsigned long subcode, void *addr); -extern void diag308_reset(void); extern void store_status(void (*fn)(void *), void *data); extern void lgr_info_log(void); diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h index afaf5e3c57fd..9f75d67b8c20 100644 --- a/arch/s390/include/asm/irq.h +++ b/arch/s390/include/asm/irq.h @@ -47,7 +47,6 @@ enum interruption_class { IRQEXT_CMC, IRQEXT_FTP, IRQIO_CIO, - IRQIO_QAI, IRQIO_DAS, IRQIO_C15, IRQIO_C70, @@ -55,12 +54,14 @@ enum interruption_class { IRQIO_VMR, IRQIO_LCS, IRQIO_CTC, - IRQIO_APB, IRQIO_ADM, IRQIO_CSC, - IRQIO_PCI, - IRQIO_MSI, IRQIO_VIR, + IRQIO_QAI, + IRQIO_APB, + IRQIO_PCF, + IRQIO_PCD, + IRQIO_MSI, IRQIO_VAI, IRQIO_GAL, NMI_NMI, diff --git a/arch/s390/include/asm/kexec.h b/arch/s390/include/asm/kexec.h index 825dd0f7f221..ea398a05f643 100644 --- a/arch/s390/include/asm/kexec.h +++ b/arch/s390/include/asm/kexec.h @@ -11,6 +11,7 @@ #include <asm/processor.h> #include <asm/page.h> +#include <asm/setup.h> /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. * I.e. Maximum page that is mapped directly into kernel memory, @@ -42,6 +43,9 @@ /* The native architecture */ #define KEXEC_ARCH KEXEC_ARCH_S390 +/* Allow kexec_file to load a segment to 0 */ +#define KEXEC_BUF_MEM_UNKNOWN -1 + /* Provide a dummy definition to avoid build failures. */ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { } @@ -51,20 +55,24 @@ struct s390_load_data { /* Pointer to the kernel buffer. Used to register cmdline etc.. */ void *kernel_buf; + /* Load address of the kernel_buf. */ + unsigned long kernel_mem; + + /* Parmarea in the kernel buffer. */ + struct parmarea *parm; + /* Total size of loaded segments in memory. Used as an offset. */ size_t memsz; - /* Load address of initrd. Used to register INITRD_START in kernel. 
*/ - unsigned long initrd_load_addr; + struct ipl_report *report; }; -int kexec_file_add_purgatory(struct kimage *image, - struct s390_load_data *data); -int kexec_file_add_initrd(struct kimage *image, - struct s390_load_data *data, - char *initrd, unsigned long initrd_len); -int *kexec_file_update_kernel(struct kimage *iamge, - struct s390_load_data *data); +int s390_verify_sig(const char *kernel, unsigned long kernel_len); +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)); +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr); extern const struct kexec_file_ops s390_kexec_image_ops; extern const struct kexec_file_ops s390_kexec_elf_ops; diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index 1b95da3fdd64..7f22262b0e46 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -28,5 +28,12 @@ .long (_target) - . ; \ .previous +#define EX_TABLE_DMA(_fault, _target) \ + .section .dma.ex_table, "a" ; \ + .align 4 ; \ + .long (_fault) - . ; \ + .long (_target) - . ; \ + .previous + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 5b9f10b1e55d..237ee0c4169f 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -129,7 +129,7 @@ struct lowcore { /* SMP info area */ __u32 cpu_nr; /* 0x03a0 */ __u32 softirq_pending; /* 0x03a4 */ - __u32 preempt_count; /* 0x03a8 */ + __s32 preempt_count; /* 0x03a8 */ __u32 spinlock_lockval; /* 0x03ac */ __u32 spinlock_index; /* 0x03b0 */ __u32 fpu_flags; /* 0x03b4 */ diff --git a/arch/s390/include/asm/nospec-insn.h b/arch/s390/include/asm/nospec-insn.h index 123dac3717b3..0033dcd663b1 100644 --- a/arch/s390/include/asm/nospec-insn.h +++ b/arch/s390/include/asm/nospec-insn.h @@ -32,23 +32,23 @@ _LC_BR_R1 = __LC_BR_R1 .endm .macro __THUNK_PROLOG_BR r1,r2 - __THUNK_PROLOG_NAME __s390x_indirect_jump_r\r2\()use_r\r1 + __THUNK_PROLOG_NAME __s390_indirect_jump_r\r2\()use_r\r1 .endm .macro __THUNK_PROLOG_BC d0,r1,r2 - __THUNK_PROLOG_NAME __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + __THUNK_PROLOG_NAME __s390_indirect_branch_\d0\()_\r2\()use_\r1 .endm .macro __THUNK_BR r1,r2 - jg __s390x_indirect_jump_r\r2\()use_r\r1 + jg __s390_indirect_jump_r\r2\()use_r\r1 .endm .macro __THUNK_BC d0,r1,r2 - jg __s390x_indirect_branch_\d0\()_\r2\()use_\r1 + jg __s390_indirect_branch_\d0\()_\r2\()use_\r1 .endm .macro __THUNK_BRASL r1,r2,r3 - brasl \r1,__s390x_indirect_jump_r\r3\()use_r\r2 + brasl \r1,__s390_indirect_jump_r\r3\()use_r\r2 .endm .macro __DECODE_RR expand,reg,ruse diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 4e0efebc56a9..305befd55326 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -26,6 +26,9 @@ int pci_proc_domain(struct pci_bus *); #define ZPCI_BUS_NR 0 /* default bus number */ #define ZPCI_DEVFN 0 /* default device number */ +#define ZPCI_NR_DMA_SPACES 1 +#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS + /* PCI Function Controls */ #define ZPCI_FC_FN_ENABLED 0x80 #define ZPCI_FC_ERROR 0x40 @@ -83,6 +86,8 @@ enum zpci_state { struct zpci_bar_struct { struct resource *res; /* bus resource */ + void __iomem *mio_wb; + void __iomem *mio_wt; u32 val; /* bar start & 3 flag bits */ u16 map_idx; /* index into bar mapping array */ u8 size; /* order 2 exponent */ @@ -112,6 +117,8 @@ struct zpci_dev { /* IRQ stuff */ u64 msi_addr; /* MSI address */ 
unsigned int max_msi; /* maximum number of MSI's */ + unsigned int msi_first_bit; + unsigned int msi_nr_irqs; struct airq_iv *aibv; /* adapter interrupt bit vector */ unsigned long aisb; /* number of the summary bit */ @@ -130,6 +137,7 @@ struct zpci_dev { struct iommu_device iommu_dev; /* IOMMU core handle */ char res_name[16]; + bool mio_capable; struct zpci_bar_struct bars[PCI_BAR_COUNT]; u64 start_dma; /* Start of available DMA addresses */ @@ -158,6 +166,7 @@ static inline bool zdev_enabled(struct zpci_dev *zdev) } extern const struct attribute_group *zpci_attr_groups[]; +extern unsigned int s390_pci_force_floating __initdata; /* ----------------------------------------------------------------------------- Prototypes @@ -219,6 +228,9 @@ struct zpci_dev *get_zdev_by_fid(u32); int zpci_dma_init(void); void zpci_dma_exit(void); +int __init zpci_irq_init(void); +void __init zpci_irq_exit(void); + /* FMB */ int zpci_fmb_enable_device(struct zpci_dev *); int zpci_fmb_disable_device(struct zpci_dev *); diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index b3b31b31f0d3..3ec52a05d500 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -43,6 +43,8 @@ struct clp_fh_list_entry { #define CLP_SET_ENABLE_PCI_FN 0 /* Yes, 0 enables it */ #define CLP_SET_DISABLE_PCI_FN 1 /* Yes, 1 disables it */ +#define CLP_SET_ENABLE_MIO 2 +#define CLP_SET_DISABLE_MIO 3 #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 @@ -80,7 +82,8 @@ struct clp_req_query_pci { struct clp_rsp_query_pci { struct clp_rsp_hdr hdr; u16 vfn; /* virtual fn number */ - u16 : 7; + u16 : 6; + u16 mio_addr_avail : 1; u16 util_str_avail : 1; /* utility string available? */ u16 pfgid : 8; /* pci function group id */ u32 fid; /* pci function id */ @@ -96,6 +99,15 @@ struct clp_rsp_query_pci { u32 reserved[11]; u32 uid; /* user defined id */ u8 util_str[CLP_UTIL_STR_LEN]; /* utility string */ + u32 reserved2[16]; + u32 mio_valid : 6; + u32 : 26; + u32 : 32; + struct { + u64 wb; + u64 wt; + } addr[PCI_BAR_COUNT]; + u32 reserved3[6]; } __packed; /* Query PCI function group request */ @@ -118,7 +130,11 @@ struct clp_rsp_query_pci_grp { u8 refresh : 1; /* TLB refresh mode */ u16 reserved2; u16 mui; - u64 reserved3; + u16 : 16; + u16 maxfaal; + u16 : 4; + u16 dnoi : 12; + u16 maxcpu; u64 dasm; /* dma address space mask */ u64 msia; /* MSI address */ u64 reserved4; diff --git a/arch/s390/include/asm/pci_insn.h b/arch/s390/include/asm/pci_insn.h index ba22a6ea51a1..ff81ed19c506 100644 --- a/arch/s390/include/asm/pci_insn.h +++ b/arch/s390/include/asm/pci_insn.h @@ -2,6 +2,8 @@ #ifndef _ASM_S390_PCI_INSN_H #define _ASM_S390_PCI_INSN_H +#include <linux/jump_label.h> + /* Load/Store status codes */ #define ZPCI_PCI_ST_FUNC_NOT_ENABLED 4 #define ZPCI_PCI_ST_FUNC_IN_ERR 8 @@ -38,6 +40,8 @@ #define ZPCI_MOD_FC_RESET_ERROR 7 #define ZPCI_MOD_FC_RESET_BLOCK 9 #define ZPCI_MOD_FC_SET_MEASURE 10 +#define ZPCI_MOD_FC_REG_INT_D 16 +#define ZPCI_MOD_FC_DEREG_INT_D 17 /* FIB function controls */ #define ZPCI_FIB_FC_ENABLED 0x80 @@ -51,16 +55,7 @@ #define ZPCI_FIB_FC_LS_BLOCKED 0x20 #define ZPCI_FIB_FC_DMAAS_REG 0x10 -/* Function Information Block */ -struct zpci_fib { - u32 fmt : 8; /* format */ - u32 : 24; - u32 : 32; - u8 fc; /* function controls */ - u64 : 56; - u64 pba; /* PCI base address */ - u64 pal; /* PCI address limit */ - u64 iota; /* I/O Translation Anchor */ +struct zpci_fib_fmt0 { u32 : 1; u32 isc : 3; /* Interrupt subclass */ u32 noi : 12; /* Number of interrupts */ @@ -72,16 
+67,90 @@ struct zpci_fib { u32 : 32; u64 aibv; /* Adapter int bit vector address */ u64 aisb; /* Adapter int summary bit address */ +}; + +struct zpci_fib_fmt1 { + u32 : 4; + u32 noi : 12; + u32 : 16; + u32 dibvo : 16; + u32 : 16; + u64 : 64; + u64 : 64; +}; + +/* Function Information Block */ +struct zpci_fib { + u32 fmt : 8; /* format */ + u32 : 24; + u32 : 32; + u8 fc; /* function controls */ + u64 : 56; + u64 pba; /* PCI base address */ + u64 pal; /* PCI address limit */ + u64 iota; /* I/O Translation Anchor */ + union { + struct zpci_fib_fmt0 fmt0; + struct zpci_fib_fmt1 fmt1; + }; u64 fmb_addr; /* Function measurement block address and key */ u32 : 32; u32 gd; } __packed __aligned(8); +/* directed interruption information block */ +struct zpci_diib { + u32 : 1; + u32 isc : 3; + u32 : 28; + u16 : 16; + u16 nr_cpus; + u64 disb_addr; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +/* cpu directed interruption information block */ +struct zpci_cdiib { + u64 : 64; + u64 dibv_addr; + u64 : 64; + u64 : 64; + u64 : 64; +} __packed __aligned(8); + +union zpci_sic_iib { + struct zpci_diib diib; + struct zpci_cdiib cdiib; +}; + +DECLARE_STATIC_KEY_FALSE(have_mio); + u8 zpci_mod_fc(u64 req, struct zpci_fib *fib, u8 *status); int zpci_refresh_trans(u64 fn, u64 addr, u64 range); -int zpci_load(u64 *data, u64 req, u64 offset); -int zpci_store(u64 data, u64 req, u64 offset); -int zpci_store_block(const u64 *data, u64 req, u64 offset); -int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc); +int __zpci_load(u64 *data, u64 req, u64 offset); +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len); +int __zpci_store(u64 data, u64 req, u64 offset); +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len); +int __zpci_store_block(const u64 *data, u64 req, u64 offset); +void zpci_barrier(void); +int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib); + +static inline int zpci_set_irq_ctrl(u16 ctl, u8 isc) +{ + union zpci_sic_iib iib = {{0}}; + + return __zpci_set_irq_ctrl(ctl, isc, &iib); +} + +#ifdef CONFIG_PCI +static inline void enable_mio_ctl(void) +{ + if (static_branch_likely(&have_mio)) + __ctl_set_bit(2, 5); +} +#else /* CONFIG_PCI */ +static inline void enable_mio_ctl(void) {} +#endif /* CONFIG_PCI */ #endif diff --git a/arch/s390/include/asm/pci_io.h b/arch/s390/include/asm/pci_io.h index cbb9cb9c6547..cd060b5dd8fd 100644 --- a/arch/s390/include/asm/pci_io.h +++ b/arch/s390/include/asm/pci_io.h @@ -37,12 +37,10 @@ extern struct zpci_iomap_entry *zpci_iomap_start; #define zpci_read(LENGTH, RETTYPE) \ static inline RETTYPE zpci_read_##RETTYPE(const volatile void __iomem *addr) \ { \ - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; \ - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, LENGTH); \ u64 data; \ int rc; \ \ - rc = zpci_load(&data, req, ZPCI_OFFSET(addr)); \ + rc = zpci_load(&data, addr, LENGTH); \ if (rc) \ data = -1ULL; \ return (RETTYPE) data; \ @@ -52,11 +50,9 @@ static inline RETTYPE zpci_read_##RETTYPE(const volatile void __iomem *addr) \ static inline void zpci_write_##VALTYPE(VALTYPE val, \ const volatile void __iomem *addr) \ { \ - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; \ - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, LENGTH); \ u64 data = (VALTYPE) val; \ \ - zpci_store(data, req, ZPCI_OFFSET(addr)); \ + zpci_store(addr, data, LENGTH); \ } zpci_read(8, u64) @@ -68,36 +64,38 @@ zpci_write(4, u32) zpci_write(2, u16) zpci_write(1, u8) -static inline int 
zpci_write_single(u64 req, const u64 *data, u64 offset, u8 len) +static inline int zpci_write_single(volatile void __iomem *dst, const void *src, + unsigned long len) { u64 val; switch (len) { case 1: - val = (u64) *((u8 *) data); + val = (u64) *((u8 *) src); break; case 2: - val = (u64) *((u16 *) data); + val = (u64) *((u16 *) src); break; case 4: - val = (u64) *((u32 *) data); + val = (u64) *((u32 *) src); break; case 8: - val = (u64) *((u64 *) data); + val = (u64) *((u64 *) src); break; default: val = 0; /* let FW report error */ break; } - return zpci_store(val, req, offset); + return zpci_store(dst, val, len); } -static inline int zpci_read_single(u64 req, u64 *dst, u64 offset, u8 len) +static inline int zpci_read_single(void *dst, const volatile void __iomem *src, + unsigned long len) { u64 data; int cc; - cc = zpci_load(&data, req, offset); + cc = zpci_load(&data, src, len); if (cc) goto out; @@ -119,10 +117,8 @@ out: return cc; } -static inline int zpci_write_block(u64 req, const u64 *data, u64 offset) -{ - return zpci_store_block(data, req, offset); -} +int zpci_write_block(volatile void __iomem *dst, const void *src, + unsigned long len); static inline u8 zpci_get_max_write_size(u64 src, u64 dst, int len, int max) { @@ -140,18 +136,15 @@ static inline int zpci_memcpy_fromio(void *dst, const volatile void __iomem *src, unsigned long n) { - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(src)]; - u64 req, offset = ZPCI_OFFSET(src); int size, rc = 0; while (n > 0) { size = zpci_get_max_write_size((u64 __force) src, (u64) dst, n, 8); - req = ZPCI_CREATE_REQ(entry->fh, entry->bar, size); - rc = zpci_read_single(req, dst, offset, size); + rc = zpci_read_single(dst, src, size); if (rc) break; - offset += size; + src += size; dst += size; n -= size; } @@ -161,8 +154,6 @@ static inline int zpci_memcpy_fromio(void *dst, static inline int zpci_memcpy_toio(volatile void __iomem *dst, const void *src, unsigned long n) { - struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; - u64 req, offset = ZPCI_OFFSET(dst); int size, rc = 0; if (!src) @@ -171,16 +162,14 @@ static inline int zpci_memcpy_toio(volatile void __iomem *dst, while (n > 0) { size = zpci_get_max_write_size((u64 __force) dst, (u64) src, n, 128); - req = ZPCI_CREATE_REQ(entry->fh, entry->bar, size); - if (size > 8) /* main path */ - rc = zpci_write_block(req, src, offset); + rc = zpci_write_block(dst, src, size); else - rc = zpci_write_single(req, src, offset, size); + rc = zpci_write_single(dst, src, size); if (rc) break; - offset += size; src += size; + dst += size; n -= size; } return rc; diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 76dc344edb8c..9f0195d5fa16 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -238,7 +238,7 @@ static inline int is_module_addr(void *addr) #define _REGION_ENTRY_NOEXEC 0x100 /* region no-execute bit */ #define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */ #define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */ -#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */ +#define _REGION_ENTRY_TYPE_MASK 0x0c /* region table type mask */ #define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */ #define _REGION_ENTRY_TYPE_R2 0x08 /* region second table type */ #define _REGION_ENTRY_TYPE_R3 0x04 /* region third table type */ @@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr) #define _SEGMENT_ENTRY_PROTECT 0x200 /* segment protection bit */ #define 
_SEGMENT_ENTRY_NOEXEC 0x100 /* segment no-execute bit */ #define _SEGMENT_ENTRY_INVALID 0x20 /* invalid segment table entry */ +#define _SEGMENT_ENTRY_TYPE_MASK 0x0c /* segment table type mask */ #define _SEGMENT_ENTRY (0) #define _SEGMENT_ENTRY_EMPTY (_SEGMENT_ENTRY_INVALID) @@ -614,15 +615,9 @@ static inline int pgd_none(pgd_t pgd) static inline int pgd_bad(pgd_t pgd) { - /* - * With dynamic page table levels the pgd can be a region table - * entry or a segment table entry. Check for the bit that are - * invalid for either table entry. - */ - unsigned long mask = - ~_SEGMENT_ENTRY_ORIGIN & ~_REGION_ENTRY_INVALID & - ~_REGION_ENTRY_TYPE_MASK & ~_REGION_ENTRY_LENGTH; - return (pgd_val(pgd) & mask) != 0; + if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R1) + return 0; + return (pgd_val(pgd) & ~_REGION_ENTRY_BITS) != 0; } static inline unsigned long pgd_pfn(pgd_t pgd) @@ -703,6 +698,8 @@ static inline int pmd_large(pmd_t pmd) static inline int pmd_bad(pmd_t pmd) { + if ((pmd_val(pmd) & _SEGMENT_ENTRY_TYPE_MASK) > 0) + return 1; if (pmd_large(pmd)) return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS_LARGE) != 0; return (pmd_val(pmd) & ~_SEGMENT_ENTRY_BITS) != 0; @@ -710,8 +707,12 @@ static inline int pmd_bad(pmd_t pmd) static inline int pud_bad(pud_t pud) { - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R3) - return pmd_bad(__pmd(pud_val(pud))); + unsigned long type = pud_val(pud) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R3) + return 1; + if (type < _REGION_ENTRY_TYPE_R3) + return 0; if (pud_large(pud)) return (pud_val(pud) & ~_REGION_ENTRY_BITS_LARGE) != 0; return (pud_val(pud) & ~_REGION_ENTRY_BITS) != 0; @@ -719,8 +720,12 @@ static inline int pud_bad(pud_t pud) static inline int p4d_bad(p4d_t p4d) { - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) < _REGION_ENTRY_TYPE_R2) - return pud_bad(__pud(p4d_val(p4d))); + unsigned long type = p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK; + + if (type > _REGION_ENTRY_TYPE_R2) + return 1; + if (type < _REGION_ENTRY_TYPE_R2) + return 0; return (p4d_val(p4d) & ~_REGION_ENTRY_BITS) != 0; } @@ -1204,41 +1209,78 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index(address)) -#define pgd_offset_k(address) pgd_offset(&init_mm, address) -#define pgd_offset_raw(pgd, addr) ((pgd) + pgd_index(addr)) - #define pmd_deref(pmd) (pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN) #define pud_deref(pud) (pud_val(pud) & _REGION_ENTRY_ORIGIN) #define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) #define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) -static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) +/* + * The pgd_offset function *always* adds the index for the top-level + * region/segment table. This is done to get a sequence like the + * following to work: + * pgdp = pgd_offset(current->mm, addr); + * pgd = READ_ONCE(*pgdp); + * p4dp = p4d_offset(&pgd, addr); + * ... + * The subsequent p4d_offset, pud_offset and pmd_offset functions + * only add an index if they dereferenced the pointer. 
+ */ +static inline pgd_t *pgd_offset_raw(pgd_t *pgd, unsigned long address) { - p4d_t *p4d = (p4d_t *) pgd; + unsigned long rste; + unsigned int shift; - if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - p4d = (p4d_t *) pgd_deref(*pgd); - return p4d + p4d_index(address); + /* Get the first entry of the top level table */ + rste = pgd_val(*pgd); + /* Pick up the shift from the table type of the first entry */ + shift = ((rste & _REGION_ENTRY_TYPE_MASK) >> 2) * 11 + 20; + return pgd + ((address >> shift) & (PTRS_PER_PGD - 1)); } -static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +#define pgd_offset(mm, address) pgd_offset_raw(READ_ONCE((mm)->pgd), address) +#define pgd_offset_k(address) pgd_offset(&init_mm, address) + +static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) { - pud_t *pud = (pud_t *) p4d; + if ((pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R1) + return (p4d_t *) pgd_deref(*pgd) + p4d_index(address); + return (p4d_t *) pgd; +} - if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pud = (pud_t *) p4d_deref(*p4d); - return pud + pud_index(address); +static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) +{ + if ((p4d_val(*p4d) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R2) + return (pud_t *) p4d_deref(*p4d) + pud_index(address); + return (pud_t *) p4d; } static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) { - pmd_t *pmd = (pmd_t *) pud; + if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) >= _REGION_ENTRY_TYPE_R3) + return (pmd_t *) pud_deref(*pud) + pmd_index(address); + return (pmd_t *) pud; +} - if ((pud_val(*pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmd = (pmd_t *) pud_deref(*pud); - return pmd + pmd_index(address); +static inline pte_t *pte_offset(pmd_t *pmd, unsigned long address) +{ + return (pte_t *) pmd_deref(*pmd) + pte_index(address); +} + +#define pte_offset_kernel(pmd, address) pte_offset(pmd, address) +#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) +#define pte_unmap(pte) do { } while (0) + +static inline bool gup_fast_permitted(unsigned long start, int nr_pages) +{ + unsigned long len, end; + + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + if (end < start) + return false; + return end <= current->mm->context.asce_limit; } +#define gup_fast_permitted gup_fast_permitted #define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot)) #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) @@ -1249,12 +1291,6 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address) #define p4d_page(p4d) pfn_to_page(p4d_pfn(p4d)) #define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) -/* Find an entry in the lowest level page table.. */ -#define pte_offset(pmd, addr) ((pte_t *) pmd_deref(*(pmd)) + pte_index(addr)) -#define pte_offset_kernel(pmd, address) pte_offset(pmd,address) -#define pte_offset_map(pmd, address) pte_offset_kernel(pmd, address) -#define pte_unmap(pte) do { } while (0) - static inline pmd_t pmd_wrprotect(pmd_t pmd) { pmd_val(pmd) &= ~_SEGMENT_ENTRY_WRITE;
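
Concretely, the lookup sequence that comment describes composes into a full walk like the following; a simplified sketch, where example_lookup_pte is an illustrative name, locking is left out, and the READ_ONCE discipline shown in the comment is dropped for brevity:

/* Sketch: resolve the pte mapping `addr` in `mm` with the new helpers. */
static pte_t *example_lookup_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);	/* always indexes the top table */
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);	/* indexes only for region-1 tables */
	if (p4d_none(*p4d) || p4d_bad(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);	/* indexes only for region-2 tables */
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);	/* indexes only for region-3 tables */
	if (pmd_none(*pmd) || pmd_bad(*pmd) || pmd_large(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, addr);
}

diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 81038ab357ce..b0fcbc37b637 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -156,25 +156,6 @@ struct thread_struct typedef struct thread_struct thread_struct; -/* - * Stack layout of a C stack frame. 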
- */ -#ifndef __PACK_STACK -struct stack_frame { - unsigned long back_chain; - unsigned long empty1[5]; - unsigned long gprs[10]; - unsigned int empty2[8]; -}; -#else -struct stack_frame { - unsigned long empty1[5]; - unsigned int empty2[8]; - unsigned long gprs[10]; - unsigned long back_chain; -}; -#endif - #define ARCH_MIN_TASKALIGN 8 #define INIT_THREAD { \ @@ -206,11 +187,7 @@ struct mm_struct; struct seq_file; struct pt_regs; -typedef int (*dump_trace_func_t)(void *data, unsigned long address, int reliable); -void dump_trace(dump_trace_func_t func, void *data, - struct task_struct *task, unsigned long sp); void show_registers(struct pt_regs *regs); - void show_cacheinfo(struct seq_file *m); /* Free all resources held by a thread. */ @@ -244,55 +221,6 @@ static __no_kasan_or_inline unsigned short stap(void) return cpu_address; } -#define CALL_ARGS_0() \ - register unsigned long r2 asm("2") -#define CALL_ARGS_1(arg1) \ - register unsigned long r2 asm("2") = (unsigned long)(arg1) -#define CALL_ARGS_2(arg1, arg2) \ - CALL_ARGS_1(arg1); \ - register unsigned long r3 asm("3") = (unsigned long)(arg2) -#define CALL_ARGS_3(arg1, arg2, arg3) \ - CALL_ARGS_2(arg1, arg2); \ - register unsigned long r4 asm("4") = (unsigned long)(arg3) -#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ - CALL_ARGS_3(arg1, arg2, arg3); \ - register unsigned long r4 asm("5") = (unsigned long)(arg4) -#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ - CALL_ARGS_4(arg1, arg2, arg3, arg4); \ - register unsigned long r4 asm("6") = (unsigned long)(arg5) - -#define CALL_FMT_0 -#define CALL_FMT_1 CALL_FMT_0, "0" (r2) -#define CALL_FMT_2 CALL_FMT_1, "d" (r3) -#define CALL_FMT_3 CALL_FMT_2, "d" (r4) -#define CALL_FMT_4 CALL_FMT_3, "d" (r5) -#define CALL_FMT_5 CALL_FMT_4, "d" (r6) - -#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" -#define CALL_CLOBBER_4 CALL_CLOBBER_5 -#define CALL_CLOBBER_3 CALL_CLOBBER_4, "5" -#define CALL_CLOBBER_2 CALL_CLOBBER_3, "4" -#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" -#define CALL_CLOBBER_0 CALL_CLOBBER_1 - -#define CALL_ON_STACK(fn, stack, nr, args...) \ -({ \ - CALL_ARGS_##nr(args); \ - unsigned long prev; \ - \ - asm volatile( \ - " la %[_prev],0(15)\n" \ - " la 15,0(%[_stack])\n" \ - " stg %[_prev],%[_bc](15)\n" \ - " brasl 14,%[_fn]\n" \ - " la 15,0(%[_prev])\n" \ - : "+&d" (r2), [_prev] "=&a" (prev) \ - : [_stack] "a" (stack), \ - [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_fn] "X" (fn) CALL_FMT_##nr : CALL_CLOBBER_##nr); \ - r2; \ -}) - /* * Give up the time slice of the virtual PU. 
*/ @@ -339,10 +267,10 @@ static __no_kasan_or_inline void __load_psw_mask(unsigned long mask) asm volatile( " larl %0,1f\n" - " stg %0,%O1+8(%R1)\n" - " lpswe %1\n" + " stg %0,%1\n" + " lpswe %2\n" "1:" - : "=&d" (addr), "=Q" (psw) : "Q" (psw) : "memory", "cc"); + : "=&d" (addr), "=Q" (psw.addr) : "Q" (psw) : "memory", "cc"); } /* @@ -387,12 +315,12 @@ void enabled_wait(void); /* * Function to drop a processor into disabled wait state */ -static inline void __noreturn disabled_wait(unsigned long code) +static inline void __noreturn disabled_wait(void) { psw_t psw; psw.mask = PSW_MASK_BASE | PSW_MASK_WAIT | PSW_MASK_BA | PSW_MASK_EA; - psw.addr = code; + psw.addr = _THIS_IP_; __load_psw(psw); while (1); } diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index ef4c9dec06a4..f577c5f6031a 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -79,6 +79,9 @@ struct sclp_info { unsigned char has_kss : 1; unsigned char has_gisaf : 1; unsigned char has_diag318 : 1; + unsigned char has_sipl : 1; + unsigned char has_sipl_g2 : 1; + unsigned char has_dirq : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/arch/s390/include/asm/sections.h b/arch/s390/include/asm/sections.h index 7afe4620685c..42de04ad9c07 100644 --- a/arch/s390/include/asm/sections.h +++ b/arch/s390/include/asm/sections.h @@ -2,8 +2,20 @@ #ifndef _S390_SECTIONS_H #define _S390_SECTIONS_H +#define arch_is_kernel_initmem_freed arch_is_kernel_initmem_freed + #include <asm-generic/sections.h> +extern bool initmem_freed; + +static inline int arch_is_kernel_initmem_freed(unsigned long addr) +{ + if (!initmem_freed) + return 0; + return addr >= (unsigned long)__init_begin && + addr < (unsigned long)__init_end; +} + /* * .boot.data section contains variables "shared" between the decompressor and * the decompressed kernel. The decompressor will store values in them, and @@ -16,4 +28,14 @@ */ #define __bootdata(var) __section(.boot.data.var) var +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. 
+ */ +#define __bootdata_preserved(var) __section(.boot.preserved.data.var) var + +extern unsigned long __sdma, __edma; +extern unsigned long __stext_dma, __etext_dma; + #endif diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index efda97804aa4..925889d360c1 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -12,7 +12,10 @@ #define EP_OFFSET 0x10008 #define EP_STRING "S390EP" #define PARMAREA 0x10400 -#define PARMAREA_END 0x11000 +#define EARLY_SCCB_OFFSET 0x11000 +#define HEAD_END 0x12000 + +#define EARLY_SCCB_SIZE PAGE_SIZE /* * Machine features detected in early.c @@ -65,6 +68,16 @@ #define OLDMEM_SIZE (*(unsigned long *) (OLDMEM_SIZE_OFFSET)) #define COMMAND_LINE ((char *) (COMMAND_LINE_OFFSET)) +struct parmarea { + unsigned long ipl_device; /* 0x10400 */ + unsigned long initrd_start; /* 0x10408 */ + unsigned long initrd_size; /* 0x10410 */ + unsigned long oldmem_base; /* 0x10418 */ + unsigned long oldmem_size; /* 0x10420 */ + char pad1[0x10480 - 0x10428]; /* 0x10428 - 0x10480 */ + char command_line[ARCH_COMMAND_LINE_SIZE]; /* 0x10480 */ +}; + extern int noexec_disabled; extern int memory_end_set; extern unsigned long memory_end; @@ -134,6 +147,12 @@ extern void (*_machine_restart)(char *command); extern void (*_machine_halt)(void); extern void (*_machine_power_off)(void); +extern unsigned long __kaslr_offset; +static inline unsigned long kaslr_offset(void) +{ + return __kaslr_offset; +} + #else /* __ASSEMBLY__ */ #define IPL_DEVICE (IPL_DEVICE_OFFSET) diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h new file mode 100644 index 000000000000..49634bfbecdd --- /dev/null +++ b/arch/s390/include/asm/stacktrace.h @@ -0,0 +1,114 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_STACKTRACE_H +#define _ASM_S390_STACKTRACE_H + +#include <linux/uaccess.h> +#include <linux/ptrace.h> +#include <asm/switch_to.h> + +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_NODAT, + STACK_TYPE_RESTART, +}; + +struct stack_info { + enum stack_type type; + unsigned long begin, end; +}; + +const char *stack_type_name(enum stack_type type); +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +static inline bool on_stack(struct stack_info *info, + unsigned long addr, size_t len) +{ + if (info->type == STACK_TYPE_UNKNOWN) + return false; + if (addr + len < addr) + return false; + return addr >= info->begin && addr + len < info->end; +} + +static inline unsigned long get_stack_pointer(struct task_struct *task, + struct pt_regs *regs) +{ + if (regs) + return (unsigned long) kernel_stack_pointer(regs); + if (task == current) + return current_stack_pointer(); + return (unsigned long) task->thread.ksp; +} + +/* + * Stack layout of a C stack frame. 
+ */ +#ifndef __PACK_STACK +struct stack_frame { + unsigned long back_chain; + unsigned long empty1[5]; + unsigned long gprs[10]; + unsigned int empty2[8]; +}; +#else +struct stack_frame { + unsigned long empty1[5]; + unsigned int empty2[8]; + unsigned long gprs[10]; + unsigned long back_chain; +}; +#endif + +#define CALL_ARGS_0() \ + register unsigned long r2 asm("2") +#define CALL_ARGS_1(arg1) \ + register unsigned long r2 asm("2") = (unsigned long)(arg1) +#define CALL_ARGS_2(arg1, arg2) \ + CALL_ARGS_1(arg1); \ + register unsigned long r3 asm("3") = (unsigned long)(arg2) +#define CALL_ARGS_3(arg1, arg2, arg3) \ + CALL_ARGS_2(arg1, arg2); \ + register unsigned long r4 asm("4") = (unsigned long)(arg3) +#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ + CALL_ARGS_3(arg1, arg2, arg3); \ + register unsigned long r5 asm("5") = (unsigned long)(arg4) +#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ + CALL_ARGS_4(arg1, arg2, arg3, arg4); \ + register unsigned long r6 asm("6") = (unsigned long)(arg5) + +#define CALL_FMT_0 "=&d" (r2) : +#define CALL_FMT_1 "+&d" (r2) : +#define CALL_FMT_2 CALL_FMT_1 "d" (r3), +#define CALL_FMT_3 CALL_FMT_2 "d" (r4), +#define CALL_FMT_4 CALL_FMT_3 "d" (r5), +#define CALL_FMT_5 CALL_FMT_4 "d" (r6), + +#define CALL_CLOBBER_5 "0", "1", "14", "cc", "memory" +#define CALL_CLOBBER_4 CALL_CLOBBER_5 +#define CALL_CLOBBER_3 CALL_CLOBBER_4, "5" +#define CALL_CLOBBER_2 CALL_CLOBBER_3, "4" +#define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" +#define CALL_CLOBBER_0 CALL_CLOBBER_1 + +#define CALL_ON_STACK(fn, stack, nr, args...) \ +({ \ + CALL_ARGS_##nr(args); \ + unsigned long prev; \ + \ + asm volatile( \ + " la %[_prev],0(15)\n" \ + " la 15,0(%[_stack])\n" \ + " stg %[_prev],%[_bc](15)\n" \ + " brasl 14,%[_fn]\n" \ + " la 15,0(%[_prev])\n" \ + : [_prev] "=&a" (prev), CALL_FMT_##nr \ + [_stack] "a" (stack), \ + [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ + [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ + r2; \ +}) + +#endif /* _ASM_S390_STACKTRACE_H */
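
CALL_ON_STACK() loads %r15 with the target stack, stores the previous stack pointer into the new frame's back_chain slot (so a later unwind can cross the stack switch), branches to fn and restores %r15 afterwards; the called function's return value comes back in r2. A hedged usage sketch, with do_work and example_on_nodat_stack as illustrative names and the assumption that the nodat stack is valid for the duration of the call:

/* Sketch: run a small helper on the nodat stack. */
static unsigned long do_work(unsigned long arg)
{
	return arg + 1;	/* placeholder payload */
}

static unsigned long example_on_nodat_stack(unsigned long arg)
{
	return CALL_ON_STACK(do_work, S390_lowcore.nodat_stack, 1, arg);
}

diff --git a/arch/s390/include/asm/syscall.h b/arch/s390/include/asm/syscall.h index 59c3e91f2cdb..ab3407aa4fd8 100644 --- a/arch/s390/include/asm/syscall.h +++ b/arch/s390/include/asm/syscall.h @@ -14,13 +14,8 @@ #include <linux/err.h> #include <asm/ptrace.h> -/* - * The syscall table always contains 32 bit pointers since we know that the - * address of the function to be called is (way) below 4GB. So the "int" - * type here is what we want [need] for both 32 bit and 64 bit systems. 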
- */ -extern const unsigned int sys_call_table[]; -extern const unsigned int sys_call_table_emu[]; +extern const unsigned long sys_call_table[]; +extern const unsigned long sys_call_table_emu[]; static inline long syscall_get_nr(struct task_struct *task, struct pt_regs *regs) diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 5596c5c625d2..3c3d6fe8e2f0 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -119,8 +119,8 @@ "Type aliasing is used to sanitize syscall arguments");\ asmlinkage long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ __attribute__((alias(__stringify(__se_sys##name)))); \ - ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ + ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ + long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ __S390_SYS_STUBx(x, name, __VA_ARGS__) \ asmlinkage long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index b31c779cf581..aa406c05a350 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -22,98 +22,39 @@ * Pages used for the page tables is a different story. FIXME: more */ -#include <linux/mm.h> -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <asm/processor.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> - -struct mmu_gather { - struct mm_struct *mm; - struct mmu_table_batch *batch; - unsigned int fullmm; - unsigned long start, end; -}; - -struct mmu_table_batch { - struct rcu_head rcu; - unsigned int nr; - void *tables[0]; -}; - -#define MAX_TABLE_BATCH \ - ((PAGE_SIZE - sizeof(struct mmu_table_batch)) / sizeof(void *)) - -extern void tlb_table_flush(struct mmu_gather *tlb); -extern void tlb_remove_table(struct mmu_gather *tlb, void *table); - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - tlb->batch = NULL; -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - __tlb_flush_mm_lazy(tlb->mm); -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - tlb_table_flush(tlb); -} - +void __tlb_remove_table(void *_table); +static inline void tlb_flush(struct mmu_gather *tlb); +static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, + struct page *page, int page_size); -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} +#define tlb_start_vma(tlb, vma) do { } while (0) +#define tlb_end_vma(tlb, vma) do { } while (0) -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->start = start; - tlb->end = end; - } +#define tlb_flush tlb_flush +#define pte_free_tlb pte_free_tlb +#define pmd_free_tlb pmd_free_tlb +#define p4d_free_tlb p4d_free_tlb +#define pud_free_tlb pud_free_tlb - tlb_flush_mmu(tlb); -} +#include <asm/pgalloc.h> +#include <asm/tlbflush.h> +#include <asm-generic/tlb.h> /* * Release the page cache reference for a pte removed by * tlb_ptep_clear_flush. In both flush modes the tlb for a page cache page * has already been freed, so just do free_page_and_swap_cache. 
*/ -static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); -} - static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - return __tlb_remove_page(tlb, page); + free_page_and_swap_cache(page); + return false; } -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) +static inline void tlb_flush(struct mmu_gather *tlb) { - return tlb_remove_page(tlb, page); + __tlb_flush_mm_lazy(tlb->mm); } /* @@ -121,8 +62,17 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, * page table from the tlb. */ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, - unsigned long address) + unsigned long address) { + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_ptes = 1; + /* + * page_table_free_rcu takes care of the allocation bit masks + * of the 2K table fragments in the 4K page table page, + * then calls tlb_remove_table. + */ page_table_free_rcu(tlb, (unsigned long *) pte, address); } @@ -139,6 +89,10 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, if (mm_pmd_folded(tlb->mm)) return; pgtable_pmd_page_dtor(virt_to_page(pmd)); + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_puds = 1; tlb_remove_table(tlb, pmd); } @@ -154,6 +108,10 @@ static inline void p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, { if (mm_p4d_folded(tlb->mm)) return; + __tlb_adjust_range(tlb, address, PAGE_SIZE); + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_p4ds = 1; tlb_remove_table(tlb, p4d); } @@ -169,21 +127,11 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, { if (mm_pud_folded(tlb->mm)) return; + tlb->mm->context.flush_mm = 1; + tlb->freed_tables = 1; + tlb->cleared_puds = 1; tlb_remove_table(tlb, pud); } -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) -#define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) -#define tlb_migrate_finish(mm) do { } while (0) -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} #endif /* _S390_TLB_H */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 007fcb9aeeb8..bd2fd9a7821d 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -55,8 +55,10 @@ raw_copy_from_user(void *to, const void __user *from, unsigned long n); unsigned long __must_check raw_copy_to_user(void __user *to, const void *from, unsigned long n); +#ifndef CONFIG_KASAN #define INLINE_COPY_FROM_USER #define INLINE_COPY_TO_USER +#endif #ifdef CONFIG_HAVE_MARCH_Z10_FEATURES diff --git a/arch/s390/include/asm/unwind.h b/arch/s390/include/asm/unwind.h new file mode 100644 index 000000000000..6eb2ef105d87 --- /dev/null +++ b/arch/s390/include/asm/unwind.h @@ -0,0 +1,101 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_UNWIND_H +#define 
_ASM_S390_UNWIND_H + +#include <linux/sched.h> +#include <linux/ftrace.h> +#include <asm/ptrace.h> +#include <asm/stacktrace.h> + +/* + * To use the stack unwinder it has to be initialized with unwind_start. + * There are four combinations for task and regs: + * 1) task==NULL, regs==NULL: the unwind starts for the task that is currently + * running, sp/ip picked up from the CPU registers + * 2) task==NULL, regs!=NULL: the unwind starts from the sp/ip found in + * the struct pt_regs of an interrupt frame for the current task + * 3) task!=NULL, regs==NULL: the unwind starts for an inactive task with + * the sp picked up from task->thread.ksp and the ip picked up from the + * return address stored by __switch_to + * 4) task!=NULL, regs!=NULL: the sp/ip are picked up from the interrupt + * frame 'regs' of an inactive task + * If 'first_frame' is not zero unwind_start skips unwind frames until it + * reaches the specified stack pointer. + * The end of the unwinding is indicated with unwind_done; this can be true + * right after unwind_start, e.g. with a first_frame!=0 that cannot be found. + * unwind_next_frame skips to the next frame. + * Once the unwind is completed, unwind_error() can be used to check if there + * has been a situation where the unwinder could not correctly understand + * the task's call chain. + */ + +struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; + struct task_struct *task; + struct pt_regs *regs; + unsigned long sp, ip; + int graph_idx; + bool reliable; + bool error; +}; + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long first_frame); +bool unwind_next_frame(struct unwind_state *state); +unsigned long unwind_get_return_address(struct unwind_state *state); + +static inline bool unwind_done(struct unwind_state *state) +{ + return state->stack_info.type == STACK_TYPE_UNKNOWN; +} + +static inline bool unwind_error(struct unwind_state *state) +{ + return state->error; +} + +static inline void unwind_start(struct unwind_state *state, + struct task_struct *task, + struct pt_regs *regs, + unsigned long sp) +{ + sp = sp ? : get_stack_pointer(task, regs); + __unwind_start(state, task, regs, sp); +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return unwind_done(state) ? NULL : state->regs; +} + +#define unwind_for_each_frame(state, task, regs, first_frame) \ + for (unwind_start(state, task, regs, first_frame); \ + !unwind_done(state); \ + unwind_next_frame(state)) + +static inline void unwind_init(void) {} +static inline void unwind_module_init(struct module *mod, void *orc_ip, + size_t orc_ip_size, void *orc, + size_t orc_size) {} + +#ifdef CONFIG_KASAN +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) +#else +#define READ_ONCE_TASK_STACK(task, x) READ_ONCE(x) +#endif + +#endif /* _ASM_S390_UNWIND_H */
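
In practice the API boils down to the unwind_for_each_frame() loop; a short sketch that prints a task's call chain, following the same pattern the reworked show_stack() in arch/s390/kernel/dumpstack.c uses further below (example_print_chain is an illustrative name):

/* Sketch: walk and print a call chain with the new unwind API. */
static void example_print_chain(struct task_struct *task)
{
	struct unwind_state state;

	unwind_for_each_frame(&state, task, NULL, 0)
		printk(state.reliable ? " [<%016lx>] %pSR\n" :
					"([<%016lx>] %pSR)\n",
		       state.ip, (void *) state.ip);
	if (unwind_error(&state))
		printk("unwind error\n");
}

diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h new file mode 100644 index 000000000000..ef3c00b049ab --- /dev/null +++ b/arch/s390/include/asm/uv.h @@ -0,0 +1,132 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Ultravisor Interfaces + * + * Copyright IBM Corp. 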
2019 + * + * Author(s): + * Vasily Gorbik <gor@linux.ibm.com> + * Janosch Frank <frankja@linux.ibm.com> + */ +#ifndef _ASM_S390_UV_H +#define _ASM_S390_UV_H + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/bug.h> +#include <asm/page.h> + +#define UVC_RC_EXECUTED 0x0001 +#define UVC_RC_INV_CMD 0x0002 +#define UVC_RC_INV_STATE 0x0003 +#define UVC_RC_INV_LEN 0x0005 +#define UVC_RC_NO_RESUME 0x0007 + +#define UVC_CMD_QUI 0x0001 +#define UVC_CMD_SET_SHARED_ACCESS 0x1000 +#define UVC_CMD_REMOVE_SHARED_ACCESS 0x1001 + +/* Bits in installed uv calls */ +enum uv_cmds_inst { + BIT_UVC_CMD_QUI = 0, + BIT_UVC_CMD_SET_SHARED_ACCESS = 8, + BIT_UVC_CMD_REMOVE_SHARED_ACCESS = 9, +}; + +struct uv_cb_header { + u16 len; + u16 cmd; /* Command Code */ + u16 rc; /* Response Code */ + u16 rrc; /* Return Reason Code */ +} __packed __aligned(8); + +struct uv_cb_qui { + struct uv_cb_header header; + u64 reserved08; + u64 inst_calls_list[4]; + u64 reserved30[15]; +} __packed __aligned(8); + +struct uv_cb_share { + struct uv_cb_header header; + u64 reserved08[3]; + u64 paddr; + u64 reserved28; +} __packed __aligned(8); + +static inline int uv_call(unsigned long r1, unsigned long r2) +{ + int cc; + + asm volatile( + "0: .insn rrf,0xB9A40000,%[r1],%[r2],0,0\n" + " brc 3,0b\n" + " ipm %[cc]\n" + " srl %[cc],28\n" + : [cc] "=d" (cc) + : [r1] "a" (r1), [r2] "a" (r2) + : "memory", "cc"); + return cc; +} + +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +extern int prot_virt_guest; + +static inline int is_prot_virt_guest(void) +{ + return prot_virt_guest; +} + +static inline int share(unsigned long addr, u16 cmd) +{ + struct uv_cb_share uvcb = { + .header.cmd = cmd, + .header.len = sizeof(uvcb), + .paddr = addr + }; + + if (!is_prot_virt_guest()) + return -ENOTSUPP; + /* + * Sharing is page wise; if we encounter addresses that are + * not page aligned, we assume something went wrong. If + * malloced structs are passed to this function, we could leak + * data to the hypervisor. + */ + BUG_ON(addr & ~PAGE_MASK); + + if (!uv_call(0, (u64)&uvcb)) + return 0; + return -EINVAL; +} + +/* + * Guest 2 request to the Ultravisor to make a page shared with the + * hypervisor for IO. + * + * @addr: Real or absolute address of the page to be shared + */ +static inline int uv_set_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_SET_SHARED_ACCESS); +} + +/* + * Guest 2 request to the Ultravisor to make a page unshared. + * + * @addr: Real or absolute address of the page to be unshared + */ +static inline int uv_remove_shared(unsigned long addr) +{ + return share(addr, UVC_CMD_REMOVE_SHARED_ACCESS); +} + +void uv_query_info(void); +#else +#define is_prot_virt_guest() 0 +static inline int uv_set_shared(unsigned long addr) { return 0; } +static inline int uv_remove_shared(unsigned long addr) { return 0; } +static inline void uv_query_info(void) {} +#endif + +#endif /* _ASM_S390_UV_H */
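
Guest-side users are expected to share the backing page of a buffer before handing it to the hypervisor for I/O and to unshare it again afterwards. A hedged sketch; the example_shared_io_page name and the __get_free_page()/__pa() buffer handling are assumptions about the caller, and share() insists on page alignment:

/* Sketch: share one page for I/O, use it, then unshare it again. */
static int example_shared_io_page(void)
{
	unsigned long page = __get_free_page(GFP_KERNEL);
	int rc;

	if (!page)
		return -ENOMEM;
	/* Stubbed out to return 0 when protected virtualization is off. */
	rc = uv_set_shared(__pa(page));
	if (!rc) {
		/* ... perform I/O on the shared page ... */
		rc = uv_remove_shared(__pa(page));
	}
	free_page(page);
	return rc;
}

diff --git a/arch/s390/include/asm/vmlinux.lds.h b/arch/s390/include/asm/vmlinux.lds.h index 2d127f900352..cbe670a6861b 100644 --- a/arch/s390/include/asm/vmlinux.lds.h +++ b/arch/s390/include/asm/vmlinux.lds.h @@ -18,3 +18,16 @@ *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.data*))) \ __boot_data_end = .; \ } + +/* + * .boot.preserved.data is similar to .boot.data, but it is not part of the + * .init section and thus will be preserved for later use in the decompressed + * kernel. + */ +#define BOOT_DATA_PRESERVED \ + . 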
= ALIGN(PAGE_SIZE); \ + .boot.preserved.data : { \ + __boot_data_preserved_start = .; \ + *(SORT_BY_ALIGNMENT(SORT_BY_NAME(.boot.preserved.data*))) \ + __boot_data_preserved_end = .; \ + } diff --git a/arch/s390/include/uapi/asm/ipl.h b/arch/s390/include/uapi/asm/ipl.h new file mode 100644 index 000000000000..fd32b1cd80d2 --- /dev/null +++ b/arch/s390/include/uapi/asm/ipl.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_UAPI_IPL_H +#define _ASM_S390_UAPI_IPL_H + +#include <linux/types.h> + +/* IPL Parameter List header */ +struct ipl_pl_hdr { + __u32 len; + __u8 flags; + __u8 reserved1[2]; + __u8 version; +} __packed; + +#define IPL_PL_FLAG_IPLPS 0x80 +#define IPL_PL_FLAG_SIPL 0x40 +#define IPL_PL_FLAG_IPLSR 0x20 + +/* IPL Parameter Block header */ +struct ipl_pb_hdr { + __u32 len; + __u8 pbt; +} __packed; + +/* IPL Parameter Block types */ +enum ipl_pbt { + IPL_PBT_FCP = 0, + IPL_PBT_SCP_DATA = 1, + IPL_PBT_CCW = 2, +}; + +/* IPL Parameter Block 0 with common fields */ +struct ipl_pb0_common { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; +} __packed; + +#define IPL_PB0_FLAG_LOADPARM 0x80 + +/* IPL Parameter Block 0 for FCP */ +struct ipl_pb0_fcp { + __u32 len; + __u8 pbt; + __u8 reserved1[3]; + __u8 loadparm[8]; + __u8 reserved2[304]; + __u8 opt; + __u8 reserved3[3]; + __u8 cssid; + __u8 reserved4[1]; + __u16 devno; + __u8 reserved5[4]; + __u64 wwpn; + __u64 lun; + __u32 bootprog; + __u8 reserved6[12]; + __u64 br_lba; + __u32 scp_data_len; + __u8 reserved7[260]; + __u8 scp_data[]; +} __packed; + +#define IPL_PB0_FCP_OPT_IPL 0x10 +#define IPL_PB0_FCP_OPT_DUMP 0x20 + +/* IPL Parameter Block 0 for CCW */ +struct ipl_pb0_ccw { + __u32 len; + __u8 pbt; + __u8 flags; + __u8 reserved1[2]; + __u8 loadparm[8]; + __u8 reserved2[84]; + __u16 reserved3 : 13; + __u8 ssid : 3; + __u16 devno; + __u8 vm_flags; + __u8 reserved4[3]; + __u32 vm_parm_len; + __u8 nss_name[8]; + __u8 vm_parm[64]; + __u8 reserved5[8]; +} __packed; + +#define IPL_PB0_CCW_VM_FLAG_NSS 0x80 +#define IPL_PB0_CCW_VM_FLAG_VP 0x40 + +/* IPL Parameter Block 1 for additional SCP data */ +struct ipl_pb1_scp_data { + __u32 len; + __u8 pbt; + __u8 scp_data[]; +} __packed; + +/* IPL Report List header */ +struct ipl_rl_hdr { + __u32 len; + __u8 flags; + __u8 reserved1[2]; + __u8 version; + __u8 reserved2[8]; +} __packed; + +/* IPL Report Block header */ +struct ipl_rb_hdr { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; +} __packed; + +/* IPL Report Block types */ +enum ipl_rbt { + IPL_RBT_CERTIFICATES = 1, + IPL_RBT_COMPONENTS = 2, +}; + +/* IPL Report Block for the certificate list */ +struct ipl_rb_certificate_entry { + __u64 addr; + __u64 len; +} __packed; + +struct ipl_rb_certificates { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_certificate_entry entries[]; +} __packed; + +/* IPL Report Block for the component list */ +struct ipl_rb_component_entry { + __u64 addr; + __u64 len; + __u8 flags; + __u8 reserved1[5]; + __u16 certificate_index; + __u8 reserved2[8]; +}; + +#define IPL_RB_COMPONENT_FLAG_SIGNED 0x80 +#define IPL_RB_COMPONENT_FLAG_VERIFIED 0x40 + +struct ipl_rb_components { + __u32 len; + __u8 rbt; + __u8 reserved1[11]; + struct ipl_rb_component_entry entries[]; +} __packed; + +#endif diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 8a62c7f72e1b..b0478d01a0c5 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -39,6 +39,7 @@ CFLAGS_smp.o := -Wno-nonnull # CFLAGS_stacktrace.o += 
-fno-optimize-sibling-calls CFLAGS_dumpstack.o += -fno-optimize-sibling-calls +CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls # # Pass UTS_MACHINE for user_regset definition @@ -51,7 +52,7 @@ obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o early_nobss.o obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o -obj-y += nospec-branch.o ipl_vmparm.o +obj-y += nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o extra-y += head64.o vmlinux.lds @@ -77,6 +78,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_KEXEC_FILE) += machine_kexec_file.o kexec_image.o obj-$(CONFIG_KEXEC_FILE) += kexec_elf.o +obj-$(CONFIG_IMA) += ima_arch.o + obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o @@ -86,7 +89,7 @@ obj-$(CONFIG_TRACEPOINTS) += trace.o # vdso obj-y += vdso64/ -obj-$(CONFIG_COMPAT) += vdso32/ +obj-$(CONFIG_COMPAT_VDSO) += vdso32/ chkbss := head64.o early_nobss.o include $(srctree)/arch/s390/scripts/Makefile.chkbss diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 164bec175628..41ac4ad21311 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -16,6 +16,7 @@ #include <asm/pgtable.h> #include <asm/gmap.h> #include <asm/nmi.h> +#include <asm/stacktrace.h> int main(void) { diff --git a/arch/s390/kernel/base.S b/arch/s390/kernel/base.S index f268fca67e82..2f39ea57f358 100644 --- a/arch/s390/kernel/base.S +++ b/arch/s390/kernel/base.S @@ -28,6 +28,7 @@ ENTRY(s390_base_mcck_handler) 1: la %r1,4095 lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1) lpswe __LC_MCK_OLD_PSW +ENDPROC(s390_base_mcck_handler) .section .bss .align 8 @@ -48,6 +49,7 @@ ENTRY(s390_base_ext_handler) 1: lmg %r0,%r15,__LC_SAVE_AREA_ASYNC ni __LC_EXT_OLD_PSW+1,0xfd # clear wait state bit lpswe __LC_EXT_OLD_PSW +ENDPROC(s390_base_ext_handler) .section .bss .align 8 @@ -68,6 +70,7 @@ ENTRY(s390_base_pgm_handler) lmg %r0,%r15,__LC_SAVE_AREA_SYNC lpswe __LC_PGM_OLD_PSW 1: lpswe disabled_wait_psw-0b(%r13) +ENDPROC(s390_base_pgm_handler) .align 8 disabled_wait_psw: @@ -79,71 +82,3 @@ disabled_wait_psw: s390_base_pgm_handler_fn: .quad 0 .previous - -# -# Calls diag 308 subcode 1 and continues execution -# -ENTRY(diag308_reset) - larl %r4,.Lctlregs # Save control registers - stctg %c0,%c15,0(%r4) - lg %r2,0(%r4) # Disable lowcore protection - nilh %r2,0xefff - larl %r4,.Lctlreg0 - stg %r2,0(%r4) - lctlg %c0,%c0,0(%r4) - larl %r4,.Lfpctl # Floating point control register - stfpc 0(%r4) - larl %r4,.Lprefix # Save prefix register - stpx 0(%r4) - larl %r4,.Lprefix_zero # Set prefix register to 0 - spx 0(%r4) - larl %r4,.Lcontinue_psw # Save PSW flags - epsw %r2,%r3 - stm %r2,%r3,0(%r4) - larl %r4,.Lrestart_psw # Setup restart PSW at absolute 0 - lghi %r3,0 - lg %r4,0(%r4) # Save PSW - sturg %r4,%r3 # Use sturg, because of large pages - lghi %r1,1 - lghi %r0,0 - diag %r0,%r1,0x308 -.Lrestart_part2: - lhi %r0,0 # Load r0 with zero - lhi %r1,2 # Use mode 2 = ESAME (dump) - sigp %r1,%r0,SIGP_SET_ARCHITECTURE # Switch to ESAME mode - sam64 # Switch to 64 bit addressing mode - larl %r4,.Lctlregs # Restore control registers - lctlg %c0,%c15,0(%r4) - larl %r4,.Lfpctl # Restore floating point ctl register - lfpc 0(%r4) - larl %r4,.Lprefix # Restore prefix register - spx 0(%r4) - larl 
%r4,.Lcontinue_psw # Restore PSW flags - lpswe 0(%r4) -.Lcontinue: - BR_EX %r14 -.align 16 -.Lrestart_psw: - .long 0x00080000,0x80000000 + .Lrestart_part2 - - .section .data..nosave,"aw",@progbits -.align 8 -.Lcontinue_psw: - .quad 0,.Lcontinue - .previous - - .section .bss -.align 8 -.Lctlreg0: - .quad 0 -.Lctlregs: - .rept 16 - .quad 0 - .endr -.Lfpctl: - .long 0 -.Lprefix: - .long 0 -.Lprefix_zero: - .long 0 - .previous diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c index 7edaa733a77f..e9dac9a24d3f 100644 --- a/arch/s390/kernel/diag.c +++ b/arch/s390/kernel/diag.c @@ -13,6 +13,7 @@ #include <linux/debugfs.h> #include <asm/diag.h> #include <asm/trace/diag.h> +#include <asm/sections.h> struct diag_stat { unsigned int counter[NR_DIAG_STAT]; @@ -49,6 +50,9 @@ static const struct diag_desc diag_map[NR_DIAG_STAT] = { [DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" }, }; +struct diag_ops __bootdata_preserved(diag_dma_ops); +struct diag210 *__bootdata_preserved(__diag210_tmp_dma); + static int show_diag_stat(struct seq_file *m, void *v) { struct diag_stat *stat; @@ -139,30 +143,10 @@ EXPORT_SYMBOL(diag_stat_inc_norecursion); /* * Diagnose 14: Input spool file manipulation */ -static inline int __diag14(unsigned long rx, unsigned long ry1, - unsigned long subcode) -{ - register unsigned long _ry1 asm("2") = ry1; - register unsigned long _ry2 asm("3") = subcode; - int rc = 0; - - asm volatile( - " sam31\n" - " diag %2,2,0x14\n" - " sam64\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc), "+d" (_ry2) - : "d" (rx), "d" (_ry1) - : "cc"); - - return rc; -} - int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode) { diag_stat_inc(DIAG_STAT_X014); - return __diag14(rx, ry1, subcode); + return diag_dma_ops.diag14(rx, ry1, subcode); } EXPORT_SYMBOL(diag14); @@ -195,30 +179,17 @@ EXPORT_SYMBOL(diag204); */ int diag210(struct diag210 *addr) { - /* - * diag 210 needs its data below the 2GB border, so we - * use a static data area to be sure - */ - static struct diag210 diag210_tmp; static DEFINE_SPINLOCK(diag210_lock); unsigned long flags; int ccode; spin_lock_irqsave(&diag210_lock, flags); - diag210_tmp = *addr; + *__diag210_tmp_dma = *addr; diag_stat_inc(DIAG_STAT_X210); - asm volatile( - " lhi %0,-1\n" - " sam31\n" - " diag %1,0,0x210\n" - "0: ipm %0\n" - " srl %0,28\n" - "1: sam64\n" - EX_TABLE(0b, 1b) - : "=&d" (ccode) : "a" (&diag210_tmp) : "cc", "memory"); - - *addr = diag210_tmp; + ccode = diag_dma_ops.diag210(__diag210_tmp_dma); + + *addr = *__diag210_tmp_dma; spin_unlock_irqrestore(&diag210_lock, flags); return ccode; @@ -243,27 +214,9 @@ EXPORT_SYMBOL(diag224); /* * Diagnose 26C: Access Certain System Information */ -static inline int __diag26c(void *req, void *resp, enum diag26c_sc subcode) -{ - register unsigned long _req asm("2") = (addr_t) req; - register unsigned long _resp asm("3") = (addr_t) resp; - register unsigned long _subcode asm("4") = subcode; - register unsigned long _rc asm("5") = -EOPNOTSUPP; - - asm volatile( - " sam31\n" - " diag %[rx],%[ry],0x26c\n" - "0: sam64\n" - EX_TABLE(0b,0b) - : "+d" (_rc) - : [rx] "d" (_req), "d" (_resp), [ry] "d" (_subcode) - : "cc", "memory"); - return _rc; -} - int diag26c(void *req, void *resp, enum diag26c_sc subcode) { diag_stat_inc(DIAG_STAT_X26C); - return __diag26c(req, resp, subcode); + return diag_dma_ops.diag26c(req, resp, subcode); } EXPORT_SYMBOL(diag26c); diff --git a/arch/s390/kernel/dumpstack.c b/arch/s390/kernel/dumpstack.c index cb7f55bbe06e..9e87b68be21c 100644 --- 
a/arch/s390/kernel/dumpstack.c +++ b/arch/s390/kernel/dumpstack.c @@ -21,95 +21,124 @@ #include <asm/debug.h> #include <asm/dis.h> #include <asm/ipl.h> +#include <asm/unwind.h> -/* - * For dump_trace we have tree different stack to consider: - * - the panic stack which is used if the kernel stack has overflown - * - the asynchronous interrupt stack (cpu related) - * - the synchronous kernel stack (process related) - * The stack trace can start at any of the three stacks and can potentially - * touch all of them. The order is: panic stack, async stack, sync stack. - */ -static unsigned long __no_sanitize_address -__dump_trace(dump_trace_func_t func, void *data, unsigned long sp, - unsigned long low, unsigned long high) +const char *stack_type_name(enum stack_type type) { - struct stack_frame *sf; - struct pt_regs *regs; - - while (1) { - if (sp < low || sp > high - sizeof(*sf)) - return sp; - sf = (struct stack_frame *) sp; - if (func(data, sf->gprs[8], 0)) - return sp; - /* Follow the backchain. */ - while (1) { - low = sp; - sp = sf->back_chain; - if (!sp) - break; - if (sp <= low || sp > high - sizeof(*sf)) - return sp; - sf = (struct stack_frame *) sp; - if (func(data, sf->gprs[8], 1)) - return sp; - } - /* Zero backchain detected, check for interrupt frame. */ - sp = (unsigned long) (sf + 1); - if (sp <= low || sp > high - sizeof(*regs)) - return sp; - regs = (struct pt_regs *) sp; - if (!user_mode(regs)) { - if (func(data, regs->psw.addr, 1)) - return sp; - } - low = sp; - sp = regs->gprs[15]; + switch (type) { + case STACK_TYPE_TASK: + return "task"; + case STACK_TYPE_IRQ: + return "irq"; + case STACK_TYPE_NODAT: + return "nodat"; + case STACK_TYPE_RESTART: + return "restart"; + default: + return "unknown"; } } -void dump_trace(dump_trace_func_t func, void *data, struct task_struct *task, - unsigned long sp) +static inline bool in_stack(unsigned long sp, struct stack_info *info, + enum stack_type type, unsigned long low, + unsigned long high) +{ + if (sp < low || sp >= high) + return false; + info->type = type; + info->begin = low; + info->end = high; + return true; +} + +static bool in_task_stack(unsigned long sp, struct task_struct *task, + struct stack_info *info) +{ + unsigned long stack; + + stack = (unsigned long) task_stack_page(task); + return in_stack(sp, info, STACK_TYPE_TASK, stack, stack + THREAD_SIZE); +} + +static bool in_irq_stack(unsigned long sp, struct stack_info *info) { - unsigned long frame_size; + unsigned long frame_size, top; frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); -#ifdef CONFIG_CHECK_STACK - sp = __dump_trace(func, data, sp, - S390_lowcore.nodat_stack + frame_size - THREAD_SIZE, - S390_lowcore.nodat_stack + frame_size); -#endif - sp = __dump_trace(func, data, sp, - S390_lowcore.async_stack + frame_size - THREAD_SIZE, - S390_lowcore.async_stack + frame_size); - task = task ?: current; - __dump_trace(func, data, sp, - (unsigned long)task_stack_page(task), - (unsigned long)task_stack_page(task) + THREAD_SIZE); + top = S390_lowcore.async_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_IRQ, top - THREAD_SIZE, top); +} + +static bool in_nodat_stack(unsigned long sp, struct stack_info *info) +{ + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.nodat_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_NODAT, top - THREAD_SIZE, top); } -EXPORT_SYMBOL_GPL(dump_trace); -static int show_address(void *data, unsigned long address, int reliable) +static bool 
in_restart_stack(unsigned long sp, struct stack_info *info) { - if (reliable) - printk(" [<%016lx>] %pSR \n", address, (void *)address); - else - printk("([<%016lx>] %pSR)\n", address, (void *)address); + unsigned long frame_size, top; + + frame_size = STACK_FRAME_OVERHEAD + sizeof(struct pt_regs); + top = S390_lowcore.restart_stack + frame_size; + return in_stack(sp, info, STACK_TYPE_RESTART, top - THREAD_SIZE, top); +} + +int get_stack_info(unsigned long sp, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) +{ + if (!sp) + goto unknown; + + task = task ? : current; + + /* Check per-task stack */ + if (in_task_stack(sp, task, info)) + goto recursion_check; + + if (task != current) + goto unknown; + + /* Check per-cpu stacks */ + if (!in_irq_stack(sp, info) && + !in_nodat_stack(sp, info) && + !in_restart_stack(sp, info)) + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING + "WARNING: stack recursion on stack type %d\n", + info->type); + goto unknown; + } + *visit_mask |= 1UL << info->type; return 0; +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } void show_stack(struct task_struct *task, unsigned long *stack) { - unsigned long sp = (unsigned long) stack; + struct unwind_state state; - if (!sp) - sp = task ? task->thread.ksp : current_stack_pointer(); printk("Call Trace:\n"); - dump_trace(show_address, NULL, task, sp); if (!task) task = current; - debug_show_held_locks(task); + unwind_for_each_frame(&state, task, NULL, (unsigned long) stack) + printk(state.reliable ? " [<%016lx>] %pSR \n" : + "([<%016lx>] %pSR)\n", + state.ip, (void *) state.ip); + debug_show_held_locks(task ? : current); } static void show_last_breaking_event(struct pt_regs *regs) diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index d6edf45f93b9..629f173f60cd 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -30,6 +30,7 @@ #include <asm/sclp.h> #include <asm/facility.h> #include <asm/boot_data.h> +#include <asm/pci_insn.h> #include "entry.h" /* @@ -138,9 +139,9 @@ static void early_pgm_check_handler(void) unsigned long addr; addr = S390_lowcore.program_old_psw.addr; - fixup = search_exception_tables(addr); + fixup = s390_search_extables(addr); if (!fixup) - disabled_wait(0); + disabled_wait(); /* Disable low address protection before storing into lowcore. */ __ctl_store(cr0, 0, 0); cr0_new = cr0 & ~(1UL << 28); @@ -235,6 +236,7 @@ static __init void detect_machine_facilities(void) clock_comparator_max = -1ULL >> 1; __ctl_set_bit(0, 53); } + enable_mio_ctl(); } static inline void save_vector_registers(void) @@ -296,7 +298,7 @@ static void __init check_image_bootable(void) sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); sclp_early_printk("This image does not contain all parts necessary for starting up. 
Use\n"); sclp_early_printk("bzImage or arch/s390/boot/compressed/vmlinux instead.\n"); - disabled_wait(0xbadb007); + disabled_wait(); } void __init startup_init(void) @@ -309,7 +311,6 @@ void __init startup_init(void) setup_facility_list(); detect_machine_type(); setup_arch_string(); - ipl_store_parameters(); setup_boot_command_line(); detect_diag9c(); detect_diag44(); diff --git a/arch/s390/kernel/early_nobss.c b/arch/s390/kernel/early_nobss.c index 8d73f7fae16e..52a3ef959341 100644 --- a/arch/s390/kernel/early_nobss.c +++ b/arch/s390/kernel/early_nobss.c @@ -25,7 +25,7 @@ static void __init reset_tod_clock(void) return; /* TOD clock not running. Set the clock to Unix Epoch. */ if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) - disabled_wait(0); + disabled_wait(); memset(tod_clock_base, 0, 16); *(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH; diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 583d65ef5007..3f4d272577d3 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -224,6 +224,7 @@ ENTRY(__bpon) .globl __bpon BPON BR_EX %r14 +ENDPROC(__bpon) /* * Scheduler resume function, called by switch_to @@ -248,6 +249,7 @@ ENTRY(__switch_to) lmg %r6,%r15,__SF_GPRS(%r15) # load gprs of next task ALTERNATIVE "", ".insn s,0xb2800000,_LPP_OFFSET", 40 BR_EX %r14 +ENDPROC(__switch_to) .L__critical_start: @@ -324,6 +326,7 @@ sie_exit: EX_TABLE(.Lrewind_pad4,.Lsie_fault) EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) +ENDPROC(sie64a) EXPORT_SYMBOL(sie64a) EXPORT_SYMBOL(sie_exit) #endif @@ -358,19 +361,19 @@ ENTRY(system_call) # load address of system call table lg %r10,__THREAD_sysc_table(%r13,%r12) llgh %r8,__PT_INT_CODE+2(%r11) - slag %r8,%r8,2 # shift and test for svc 0 + slag %r8,%r8,3 # shift and test for svc 0 jnz .Lsysc_nr_ok # svc 0: system call number in %r1 llgfr %r1,%r1 # clear high word in r1 cghi %r1,NR_syscalls jnl .Lsysc_nr_ok sth %r1,__PT_INT_CODE+2(%r11) - slag %r8,%r1,2 + slag %r8,%r1,3 .Lsysc_nr_ok: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stg %r2,__PT_ORIG_GPR2(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) - lgf %r9,0(%r8,%r10) # get system call add. + lg %r9,0(%r8,%r10) # get system call add. 
TSTMSK __TI_flags(%r12),_TIF_TRACE jnz .Lsysc_tracesys BASR_EX %r14,%r9 # call sys_xxxx @@ -556,8 +559,8 @@ ENTRY(system_call) lghi %r0,NR_syscalls clgr %r0,%r2 jnh .Lsysc_tracenogo - sllg %r8,%r2,2 - lgf %r9,0(%r8,%r10) + sllg %r8,%r2,3 + lg %r9,0(%r8,%r10) .Lsysc_tracego: lmg %r3,%r7,__PT_R3(%r11) stg %r7,STACK_FRAME_OVERHEAD(%r15) @@ -570,6 +573,7 @@ ENTRY(system_call) lgr %r2,%r11 # pass pointer to pt_regs larl %r14,.Lsysc_return jg do_syscall_trace_exit +ENDPROC(system_call) # # a new process exits the kernel with ret_from_fork @@ -584,10 +588,16 @@ ENTRY(ret_from_fork) jne .Lsysc_tracenogo # it's a kernel thread lmg %r9,%r10,__PT_R9(%r11) # load gprs + la %r2,0(%r10) + BASR_EX %r14,%r9 + j .Lsysc_tracenogo +ENDPROC(ret_from_fork) + ENTRY(kernel_thread_starter) la %r2,0(%r10) BASR_EX %r14,%r9 j .Lsysc_tracenogo +ENDPROC(kernel_thread_starter) /* * Program check handler routine @@ -665,9 +675,9 @@ ENTRY(pgm_check_handler) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) nill %r10,0x007f - sll %r10,2 + sll %r10,3 je .Lpgm_return - lgf %r9,0(%r10,%r1) # load address of handler routine + lg %r9,0(%r10,%r1) # load address of handler routine lgr %r2,%r11 # pass pointer to pt_regs BASR_EX %r14,%r9 # branch to interrupt-handler .Lpgm_return: @@ -698,6 +708,7 @@ ENTRY(pgm_check_handler) stg %r14,__LC_RETURN_PSW+8 lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP lpswe __LC_RETURN_PSW # branch to .Lsysc_per and enable irqs +ENDPROC(pgm_check_handler) /* * IO interrupt handler routine @@ -926,6 +937,7 @@ ENTRY(io_int_handler) ssm __LC_PGM_NEW_PSW # disable I/O and ext. interrupts TRACE_IRQS_OFF j .Lio_return +ENDPROC(io_int_handler) /* * External interrupt handler routine @@ -965,6 +977,7 @@ ENTRY(ext_int_handler) lghi %r3,EXT_INTERRUPT brasl %r14,do_IRQ j .Lio_return +ENDPROC(ext_int_handler) /* * Load idle PSW. The second "half" of this function is in .Lcleanup_idle. @@ -989,6 +1002,7 @@ ENTRY(psw_idle) lpswe __SF_EMPTY(%r15) BR_EX %r14 .Lpsw_idle_end: +ENDPROC(psw_idle) /* * Store floating-point controls and floating-point or vector register @@ -1031,6 +1045,7 @@ ENTRY(save_fpu_regs) .Lsave_fpu_regs_exit: BR_EX %r14 .Lsave_fpu_regs_end: +ENDPROC(save_fpu_regs) EXPORT_SYMBOL(save_fpu_regs) /* @@ -1077,6 +1092,7 @@ load_fpu_regs: .Lload_fpu_regs_exit: BR_EX %r14 .Lload_fpu_regs_end: +ENDPROC(load_fpu_regs) .L__critical_end: @@ -1206,6 +1222,7 @@ ENTRY(mcck_int_handler) lg %r15,__LC_NODAT_STACK la %r11,STACK_FRAME_OVERHEAD(%r15) j .Lmcck_skip +ENDPROC(mcck_int_handler) # # PSW restart interrupt handler @@ -1232,6 +1249,7 @@ ENTRY(restart_int_handler) 2: sigp %r4,%r3,SIGP_STOP # sigp stop to current cpu brc 2,2b 3: j 3b +ENDPROC(restart_int_handler) .section .kprobes.text, "ax" @@ -1241,7 +1259,7 @@ ENTRY(restart_int_handler) * No need to properly save the registers, we are going to panic anyway. * Setup a pt_regs so that show_trace can provide a good call trace. 
*/ -stack_overflow: +ENTRY(stack_overflow) lg %r15,__LC_NODAT_STACK # change to panic stack la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) @@ -1251,9 +1269,10 @@ stack_overflow: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs jg kernel_stack_overflow +ENDPROC(stack_overflow) #endif -cleanup_critical: +ENTRY(cleanup_critical) #if IS_ENABLED(CONFIG_KVM) clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap jl 0f @@ -1289,6 +1308,7 @@ cleanup_critical: clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end jl .Lcleanup_load_fpu_regs 0: BR_EX %r14,%r11 +ENDPROC(cleanup_critical) .align 8 .Lcleanup_table: @@ -1512,7 +1532,7 @@ cleanup_critical: .quad .Lsie_skip - .Lsie_entry #endif .section .rodata, "a" -#define SYSCALL(esame,emu) .long __s390x_ ## esame +#define SYSCALL(esame,emu) .quad __s390x_ ## esame .globl sys_call_table sys_call_table: #include "asm/syscall_table.h" @@ -1520,7 +1540,7 @@ sys_call_table: #ifdef CONFIG_COMPAT -#define SYSCALL(esame,emu) .long __s390_ ## emu +#define SYSCALL(esame,emu) .quad __s390_ ## emu .globl sys_call_table_emu sys_call_table_emu: #include "asm/syscall_table.h" diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index c3816ae108b0..20420c2b8a14 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -65,7 +65,7 @@ int setup_profiling_timer(unsigned int multiplier); void __init time_init(void); int pfn_is_nosave(unsigned long); void s390_early_resume(void); -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip); +unsigned long prepare_ftrace_return(unsigned long parent, unsigned long sp, unsigned long ip); struct s390_mmap_arg_struct; struct fadvise64_64_args; diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 594464f2129d..0da378e2eb25 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -23,7 +23,7 @@ void __kernel_fpu_begin(struct kernel_fpu *state, u32 flags) if (flags & KERNEL_FPC) /* Save floating point control */ - asm volatile("stfpc %0" : "=m" (state->fpc)); + asm volatile("stfpc %0" : "=Q" (state->fpc)); if (!MACHINE_HAS_VX) { if (flags & KERNEL_VXR_V0V7) { diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 39b13d71a8fe..1bb85f60c0dd 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -201,17 +201,18 @@ device_initcall(ftrace_plt_init); * Hook the return address and push it in the stack of return addresses * in current thread info. 
*/ -unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) +unsigned long prepare_ftrace_return(unsigned long ra, unsigned long sp, + unsigned long ip) { if (unlikely(ftrace_graph_is_dead())) goto out; if (unlikely(atomic_read(¤t->tracing_graph_pause))) goto out; ip -= MCOUNT_INSN_SIZE; - if (!function_graph_enter(parent, ip, 0, NULL)) - parent = (unsigned long) return_to_handler; + if (!function_graph_enter(ra, ip, 0, (void *) sp)) + ra = (unsigned long) return_to_handler; out: - return parent; + return ra; } NOKPROBE_SYMBOL(prepare_ftrace_return); diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 56491e636eab..5aea1a527443 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -26,7 +26,6 @@ ENTRY(startup_continue) 0: larl %r1,tod_clock_base mvc 0(16,%r1),__LC_BOOT_CLOCK larl %r13,.LPG1 # get base - lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers larl %r0,boot_vdso_data stg %r0,__LC_VDSO_PER_CPU # @@ -61,22 +60,6 @@ ENTRY(startup_continue) .align 16 .LPG1: -.Lctl: .quad 0x04040000 # cr0: AFP registers & secondary space - .quad 0 # cr1: primary space segment table - .quad .Lduct # cr2: dispatchable unit control table - .quad 0 # cr3: instruction authorization - .quad 0xffff # cr4: instruction authorization - .quad .Lduct # cr5: primary-aste origin - .quad 0 # cr6: I/O interrupts - .quad 0 # cr7: secondary space segment table - .quad 0 # cr8: access registers translation - .quad 0 # cr9: tracing off - .quad 0 # cr10: tracing off - .quad 0 # cr11: tracing off - .quad 0 # cr12: tracing off - .quad 0 # cr13: home space segment table - .quad 0xc0000000 # cr14: machine check handling off - .quad .Llinkage_stack # cr15: linkage stack operations .Lpcmsk:.quad 0x0000000180000000 .L4malign:.quad 0xffffffffffc00000 .Lscan2g:.quad 0x80000000 + 0x20000 - 8 # 2GB + 128K - 8 @@ -84,14 +67,5 @@ ENTRY(startup_continue) .Lparmaddr: .quad PARMAREA .align 64 -.Lduct: .long 0,.Laste,.Laste,0,.Lduald,0,0,0 - .long 0,0,0,0,0,0,0,0 -.Laste: .quad 0,0xffffffffffffffff,0,0,0,0,0,0 - .align 128 -.Lduald:.rept 8 - .long 0x80000000,0,0,0 # invalid access-list entries - .endr -.Llinkage_stack: - .long 0,0,0x89000000,0,0,0,0x8a000000,0 .Ldw: .quad 0x0002000180000000,0x0000000000000000 .Laregs:.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 diff --git a/arch/s390/kernel/ima_arch.c b/arch/s390/kernel/ima_arch.c new file mode 100644 index 000000000000..f3c3e6e1c5d3 --- /dev/null +++ b/arch/s390/kernel/ima_arch.c @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/ima.h> +#include <asm/boot_data.h> + +bool arch_ima_get_secureboot(void) +{ + return ipl_secure_flag; +} + +const char * const *arch_get_ima_policy(void) +{ + return NULL; +} diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 18a5d6317acc..d836af3ccc38 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -31,6 +31,7 @@ #include <asm/os_info.h> #include <asm/sections.h> #include <asm/boot_data.h> +#include <asm/uv.h> #include "entry.h" #define IPL_PARM_BLOCK_VERSION 0 @@ -119,11 +120,15 @@ static char *dump_type_str(enum dump_type type) } } -struct ipl_parameter_block __bootdata(early_ipl_block); -int __bootdata(early_ipl_block_valid); +int __bootdata_preserved(ipl_block_valid); +struct ipl_parameter_block __bootdata_preserved(ipl_block); +int __bootdata_preserved(ipl_secure_flag); -static int ipl_block_valid; -static struct ipl_parameter_block ipl_block; +unsigned long __bootdata_preserved(ipl_cert_list_addr); +unsigned long 
__bootdata_preserved(ipl_cert_list_size); + +unsigned long __bootdata(early_ipl_comp_list_addr); +unsigned long __bootdata(early_ipl_comp_list_size); static int reipl_capabilities = IPL_TYPE_UNKNOWN; @@ -246,11 +251,11 @@ static __init enum ipl_type get_ipl_type(void) if (!ipl_block_valid) return IPL_TYPE_UNKNOWN; - switch (ipl_block.hdr.pbt) { - case DIAG308_IPL_TYPE_CCW: + switch (ipl_block.pb0_hdr.pbt) { + case IPL_PBT_CCW: return IPL_TYPE_CCW; - case DIAG308_IPL_TYPE_FCP: - if (ipl_block.ipl_info.fcp.opt == DIAG308_IPL_OPT_DUMP) + case IPL_PBT_FCP: + if (ipl_block.fcp.opt == IPL_PB0_FCP_OPT_DUMP) return IPL_TYPE_FCP_DUMP; else return IPL_TYPE_FCP; @@ -269,12 +274,35 @@ static ssize_t ipl_type_show(struct kobject *kobj, struct kobj_attribute *attr, static struct kobj_attribute sys_ipl_type_attr = __ATTR_RO(ipl_type); +static ssize_t ipl_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + return sprintf(page, "%i\n", !!ipl_secure_flag); +} + +static struct kobj_attribute sys_ipl_secure_attr = + __ATTR(secure, 0444, ipl_secure_show, NULL); + +static ssize_t ipl_has_secure_show(struct kobject *kobj, + struct kobj_attribute *attr, char *page) +{ + if (MACHINE_IS_LPAR) + return sprintf(page, "%i\n", !!sclp.has_sipl); + else if (MACHINE_IS_VM) + return sprintf(page, "%i\n", !!sclp.has_sipl_g2); + else + return sprintf(page, "%i\n", 0); +} + +static struct kobj_attribute sys_ipl_has_secure_attr = + __ATTR(has_secure, 0444, ipl_has_secure_show, NULL); + static ssize_t ipl_vm_parm_show(struct kobject *kobj, struct kobj_attribute *attr, char *page) { char parm[DIAG308_VMPARM_SIZE + 1] = {}; - if (ipl_block_valid && (ipl_block.hdr.pbt == DIAG308_IPL_TYPE_CCW)) + if (ipl_block_valid && (ipl_block.pb0_hdr.pbt == IPL_PBT_CCW)) ipl_block_get_ascii_vmparm(parm, sizeof(parm), &ipl_block); return sprintf(page, "%s\n", parm); } @@ -287,12 +315,11 @@ static ssize_t sys_ipl_device_show(struct kobject *kobj, { switch (ipl_info.type) { case IPL_TYPE_CCW: - return sprintf(page, "0.%x.%04x\n", ipl_block.ipl_info.ccw.ssid, - ipl_block.ipl_info.ccw.devno); + return sprintf(page, "0.%x.%04x\n", ipl_block.ccw.ssid, + ipl_block.ccw.devno); case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: - return sprintf(page, "0.0.%04x\n", - ipl_block.ipl_info.fcp.devno); + return sprintf(page, "0.0.%04x\n", ipl_block.fcp.devno); default: return 0; } @@ -316,8 +343,8 @@ static ssize_t ipl_scp_data_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - unsigned int size = ipl_block.ipl_info.fcp.scp_data_len; - void *scp_data = &ipl_block.ipl_info.fcp.scp_data; + unsigned int size = ipl_block.fcp.scp_data_len; + void *scp_data = &ipl_block.fcp.scp_data; return memory_read_from_buffer(buf, count, &off, scp_data, size); } @@ -333,13 +360,13 @@ static struct bin_attribute *ipl_fcp_bin_attrs[] = { /* FCP ipl device attributes */ DEFINE_IPL_ATTR_RO(ipl_fcp, wwpn, "0x%016llx\n", - (unsigned long long)ipl_block.ipl_info.fcp.wwpn); + (unsigned long long)ipl_block.fcp.wwpn); DEFINE_IPL_ATTR_RO(ipl_fcp, lun, "0x%016llx\n", - (unsigned long long)ipl_block.ipl_info.fcp.lun); + (unsigned long long)ipl_block.fcp.lun); DEFINE_IPL_ATTR_RO(ipl_fcp, bootprog, "%lld\n", - (unsigned long long)ipl_block.ipl_info.fcp.bootprog); + (unsigned long long)ipl_block.fcp.bootprog); DEFINE_IPL_ATTR_RO(ipl_fcp, br_lba, "%lld\n", - (unsigned long long)ipl_block.ipl_info.fcp.br_lba); + (unsigned long long)ipl_block.fcp.br_lba); static ssize_t ipl_ccw_loadparm_show(struct kobject 
*kobj, struct kobj_attribute *attr, char *page) @@ -365,6 +392,8 @@ static struct attribute *ipl_fcp_attrs[] = { &sys_ipl_fcp_bootprog_attr.attr, &sys_ipl_fcp_br_lba_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -380,6 +409,8 @@ static struct attribute *ipl_ccw_attrs_vm[] = { &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, &sys_ipl_vm_parm_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -387,6 +418,8 @@ static struct attribute *ipl_ccw_attrs_lpar[] = { &sys_ipl_type_attr.attr, &sys_ipl_device_attr.attr, &sys_ipl_ccw_loadparm_attr.attr, + &sys_ipl_secure_attr.attr, + &sys_ipl_has_secure_attr.attr, NULL, }; @@ -495,14 +528,14 @@ static ssize_t reipl_generic_vmparm_store(struct ipl_parameter_block *ipb, if (!(isalnum(buf[i]) || isascii(buf[i]) || isprint(buf[i]))) return -EINVAL; - memset(ipb->ipl_info.ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); - ipb->ipl_info.ccw.vm_parm_len = ip_len; + memset(ipb->ccw.vm_parm, 0, DIAG308_VMPARM_SIZE); + ipb->ccw.vm_parm_len = ip_len; if (ip_len > 0) { - ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; - memcpy(ipb->ipl_info.ccw.vm_parm, buf, ip_len); - ASCEBC(ipb->ipl_info.ccw.vm_parm, ip_len); + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; + memcpy(ipb->ccw.vm_parm, buf, ip_len); + ASCEBC(ipb->ccw.vm_parm, ip_len); } else { - ipb->ipl_info.ccw.vm_flags &= ~DIAG308_VM_FLAGS_VP_VALID; + ipb->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_VP; } return len; @@ -549,8 +582,8 @@ static ssize_t reipl_fcp_scpdata_read(struct file *filp, struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { - size_t size = reipl_block_fcp->ipl_info.fcp.scp_data_len; - void *scp_data = reipl_block_fcp->ipl_info.fcp.scp_data; + size_t size = reipl_block_fcp->fcp.scp_data_len; + void *scp_data = reipl_block_fcp->fcp.scp_data; return memory_read_from_buffer(buf, count, &off, scp_data, size); } @@ -566,17 +599,17 @@ static ssize_t reipl_fcp_scpdata_write(struct file *filp, struct kobject *kobj, if (off) return -EINVAL; - memcpy(reipl_block_fcp->ipl_info.fcp.scp_data, buf, count); + memcpy(reipl_block_fcp->fcp.scp_data, buf, count); if (scpdata_len % 8) { padding = 8 - (scpdata_len % 8); - memset(reipl_block_fcp->ipl_info.fcp.scp_data + scpdata_len, + memset(reipl_block_fcp->fcp.scp_data + scpdata_len, 0, padding); scpdata_len += padding; } - reipl_block_fcp->ipl_info.fcp.scp_data_len = scpdata_len; - reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN + scpdata_len; - reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN + scpdata_len; + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN + scpdata_len; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN + scpdata_len; + reipl_block_fcp->fcp.scp_data_len = scpdata_len; return count; } @@ -590,20 +623,20 @@ static struct bin_attribute *reipl_fcp_bin_attrs[] = { }; DEFINE_IPL_ATTR_RW(reipl_fcp, wwpn, "0x%016llx\n", "%llx\n", - reipl_block_fcp->ipl_info.fcp.wwpn); + reipl_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(reipl_fcp, lun, "0x%016llx\n", "%llx\n", - reipl_block_fcp->ipl_info.fcp.lun); + reipl_block_fcp->fcp.lun); DEFINE_IPL_ATTR_RW(reipl_fcp, bootprog, "%lld\n", "%lld\n", - reipl_block_fcp->ipl_info.fcp.bootprog); + reipl_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(reipl_fcp, br_lba, "%lld\n", "%lld\n", - reipl_block_fcp->ipl_info.fcp.br_lba); + reipl_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(reipl_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", - reipl_block_fcp->ipl_info.fcp.devno); + 
reipl_block_fcp->fcp.devno); static void reipl_get_ascii_loadparm(char *loadparm, struct ipl_parameter_block *ibp) { - memcpy(loadparm, ibp->hdr.loadparm, LOADPARM_LEN); + memcpy(loadparm, ibp->common.loadparm, LOADPARM_LEN); EBCASC(loadparm, LOADPARM_LEN); loadparm[LOADPARM_LEN] = 0; strim(loadparm); @@ -638,11 +671,11 @@ static ssize_t reipl_generic_loadparm_store(struct ipl_parameter_block *ipb, return -EINVAL; } /* initialize loadparm with blanks */ - memset(ipb->hdr.loadparm, ' ', LOADPARM_LEN); + memset(ipb->common.loadparm, ' ', LOADPARM_LEN); /* copy and convert to ebcdic */ - memcpy(ipb->hdr.loadparm, buf, lp_len); - ASCEBC(ipb->hdr.loadparm, LOADPARM_LEN); - ipb->hdr.flags |= DIAG308_FLAGS_LP_VALID; + memcpy(ipb->common.loadparm, buf, lp_len); + ASCEBC(ipb->common.loadparm, LOADPARM_LEN); + ipb->common.flags |= IPL_PB0_FLAG_LOADPARM; return len; } @@ -680,7 +713,7 @@ static struct attribute_group reipl_fcp_attr_group = { }; /* CCW reipl device attributes */ -DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ipl_info.ccw); +DEFINE_IPL_CCW_ATTR_RW(reipl_ccw, device, reipl_block_ccw->ccw); /* NSS wrapper */ static ssize_t reipl_nss_loadparm_show(struct kobject *kobj, @@ -742,7 +775,7 @@ static struct attribute_group reipl_ccw_attr_group_lpar = { static void reipl_get_ascii_nss_name(char *dst, struct ipl_parameter_block *ipb) { - memcpy(dst, ipb->ipl_info.ccw.nss_name, NSS_NAME_SIZE); + memcpy(dst, ipb->ccw.nss_name, NSS_NAME_SIZE); EBCASC(dst, NSS_NAME_SIZE); dst[NSS_NAME_SIZE] = 0; } @@ -770,16 +803,14 @@ static ssize_t reipl_nss_name_store(struct kobject *kobj, if (nss_len > NSS_NAME_SIZE) return -EINVAL; - memset(reipl_block_nss->ipl_info.ccw.nss_name, 0x40, NSS_NAME_SIZE); + memset(reipl_block_nss->ccw.nss_name, 0x40, NSS_NAME_SIZE); if (nss_len > 0) { - reipl_block_nss->ipl_info.ccw.vm_flags |= - DIAG308_VM_FLAGS_NSS_VALID; - memcpy(reipl_block_nss->ipl_info.ccw.nss_name, buf, nss_len); - ASCEBC(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); - EBC_TOUPPER(reipl_block_nss->ipl_info.ccw.nss_name, nss_len); + reipl_block_nss->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_NSS; + memcpy(reipl_block_nss->ccw.nss_name, buf, nss_len); + ASCEBC(reipl_block_nss->ccw.nss_name, nss_len); + EBC_TOUPPER(reipl_block_nss->ccw.nss_name, nss_len); } else { - reipl_block_nss->ipl_info.ccw.vm_flags &= - ~DIAG308_VM_FLAGS_NSS_VALID; + reipl_block_nss->ccw.vm_flags &= ~IPL_PB0_CCW_VM_FLAG_NSS; } return len; @@ -866,15 +897,21 @@ static void __reipl_run(void *unused) { switch (reipl_type) { case IPL_TYPE_CCW: + uv_set_shared(__pa(reipl_block_ccw)); diag308(DIAG308_SET, reipl_block_ccw); + uv_remove_shared(__pa(reipl_block_ccw)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_FCP: + uv_set_shared(__pa(reipl_block_fcp)); diag308(DIAG308_SET, reipl_block_fcp); + uv_remove_shared(__pa(reipl_block_fcp)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_NSS: + uv_set_shared(__pa(reipl_block_nss)); diag308(DIAG308_SET, reipl_block_nss); + uv_remove_shared(__pa(reipl_block_nss)); diag308(DIAG308_LOAD_CLEAR, NULL); break; case IPL_TYPE_UNKNOWN: @@ -883,7 +920,7 @@ static void __reipl_run(void *unused) case IPL_TYPE_FCP_DUMP: break; } - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } static void reipl_run(struct shutdown_trigger *trigger) @@ -893,10 +930,10 @@ static void reipl_run(struct shutdown_trigger *trigger) static void reipl_block_ccw_init(struct ipl_parameter_block *ipb) { - ipb->hdr.len = IPL_PARM_BLK_CCW_LEN; + ipb->hdr.len = IPL_BP_CCW_LEN; 
ipb->hdr.version = IPL_PARM_BLOCK_VERSION; - ipb->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; - ipb->hdr.pbt = DIAG308_IPL_TYPE_CCW; + ipb->pb0_hdr.len = IPL_BP0_CCW_LEN; + ipb->pb0_hdr.pbt = IPL_PBT_CCW; } static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) @@ -904,21 +941,20 @@ static void reipl_block_ccw_fill_parms(struct ipl_parameter_block *ipb) /* LOADPARM */ /* check if read scp info worked and set loadparm */ if (sclp_ipl_info.is_valid) - memcpy(ipb->hdr.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); + memcpy(ipb->ccw.loadparm, &sclp_ipl_info.loadparm, LOADPARM_LEN); else /* read scp info failed: set empty loadparm (EBCDIC blanks) */ - memset(ipb->hdr.loadparm, 0x40, LOADPARM_LEN); - ipb->hdr.flags = DIAG308_FLAGS_LP_VALID; + memset(ipb->ccw.loadparm, 0x40, LOADPARM_LEN); + ipb->ccw.flags = IPL_PB0_FLAG_LOADPARM; /* VM PARM */ if (MACHINE_IS_VM && ipl_block_valid && - (ipl_block.ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID)) { + (ipl_block.ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP)) { - ipb->ipl_info.ccw.vm_flags |= DIAG308_VM_FLAGS_VP_VALID; - ipb->ipl_info.ccw.vm_parm_len = - ipl_block.ipl_info.ccw.vm_parm_len; - memcpy(ipb->ipl_info.ccw.vm_parm, - ipl_block.ipl_info.ccw.vm_parm, DIAG308_VMPARM_SIZE); + ipb->ccw.vm_flags |= IPL_PB0_CCW_VM_FLAG_VP; + ipb->ccw.vm_parm_len = ipl_block.ccw.vm_parm_len; + memcpy(ipb->ccw.vm_parm, + ipl_block.ccw.vm_parm, DIAG308_VMPARM_SIZE); } } @@ -958,8 +994,8 @@ static int __init reipl_ccw_init(void) reipl_block_ccw_init(reipl_block_ccw); if (ipl_info.type == IPL_TYPE_CCW) { - reipl_block_ccw->ipl_info.ccw.ssid = ipl_block.ipl_info.ccw.ssid; - reipl_block_ccw->ipl_info.ccw.devno = ipl_block.ipl_info.ccw.devno; + reipl_block_ccw->ccw.ssid = ipl_block.ccw.ssid; + reipl_block_ccw->ccw.devno = ipl_block.ccw.devno; reipl_block_ccw_fill_parms(reipl_block_ccw); } @@ -997,14 +1033,14 @@ static int __init reipl_fcp_init(void) * is invalid in the SCSI IPL parameter block, so take it * always from sclp_ipl_info. 
*/ - memcpy(reipl_block_fcp->hdr.loadparm, sclp_ipl_info.loadparm, + memcpy(reipl_block_fcp->fcp.loadparm, sclp_ipl_info.loadparm, LOADPARM_LEN); } else { - reipl_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + reipl_block_fcp->hdr.len = IPL_BP_FCP_LEN; reipl_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - reipl_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; - reipl_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - reipl_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_IPL; + reipl_block_fcp->fcp.len = IPL_BP0_FCP_LEN; + reipl_block_fcp->fcp.pbt = IPL_PBT_FCP; + reipl_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_IPL; } reipl_capabilities |= IPL_TYPE_FCP; return 0; @@ -1022,10 +1058,10 @@ static int __init reipl_type_init(void) /* * If we have an OS info reipl block, this will be used */ - if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_FCP) { + if (reipl_block->pb0_hdr.pbt == IPL_PBT_FCP) { memcpy(reipl_block_fcp, reipl_block, size); reipl_type = IPL_TYPE_FCP; - } else if (reipl_block->hdr.pbt == DIAG308_IPL_TYPE_CCW) { + } else if (reipl_block->pb0_hdr.pbt == IPL_PBT_CCW) { memcpy(reipl_block_ccw, reipl_block, size); reipl_type = IPL_TYPE_CCW; } @@ -1070,15 +1106,15 @@ static struct shutdown_action __refdata reipl_action = { /* FCP dump device attributes */ DEFINE_IPL_ATTR_RW(dump_fcp, wwpn, "0x%016llx\n", "%llx\n", - dump_block_fcp->ipl_info.fcp.wwpn); + dump_block_fcp->fcp.wwpn); DEFINE_IPL_ATTR_RW(dump_fcp, lun, "0x%016llx\n", "%llx\n", - dump_block_fcp->ipl_info.fcp.lun); + dump_block_fcp->fcp.lun); DEFINE_IPL_ATTR_RW(dump_fcp, bootprog, "%lld\n", "%lld\n", - dump_block_fcp->ipl_info.fcp.bootprog); + dump_block_fcp->fcp.bootprog); DEFINE_IPL_ATTR_RW(dump_fcp, br_lba, "%lld\n", "%lld\n", - dump_block_fcp->ipl_info.fcp.br_lba); + dump_block_fcp->fcp.br_lba); DEFINE_IPL_ATTR_RW(dump_fcp, device, "0.0.%04llx\n", "0.0.%llx\n", - dump_block_fcp->ipl_info.fcp.devno); + dump_block_fcp->fcp.devno); static struct attribute *dump_fcp_attrs[] = { &sys_dump_fcp_device_attr.attr, @@ -1095,7 +1131,7 @@ static struct attribute_group dump_fcp_attr_group = { }; /* CCW dump device attributes */ -DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ipl_info.ccw); +DEFINE_IPL_CCW_ATTR_RW(dump_ccw, device, dump_block_ccw->ccw); static struct attribute *dump_ccw_attrs[] = { &sys_dump_ccw_device_attr.attr, @@ -1145,7 +1181,9 @@ static struct kset *dump_kset; static void diag308_dump(void *dump_block) { + uv_set_shared(__pa(dump_block)); diag308(DIAG308_SET, dump_block); + uv_remove_shared(__pa(dump_block)); while (1) { if (diag308(DIAG308_LOAD_NORMAL_DUMP, NULL) != 0x302) break; @@ -1187,10 +1225,10 @@ static int __init dump_ccw_init(void) free_page((unsigned long)dump_block_ccw); return rc; } - dump_block_ccw->hdr.len = IPL_PARM_BLK_CCW_LEN; + dump_block_ccw->hdr.len = IPL_BP_CCW_LEN; dump_block_ccw->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_ccw->hdr.blk0_len = IPL_PARM_BLK0_CCW_LEN; - dump_block_ccw->hdr.pbt = DIAG308_IPL_TYPE_CCW; + dump_block_ccw->ccw.len = IPL_BP0_CCW_LEN; + dump_block_ccw->ccw.pbt = IPL_PBT_CCW; dump_capabilities |= DUMP_TYPE_CCW; return 0; } @@ -1209,11 +1247,11 @@ static int __init dump_fcp_init(void) free_page((unsigned long)dump_block_fcp); return rc; } - dump_block_fcp->hdr.len = IPL_PARM_BLK_FCP_LEN; + dump_block_fcp->hdr.len = IPL_BP_FCP_LEN; dump_block_fcp->hdr.version = IPL_PARM_BLOCK_VERSION; - dump_block_fcp->hdr.blk0_len = IPL_PARM_BLK0_FCP_LEN; - dump_block_fcp->hdr.pbt = DIAG308_IPL_TYPE_FCP; - dump_block_fcp->ipl_info.fcp.opt = DIAG308_IPL_OPT_DUMP; + dump_block_fcp->fcp.len 
= IPL_BP0_FCP_LEN; + dump_block_fcp->fcp.pbt = IPL_PBT_FCP; + dump_block_fcp->fcp.opt = IPL_PB0_FCP_OPT_DUMP; dump_capabilities |= DUMP_TYPE_FCP; return 0; } @@ -1337,7 +1375,7 @@ static void stop_run(struct shutdown_trigger *trigger) { if (strcmp(trigger->name, ON_PANIC_STR) == 0 || strcmp(trigger->name, ON_RESTART_STR) == 0) - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); smp_stop_cpu(); } @@ -1572,7 +1610,7 @@ static int __init s390_ipl_init(void) * READ SCP info provides the correct value. */ if (memcmp(sclp_ipl_info.loadparm, str, sizeof(str)) == 0 && ipl_block_valid) - memcpy(sclp_ipl_info.loadparm, ipl_block.hdr.loadparm, LOADPARM_LEN); + memcpy(sclp_ipl_info.loadparm, ipl_block.ccw.loadparm, LOADPARM_LEN); shutdown_actions_init(); shutdown_triggers_init(); return 0; @@ -1657,15 +1695,15 @@ void __init setup_ipl(void) ipl_info.type = get_ipl_type(); switch (ipl_info.type) { case IPL_TYPE_CCW: - ipl_info.data.ccw.dev_id.ssid = ipl_block.ipl_info.ccw.ssid; - ipl_info.data.ccw.dev_id.devno = ipl_block.ipl_info.ccw.devno; + ipl_info.data.ccw.dev_id.ssid = ipl_block.ccw.ssid; + ipl_info.data.ccw.dev_id.devno = ipl_block.ccw.devno; break; case IPL_TYPE_FCP: case IPL_TYPE_FCP_DUMP: ipl_info.data.fcp.dev_id.ssid = 0; - ipl_info.data.fcp.dev_id.devno = ipl_block.ipl_info.fcp.devno; - ipl_info.data.fcp.wwpn = ipl_block.ipl_info.fcp.wwpn; - ipl_info.data.fcp.lun = ipl_block.ipl_info.fcp.lun; + ipl_info.data.fcp.dev_id.devno = ipl_block.fcp.devno; + ipl_info.data.fcp.wwpn = ipl_block.fcp.wwpn; + ipl_info.data.fcp.lun = ipl_block.fcp.lun; break; case IPL_TYPE_NSS: case IPL_TYPE_UNKNOWN: @@ -1675,14 +1713,6 @@ void __init setup_ipl(void) atomic_notifier_chain_register(&panic_notifier_list, &on_panic_nb); } -void __init ipl_store_parameters(void) -{ - if (early_ipl_block_valid) { - memcpy(&ipl_block, &early_ipl_block, sizeof(ipl_block)); - ipl_block_valid = 1; - } -} - void s390_reset_system(void) { /* Disable prefixing */ @@ -1690,5 +1720,139 @@ void s390_reset_system(void) /* Disable lowcore protection */ __ctl_clear_bit(0, 28); - diag308_reset(); + diag_dma_ops.diag308_reset(); +} + +#ifdef CONFIG_KEXEC_FILE + +int ipl_report_add_component(struct ipl_report *report, struct kexec_buf *kbuf, + unsigned char flags, unsigned short cert) +{ + struct ipl_report_component *comp; + + comp = vzalloc(sizeof(*comp)); + if (!comp) + return -ENOMEM; + list_add_tail(&comp->list, &report->components); + + comp->entry.addr = kbuf->mem; + comp->entry.len = kbuf->memsz; + comp->entry.flags = flags; + comp->entry.certificate_index = cert; + + report->size += sizeof(comp->entry); + + return 0; +} + +int ipl_report_add_certificate(struct ipl_report *report, void *key, + unsigned long addr, unsigned long len) +{ + struct ipl_report_certificate *cert; + + cert = vzalloc(sizeof(*cert)); + if (!cert) + return -ENOMEM; + list_add_tail(&cert->list, &report->certificates); + + cert->entry.addr = addr; + cert->entry.len = len; + cert->key = key; + + report->size += sizeof(cert->entry); + report->size += cert->entry.len; + + return 0; +} + +struct ipl_report *ipl_report_init(struct ipl_parameter_block *ipib) +{ + struct ipl_report *report; + + report = vzalloc(sizeof(*report)); + if (!report) + return ERR_PTR(-ENOMEM); + + report->ipib = ipib; + INIT_LIST_HEAD(&report->components); + INIT_LIST_HEAD(&report->certificates); + + report->size = ALIGN(ipib->hdr.len, 8); + report->size += sizeof(struct ipl_rl_hdr); + report->size += sizeof(struct ipl_rb_components); + report->size += sizeof(struct 
ipl_rb_certificates); + + return report; +} + +void *ipl_report_finish(struct ipl_report *report) +{ + struct ipl_report_certificate *cert; + struct ipl_report_component *comp; + struct ipl_rb_certificates *certs; + struct ipl_parameter_block *ipib; + struct ipl_rb_components *comps; + struct ipl_rl_hdr *rl_hdr; + void *buf, *ptr; + + buf = vzalloc(report->size); + if (!buf) + return ERR_PTR(-ENOMEM); + ptr = buf; + + memcpy(ptr, report->ipib, report->ipib->hdr.len); + ipib = ptr; + if (ipl_secure_flag) + ipib->hdr.flags |= IPL_PL_FLAG_SIPL; + ipib->hdr.flags |= IPL_PL_FLAG_IPLSR; + ptr += report->ipib->hdr.len; + ptr = PTR_ALIGN(ptr, 8); + + rl_hdr = ptr; + ptr += sizeof(*rl_hdr); + + comps = ptr; + comps->rbt = IPL_RBT_COMPONENTS; + ptr += sizeof(*comps); + list_for_each_entry(comp, &report->components, list) { + memcpy(ptr, &comp->entry, sizeof(comp->entry)); + ptr += sizeof(comp->entry); + } + comps->len = ptr - (void *)comps; + + certs = ptr; + certs->rbt = IPL_RBT_CERTIFICATES; + ptr += sizeof(*certs); + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, &cert->entry, sizeof(cert->entry)); + ptr += sizeof(cert->entry); + } + certs->len = ptr - (void *)certs; + rl_hdr->len = ptr - (void *)rl_hdr; + + list_for_each_entry(cert, &report->certificates, list) { + memcpy(ptr, cert->key, cert->entry.len); + ptr += cert->entry.len; + } + + BUG_ON(ptr > buf + report->size); + return buf; +} + +int ipl_report_free(struct ipl_report *report) +{ + struct ipl_report_component *comp, *ncomp; + struct ipl_report_certificate *cert, *ncert; + + list_for_each_entry_safe(comp, ncomp, &report->components, list) + vfree(comp); + + list_for_each_entry_safe(cert, ncert, &report->certificates, list) + vfree(cert); + + vfree(report); + + return 0; } + +#endif diff --git a/arch/s390/kernel/ipl_vmparm.c b/arch/s390/kernel/ipl_vmparm.c index 411838c0a0af..af43535a976d 100644 --- a/arch/s390/kernel/ipl_vmparm.c +++ b/arch/s390/kernel/ipl_vmparm.c @@ -11,11 +11,11 @@ size_t ipl_block_get_ascii_vmparm(char *dest, size_t size, char has_lowercase = 0; len = 0; - if ((ipb->ipl_info.ccw.vm_flags & DIAG308_VM_FLAGS_VP_VALID) && - (ipb->ipl_info.ccw.vm_parm_len > 0)) { + if ((ipb->ccw.vm_flags & IPL_PB0_CCW_VM_FLAG_VP) && + (ipb->ccw.vm_parm_len > 0)) { - len = min_t(size_t, size - 1, ipb->ipl_info.ccw.vm_parm_len); - memcpy(dest, ipb->ipl_info.ccw.vm_parm, len); + len = min_t(size_t, size - 1, ipb->ccw.vm_parm_len); + memcpy(dest, ipb->ccw.vm_parm, len); /* If at least one character is lowercase, we assume mixed * case; otherwise we convert everything to lowercase. 
*/ diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 0cd5a5f96729..8371855042dc 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -26,6 +26,7 @@ #include <asm/lowcore.h> #include <asm/irq.h> #include <asm/hw_irq.h> +#include <asm/stacktrace.h> #include "entry.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_stat, irq_stat); @@ -73,7 +74,6 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"}, {.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"}, {.irq = IRQIO_CIO, .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"}, - {.irq = IRQIO_QAI, .name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"}, {.irq = IRQIO_DAS, .name = "DAS", .desc = "[I/O] DASD"}, {.irq = IRQIO_C15, .name = "C15", .desc = "[I/O] 3215"}, {.irq = IRQIO_C70, .name = "C70", .desc = "[I/O] 3270"}, @@ -81,14 +81,16 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = IRQIO_VMR, .name = "VMR", .desc = "[I/O] Unit Record Devices"}, {.irq = IRQIO_LCS, .name = "LCS", .desc = "[I/O] LCS"}, {.irq = IRQIO_CTC, .name = "CTC", .desc = "[I/O] CTC"}, - {.irq = IRQIO_APB, .name = "APB", .desc = "[I/O] AP Bus"}, {.irq = IRQIO_ADM, .name = "ADM", .desc = "[I/O] EADM Subchannel"}, {.irq = IRQIO_CSC, .name = "CSC", .desc = "[I/O] CHSC Subchannel"}, - {.irq = IRQIO_PCI, .name = "PCI", .desc = "[I/O] PCI Interrupt" }, - {.irq = IRQIO_MSI, .name = "MSI", .desc = "[I/O] MSI Interrupt" }, {.irq = IRQIO_VIR, .name = "VIR", .desc = "[I/O] Virtual I/O Devices"}, - {.irq = IRQIO_VAI, .name = "VAI", .desc = "[I/O] Virtual I/O Devices AI"}, - {.irq = IRQIO_GAL, .name = "GAL", .desc = "[I/O] GIB Alert"}, + {.irq = IRQIO_QAI, .name = "QAI", .desc = "[AIO] QDIO Adapter Interrupt"}, + {.irq = IRQIO_APB, .name = "APB", .desc = "[AIO] AP Bus"}, + {.irq = IRQIO_PCF, .name = "PCF", .desc = "[AIO] PCI Floating Interrupt"}, + {.irq = IRQIO_PCD, .name = "PCD", .desc = "[AIO] PCI Directed Interrupt"}, + {.irq = IRQIO_MSI, .name = "MSI", .desc = "[AIO] MSI Interrupt"}, + {.irq = IRQIO_VAI, .name = "VAI", .desc = "[AIO] Virtual I/O Devices AI"}, + {.irq = IRQIO_GAL, .name = "GAL", .desc = "[AIO] GIB Alert"}, {.irq = NMI_NMI, .name = "NMI", .desc = "[NMI] Machine Check"}, {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; @@ -116,6 +118,34 @@ void do_IRQ(struct pt_regs *regs, int irq) set_irq_regs(old_regs); } +static void show_msi_interrupt(struct seq_file *p, int irq) +{ + struct irq_desc *desc; + unsigned long flags; + int cpu; + + irq_lock_sparse(); + desc = irq_to_desc(irq); + if (!desc) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + seq_printf(p, "%3d: ", irq); + for_each_online_cpu(cpu) + seq_printf(p, "%10u ", kstat_irqs_cpu(irq, cpu)); + + if (desc->irq_data.chip) + seq_printf(p, " %8s", desc->irq_data.chip->name); + + if (desc->action) + seq_printf(p, " %s", desc->action->name); + + seq_putc(p, '\n'); + raw_spin_unlock_irqrestore(&desc->lock, flags); +out: + irq_unlock_sparse(); +} + /* * show_interrupts is needed by /proc/interrupts. 
*/ @@ -128,7 +158,7 @@ int show_interrupts(struct seq_file *p, void *v) if (index == 0) { seq_puts(p, " "); for_each_online_cpu(cpu) - seq_printf(p, "CPU%d ", cpu); + seq_printf(p, "CPU%-8d", cpu); seq_putc(p, '\n'); } if (index < NR_IRQS_BASE) { @@ -139,9 +169,10 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); goto out; } - if (index > NR_IRQS_BASE) + if (index < nr_irqs) { + show_msi_interrupt(p, index); goto out; - + } for (index = 0; index < NR_ARCH_IRQS; index++) { seq_printf(p, "%s: ", irqclass_sub_desc[index].name); irq = irqclass_sub_desc[index].irq; diff --git a/arch/s390/kernel/kexec_elf.c b/arch/s390/kernel/kexec_elf.c index 5a286b012043..6d0635ceddd0 100644 --- a/arch/s390/kernel/kexec_elf.c +++ b/arch/s390/kernel/kexec_elf.c @@ -10,19 +10,26 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/kexec.h> +#include <asm/ipl.h> #include <asm/setup.h> -static int kexec_file_add_elf_kernel(struct kimage *image, - struct s390_load_data *data, - char *kernel, unsigned long kernel_len) +static int kexec_file_add_kernel_elf(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; + Elf_Addr entry; + void *kernel; int i, ret; + kernel = image->kernel_buf; ehdr = (Elf_Ehdr *)kernel; buf.image = image; + if (image->type == KEXEC_TYPE_CRASH) + entry = STARTUP_KDUMP_OFFSET; + else + entry = ehdr->e_entry; phdr = (void *)ehdr + ehdr->e_phoff; for (i = 0; i < ehdr->e_phnum; i++, phdr++) { @@ -33,30 +40,27 @@ static int kexec_file_add_elf_kernel(struct kimage *image, buf.bufsz = phdr->p_filesz; buf.mem = ALIGN(phdr->p_paddr, phdr->p_align); + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; buf.memsz = phdr->p_memsz; + data->memsz = ALIGN(data->memsz, phdr->p_align) + buf.memsz; - if (phdr->p_paddr == 0) { + if (entry - phdr->p_paddr < phdr->p_memsz) { data->kernel_buf = buf.buffer; - data->memsz += STARTUP_NORMAL_OFFSET; - - buf.buffer += STARTUP_NORMAL_OFFSET; - buf.bufsz -= STARTUP_NORMAL_OFFSET; - - buf.mem += STARTUP_NORMAL_OFFSET; - buf.memsz -= STARTUP_NORMAL_OFFSET; + data->kernel_mem = buf.mem; + data->parm = buf.buffer + PARMAREA; } - if (image->type == KEXEC_TYPE_CRASH) - buf.mem += crashk_res.start; - + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); ret = kexec_add_buffer(&buf); if (ret) return ret; - - data->memsz += buf.memsz; } - return 0; + return data->memsz ? 0 : -EINVAL; } static void *s390_elf_load(struct kimage *image, @@ -64,11 +68,10 @@ static void *s390_elf_load(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len) { - struct s390_load_data data = {0}; const Elf_Ehdr *ehdr; const Elf_Phdr *phdr; size_t size; - int i, ret; + int i; /* image->fops->probe already checked for valid ELF magic number. 
*/ ehdr = (Elf_Ehdr *)kernel; @@ -101,24 +104,7 @@ static void *s390_elf_load(struct kimage *image, if (size > kernel_len) return ERR_PTR(-EINVAL); - ret = kexec_file_add_elf_kernel(image, &data, kernel, kernel_len); - if (ret) - return ERR_PTR(ret); - - if (!data.memsz) - return ERR_PTR(-EINVAL); - - if (initrd) { - ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); - if (ret) - return ERR_PTR(ret); - } - - ret = kexec_file_add_purgatory(image, &data); - if (ret) - return ERR_PTR(ret); - - return kexec_file_update_kernel(image, &data); + return kexec_file_add_components(image, kexec_file_add_kernel_elf); } static int s390_elf_probe(const char *buf, unsigned long len) @@ -144,4 +130,7 @@ static int s390_elf_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_elf_ops = { .probe = s390_elf_probe, .load = s390_elf_load, +#ifdef CONFIG_KEXEC_VERIFY_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_VERIFY_SIG */ }; diff --git a/arch/s390/kernel/kexec_image.c b/arch/s390/kernel/kexec_image.c index 3800852595e8..58318bf89fd9 100644 --- a/arch/s390/kernel/kexec_image.c +++ b/arch/s390/kernel/kexec_image.c @@ -10,31 +10,34 @@ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/kexec.h> +#include <asm/ipl.h> #include <asm/setup.h> -static int kexec_file_add_image_kernel(struct kimage *image, - struct s390_load_data *data, - char *kernel, unsigned long kernel_len) +static int kexec_file_add_kernel_image(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; - int ret; buf.image = image; - buf.buffer = kernel + STARTUP_NORMAL_OFFSET; - buf.bufsz = kernel_len - STARTUP_NORMAL_OFFSET; + buf.buffer = image->kernel_buf; + buf.bufsz = image->kernel_buf_len; - buf.mem = STARTUP_NORMAL_OFFSET; + buf.mem = 0; if (image->type == KEXEC_TYPE_CRASH) buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - ret = kexec_add_buffer(&buf); + data->kernel_buf = image->kernel_buf; + data->kernel_mem = buf.mem; + data->parm = image->kernel_buf + PARMAREA; + data->memsz += buf.memsz; - data->kernel_buf = kernel; - data->memsz += buf.memsz + STARTUP_NORMAL_OFFSET; - - return ret; + ipl_report_add_component(data->report, &buf, + IPL_RB_COMPONENT_FLAG_SIGNED | + IPL_RB_COMPONENT_FLAG_VERIFIED, + IPL_RB_CERT_UNKNOWN); + return kexec_add_buffer(&buf); } static void *s390_image_load(struct kimage *image, @@ -42,24 +45,7 @@ static void *s390_image_load(struct kimage *image, char *initrd, unsigned long initrd_len, char *cmdline, unsigned long cmdline_len) { - struct s390_load_data data = {0}; - int ret; - - ret = kexec_file_add_image_kernel(image, &data, kernel, kernel_len); - if (ret) - return ERR_PTR(ret); - - if (initrd) { - ret = kexec_file_add_initrd(image, &data, initrd, initrd_len); - if (ret) - return ERR_PTR(ret); - } - - ret = kexec_file_add_purgatory(image, &data); - if (ret) - return ERR_PTR(ret); - - return kexec_file_update_kernel(image, &data); + return kexec_file_add_components(image, kexec_file_add_kernel_image); } static int s390_image_probe(const char *buf, unsigned long len) @@ -73,4 +59,7 @@ static int s390_image_probe(const char *buf, unsigned long len) const struct kexec_file_ops s390_kexec_image_ops = { .probe = s390_image_probe, .load = s390_image_load, +#ifdef CONFIG_KEXEC_VERIFY_SIG + .verify_sig = s390_verify_sig, +#endif /* CONFIG_KEXEC_VERIFY_SIG */ }; diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 7c0a095e9c5f..6f1388391620 100644 --- a/arch/s390/kernel/kprobes.c +++ 
b/arch/s390/kernel/kprobes.c @@ -27,29 +27,30 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); struct kretprobe_blackpoint kretprobe_blacklist[] = { }; -DEFINE_INSN_CACHE_OPS(dmainsn); +DEFINE_INSN_CACHE_OPS(s390_insn); -static void *alloc_dmainsn_page(void) -{ - void *page; +static int insn_page_in_use; +static char insn_page[PAGE_SIZE] __aligned(PAGE_SIZE); - page = (void *) __get_free_page(GFP_KERNEL | GFP_DMA); - if (page) - set_memory_x((unsigned long) page, 1); - return page; +static void *alloc_s390_insn_page(void) +{ + if (xchg(&insn_page_in_use, 1) == 1) + return NULL; + set_memory_x((unsigned long) &insn_page, 1); + return &insn_page; } -static void free_dmainsn_page(void *page) +static void free_s390_insn_page(void *page) { set_memory_nx((unsigned long) page, 1); - free_page((unsigned long)page); + xchg(&insn_page_in_use, 0); } -struct kprobe_insn_cache kprobe_dmainsn_slots = { - .mutex = __MUTEX_INITIALIZER(kprobe_dmainsn_slots.mutex), - .alloc = alloc_dmainsn_page, - .free = free_dmainsn_page, - .pages = LIST_HEAD_INIT(kprobe_dmainsn_slots.pages), +struct kprobe_insn_cache kprobe_s390_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_s390_insn_slots.mutex), + .alloc = alloc_s390_insn_page, + .free = free_s390_insn_page, + .pages = LIST_HEAD_INIT(kprobe_s390_insn_slots.pages), .insn_size = MAX_INSN_SIZE, }; @@ -102,7 +103,7 @@ static int s390_get_insn_slot(struct kprobe *p) */ p->ainsn.insn = NULL; if (is_kernel_addr(p->addr)) - p->ainsn.insn = get_dmainsn_slot(); + p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); return p->ainsn.insn ? 0 : -ENOMEM; @@ -114,7 +115,7 @@ static void s390_free_insn_slot(struct kprobe *p) if (!p->ainsn.insn) return; if (is_kernel_addr(p->addr)) - free_dmainsn_slot(p->ainsn.insn, 0); + free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); p->ainsn.insn = NULL; @@ -572,7 +573,7 @@ static int kprobe_trap_handler(struct pt_regs *regs, int trapnr) * In case the user-specified fault handler returned * zero, try to fix up. 
*/ - entry = search_exception_tables(regs->psw.addr); + entry = s390_search_extables(regs->psw.addr); if (entry) { regs->psw.addr = extable_fixup(entry); return 1; diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index cb582649aba6..8a1ae140c5e2 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -27,6 +27,7 @@ #include <asm/cacheflush.h> #include <asm/os_info.h> #include <asm/set_memory.h> +#include <asm/stacktrace.h> #include <asm/switch_to.h> #include <asm/nmi.h> @@ -95,7 +96,7 @@ static void __do_machine_kdump(void *image) start_kdump(1); /* Die if start_kdump returns */ - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } /* @@ -253,6 +254,9 @@ void arch_crash_save_vmcoreinfo(void) VMCOREINFO_SYMBOL(high_memory); VMCOREINFO_LENGTH(lowcore_ptr, NR_CPUS); mem_assign_absolute(S390_lowcore.vmcore_info, paddr_vmcoreinfo_note()); + vmcoreinfo_append_str("SDMA=%lx\n", __sdma); + vmcoreinfo_append_str("EDMA=%lx\n", __edma); + vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); } void machine_shutdown(void) @@ -280,7 +284,7 @@ static void __do_machine_kexec(void *data) (*data_mover)(&image->head, image->start); /* Die if kexec returns */ - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); } /* diff --git a/arch/s390/kernel/machine_kexec_file.c b/arch/s390/kernel/machine_kexec_file.c index 32023b4f9dc0..fbdd3ea73667 100644 --- a/arch/s390/kernel/machine_kexec_file.c +++ b/arch/s390/kernel/machine_kexec_file.c @@ -8,7 +8,12 @@ */ #include <linux/elf.h> +#include <linux/errno.h> #include <linux/kexec.h> +#include <linux/module.h> +#include <linux/verification.h> +#include <asm/boot_data.h> +#include <asm/ipl.h> #include <asm/setup.h> const struct kexec_file_ops * const kexec_file_loaders[] = { @@ -17,38 +22,78 @@ const struct kexec_file_ops * const kexec_file_loaders[] = { NULL, }; -int *kexec_file_update_kernel(struct kimage *image, - struct s390_load_data *data) -{ - unsigned long *loc; - - if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) - return ERR_PTR(-EINVAL); - - if (image->cmdline_buf_len) - memcpy(data->kernel_buf + COMMAND_LINE_OFFSET, - image->cmdline_buf, image->cmdline_buf_len); - - if (image->type == KEXEC_TYPE_CRASH) { - loc = (unsigned long *)(data->kernel_buf + OLDMEM_BASE_OFFSET); - *loc = crashk_res.start; - - loc = (unsigned long *)(data->kernel_buf + OLDMEM_SIZE_OFFSET); - *loc = crashk_res.end - crashk_res.start + 1; - } +#ifdef CONFIG_KEXEC_VERIFY_SIG +/* + * Module signature information block. 
+ * + * The constituents of the signature section are, in order: + * + * - Signer's name + * - Key identifier + * - Signature data + * - Information block + */ +struct module_signature { + u8 algo; /* Public-key crypto algorithm [0] */ + u8 hash; /* Digest algorithm [0] */ + u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */ + u8 signer_len; /* Length of signer's name [0] */ + u8 key_id_len; /* Length of key identifier [0] */ + u8 __pad[3]; + __be32 sig_len; /* Length of signature data */ +}; - if (image->initrd_buf) { - loc = (unsigned long *)(data->kernel_buf + INITRD_START_OFFSET); - *loc = data->initrd_load_addr; +#define PKEY_ID_PKCS7 2 - loc = (unsigned long *)(data->kernel_buf + INITRD_SIZE_OFFSET); - *loc = image->initrd_buf_len; +int s390_verify_sig(const char *kernel, unsigned long kernel_len) +{ + const unsigned long marker_len = sizeof(MODULE_SIG_STRING) - 1; + struct module_signature *ms; + unsigned long sig_len; + + /* Skip signature verification when not secure IPLed. */ + if (!ipl_secure_flag) + return 0; + + if (marker_len > kernel_len) + return -EKEYREJECTED; + + if (memcmp(kernel + kernel_len - marker_len, MODULE_SIG_STRING, + marker_len)) + return -EKEYREJECTED; + kernel_len -= marker_len; + + ms = (void *)kernel + kernel_len - sizeof(*ms); + kernel_len -= sizeof(*ms); + + sig_len = be32_to_cpu(ms->sig_len); + if (sig_len >= kernel_len) + return -EKEYREJECTED; + kernel_len -= sig_len; + + if (ms->id_type != PKEY_ID_PKCS7) + return -EKEYREJECTED; + + if (ms->algo != 0 || + ms->hash != 0 || + ms->signer_len != 0 || + ms->key_id_len != 0 || + ms->__pad[0] != 0 || + ms->__pad[1] != 0 || + ms->__pad[2] != 0) { + return -EBADMSG; } - return NULL; + return verify_pkcs7_signature(kernel, kernel_len, + kernel + kernel_len, sig_len, + VERIFY_USE_PLATFORM_KEYRING, + VERIFYING_MODULE_SIGNATURE, + NULL, NULL); } +#endif /* CONFIG_KEXEC_VERIFY_SIG */ -static int kexec_file_update_purgatory(struct kimage *image) +static int kexec_file_update_purgatory(struct kimage *image, + struct s390_load_data *data) { u64 entry, type; int ret; @@ -90,7 +135,8 @@ static int kexec_file_update_purgatory(struct kimage *image) return ret; } -int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) +static int kexec_file_add_purgatory(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; int ret; @@ -105,21 +151,21 @@ int kexec_file_add_purgatory(struct kimage *image, struct s390_load_data *data) ret = kexec_load_purgatory(image, &buf); if (ret) return ret; + data->memsz += buf.memsz; - ret = kexec_file_update_purgatory(image); - return ret; + return kexec_file_update_purgatory(image, data); } -int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, - char *initrd, unsigned long initrd_len) +static int kexec_file_add_initrd(struct kimage *image, + struct s390_load_data *data) { struct kexec_buf buf; int ret; buf.image = image; - buf.buffer = initrd; - buf.bufsz = initrd_len; + buf.buffer = image->initrd_buf; + buf.bufsz = image->initrd_buf_len; data->memsz = ALIGN(data->memsz, PAGE_SIZE); buf.mem = data->memsz; @@ -127,11 +173,115 @@ int kexec_file_add_initrd(struct kimage *image, struct s390_load_data *data, buf.mem += crashk_res.start; buf.memsz = buf.bufsz; - data->initrd_load_addr = buf.mem; + data->parm->initrd_start = buf.mem; + data->parm->initrd_size = buf.memsz; data->memsz += buf.memsz; ret = kexec_add_buffer(&buf); - return ret; + if (ret) + return ret; + + return ipl_report_add_component(data->report, &buf, 0, 0); +} + 
+static int kexec_file_add_ipl_report(struct kimage *image, + struct s390_load_data *data) +{ + __u32 *lc_ipl_parmblock_ptr; + unsigned int len, ncerts; + struct kexec_buf buf; + unsigned long addr; + void *ptr, *end; + + buf.image = image; + + data->memsz = ALIGN(data->memsz, PAGE_SIZE); + buf.mem = data->memsz; + if (image->type == KEXEC_TYPE_CRASH) + buf.mem += crashk_res.start; + + ptr = (void *)ipl_cert_list_addr; + end = ptr + ipl_cert_list_size; + ncerts = 0; + while (ptr < end) { + ncerts++; + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ptr += len; + } + + addr = data->memsz + data->report->size; + addr += ncerts * sizeof(struct ipl_rb_certificate_entry); + ptr = (void *)ipl_cert_list_addr; + while (ptr < end) { + len = *(unsigned int *)ptr; + ptr += sizeof(len); + ipl_report_add_certificate(data->report, ptr, addr, len); + addr += len; + ptr += len; + } + + buf.buffer = ipl_report_finish(data->report); + buf.bufsz = data->report->size; + buf.memsz = buf.bufsz; + + data->memsz += buf.memsz; + + lc_ipl_parmblock_ptr = + data->kernel_buf + offsetof(struct lowcore, ipl_parmblock_ptr); + *lc_ipl_parmblock_ptr = (__u32)buf.mem; + + return kexec_add_buffer(&buf); +} + +void *kexec_file_add_components(struct kimage *image, + int (*add_kernel)(struct kimage *image, + struct s390_load_data *data)) +{ + struct s390_load_data data = {0}; + int ret; + + data.report = ipl_report_init(&ipl_block); + if (IS_ERR(data.report)) + return data.report; + + ret = add_kernel(image, &data); + if (ret) + goto out; + + if (image->cmdline_buf_len >= ARCH_COMMAND_LINE_SIZE) { + ret = -EINVAL; + goto out; + } + memcpy(data.parm->command_line, image->cmdline_buf, + image->cmdline_buf_len); + + if (image->type == KEXEC_TYPE_CRASH) { + data.parm->oldmem_base = crashk_res.start; + data.parm->oldmem_size = crashk_res.end - crashk_res.start + 1; + } + + if (image->initrd_buf) { + ret = kexec_file_add_initrd(image, &data); + if (ret) + goto out; + } + + ret = kexec_file_add_purgatory(image, &data); + if (ret) + goto out; + + if (data.kernel_mem == 0) { + unsigned long restart_psw = 0x0008000080000000UL; + restart_psw += image->start; + memcpy(data.kernel_buf, &restart_psw, sizeof(restart_psw)); + image->start = 0; + } + + ret = kexec_file_add_ipl_report(image, &data); +out: + ipl_report_free(data.report); + return ERR_PTR(ret); } int arch_kexec_apply_relocations_add(struct purgatory_info *pi, @@ -140,7 +290,7 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, const Elf_Shdr *symtab) { Elf_Rela *relas; - int i; + int i, r_type; relas = (void *)pi->ehdr + relsec->sh_offset; @@ -174,46 +324,8 @@ int arch_kexec_apply_relocations_add(struct purgatory_info *pi, addr = section->sh_addr + relas[i].r_offset; - switch (ELF64_R_TYPE(relas[i].r_info)) { - case R_390_8: /* Direct 8 bit. */ - *(u8 *)loc = val; - break; - case R_390_12: /* Direct 12 bit. */ - *(u16 *)loc &= 0xf000; - *(u16 *)loc |= val & 0xfff; - break; - case R_390_16: /* Direct 16 bit. */ - *(u16 *)loc = val; - break; - case R_390_20: /* Direct 20 bit. */ - *(u32 *)loc &= 0xf00000ff; - *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ - *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ - break; - case R_390_32: /* Direct 32 bit. */ - *(u32 *)loc = val; - break; - case R_390_64: /* Direct 64 bit. */ - *(u64 *)loc = val; - break; - case R_390_PC16: /* PC relative 16 bit. */ - *(u16 *)loc = (val - addr); - break; - case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. 
*/ - *(u16 *)loc = (val - addr) >> 1; - break; - case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ - *(u32 *)loc = (val - addr) >> 1; - break; - case R_390_PC32: /* PC relative 32 bit. */ - *(u32 *)loc = (val - addr); - break; - case R_390_PC64: /* PC relative 64 bit. */ - *(u64 *)loc = (val - addr); - break; - default: - break; - } + r_type = ELF64_R_TYPE(relas[i].r_info); + arch_kexec_do_relocs(r_type, loc, val, addr); } return 0; } @@ -225,10 +337,8 @@ int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, * load memory in head.S will be accessed, e.g. to register the next * command line. If the next kernel were smaller the current kernel * will panic at load. - * - * 0x11000 = sizeof(head.S) */ - if (buf_len < 0x11000) + if (buf_len < HEAD_END) return -ENOEXEC; return kexec_image_probe_default(image, buf, buf_len); diff --git a/arch/s390/kernel/machine_kexec_reloc.c b/arch/s390/kernel/machine_kexec_reloc.c new file mode 100644 index 000000000000..1dded39239f8 --- /dev/null +++ b/arch/s390/kernel/machine_kexec_reloc.c @@ -0,0 +1,53 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/elf.h> + +int arch_kexec_do_relocs(int r_type, void *loc, unsigned long val, + unsigned long addr) +{ + switch (r_type) { + case R_390_NONE: + break; + case R_390_8: /* Direct 8 bit. */ + *(u8 *)loc = val; + break; + case R_390_12: /* Direct 12 bit. */ + *(u16 *)loc &= 0xf000; + *(u16 *)loc |= val & 0xfff; + break; + case R_390_16: /* Direct 16 bit. */ + *(u16 *)loc = val; + break; + case R_390_20: /* Direct 20 bit. */ + *(u32 *)loc &= 0xf00000ff; + *(u32 *)loc |= (val & 0xfff) << 16; /* DL */ + *(u32 *)loc |= (val & 0xff000) >> 4; /* DH */ + break; + case R_390_32: /* Direct 32 bit. */ + *(u32 *)loc = val; + break; + case R_390_64: /* Direct 64 bit. */ + *(u64 *)loc = val; + break; + case R_390_PC16: /* PC relative 16 bit. */ + *(u16 *)loc = (val - addr); + break; + case R_390_PC16DBL: /* PC relative 16 bit shifted by 1. */ + *(u16 *)loc = (val - addr) >> 1; + break; + case R_390_PC32DBL: /* PC relative 32 bit shifted by 1. */ + *(u32 *)loc = (val - addr) >> 1; + break; + case R_390_PC32: /* PC relative 32 bit. */ + *(u32 *)loc = (val - addr); + break; + case R_390_PC64: /* PC relative 64 bit. */ + *(u64 *)loc = (val - addr); + break; + case R_390_RELATIVE: + *(unsigned long *) loc = val; + break; + default: + return 1; + } + return 0; +} diff --git a/arch/s390/kernel/mcount.S b/arch/s390/kernel/mcount.S index e93fbf02490c..9e1660a6b9db 100644 --- a/arch/s390/kernel/mcount.S +++ b/arch/s390/kernel/mcount.S @@ -20,6 +20,7 @@ ENTRY(ftrace_stub) BR_EX %r14 +ENDPROC(ftrace_stub) #define STACK_FRAME_SIZE (STACK_FRAME_OVERHEAD + __PT_SIZE) #define STACK_PTREGS (STACK_FRAME_OVERHEAD) @@ -28,7 +29,7 @@ ENTRY(ftrace_stub) ENTRY(_mcount) BR_EX %r14 - +ENDPROC(_mcount) EXPORT_SYMBOL(_mcount) ENTRY(ftrace_caller) @@ -61,10 +62,11 @@ ENTRY(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER # The j instruction gets runtime patched to a nop instruction. # See ftrace_enable_ftrace_graph_caller. 
-ENTRY(ftrace_graph_caller) + .globl ftrace_graph_caller +ftrace_graph_caller: j ftrace_graph_caller_end - lg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) - lg %r3,(STACK_PTREGS_PSW+8)(%r15) + lmg %r2,%r3,(STACK_PTREGS_GPRS+14*8)(%r15) + lg %r4,(STACK_PTREGS_PSW+8)(%r15) brasl %r14,prepare_ftrace_return stg %r2,(STACK_PTREGS_GPRS+14*8)(%r15) ftrace_graph_caller_end: @@ -73,6 +75,7 @@ ftrace_graph_caller_end: lg %r1,(STACK_PTREGS_PSW+8)(%r15) lmg %r2,%r15,(STACK_PTREGS_GPRS+2*8)(%r15) BR_EX %r1 +ENDPROC(ftrace_caller) #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -86,5 +89,6 @@ ENTRY(return_to_handler) lgr %r14,%r2 lmg %r2,%r5,32(%r15) BR_EX %r14 +ENDPROC(return_to_handler) #endif diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 8c867b43c8eb..0a487fae763e 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -125,7 +125,7 @@ void nmi_free_per_cpu(struct lowcore *lc) static notrace void s390_handle_damage(void) { smp_emergency_stop(); - disabled_wait((unsigned long) __builtin_return_address(0)); + disabled_wait(); while (1); } NOKPROBE_SYMBOL(s390_handle_damage); diff --git a/arch/s390/kernel/nospec-branch.c b/arch/s390/kernel/nospec-branch.c index bdddaae96559..29e511f5bf06 100644 --- a/arch/s390/kernel/nospec-branch.c +++ b/arch/s390/kernel/nospec-branch.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/module.h> #include <linux/device.h> +#include <linux/cpu.h> #include <asm/nospec-branch.h> static int __init nobp_setup_early(char *str) @@ -37,7 +38,7 @@ static int __init nospec_report(void) { if (test_facility(156)) pr_info("Spectre V2 mitigation: etokens\n"); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) pr_info("Spectre V2 mitigation: execute trampolines\n"); if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) pr_info("Spectre V2 mitigation: limited branch prediction\n"); @@ -58,15 +59,15 @@ early_param("nospectre_v2", nospectre_v2_setup_early); void __init nospec_auto_detect(void) { - if (test_facility(156)) { + if (test_facility(156) || cpu_mitigations_off()) { /* * The machine supports etokens. * Disable expolines and disable nobp. */ - if (IS_ENABLED(CC_USING_EXPOLINE)) + if (__is_defined(CC_USING_EXPOLINE)) nospec_disable = 1; __clear_facility(82, S390_lowcore.alt_stfle_fac_list); - } else if (IS_ENABLED(CC_USING_EXPOLINE)) { + } else if (__is_defined(CC_USING_EXPOLINE)) { /* * The kernel has been compiled with expolines. * Keep expolines enabled and disable nobp. diff --git a/arch/s390/kernel/nospec-sysfs.c b/arch/s390/kernel/nospec-sysfs.c index e30e580ae362..48f472bf9290 100644 --- a/arch/s390/kernel/nospec-sysfs.c +++ b/arch/s390/kernel/nospec-sysfs.c @@ -15,7 +15,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (test_facility(156)) return sprintf(buf, "Mitigation: etokens\n"); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) return sprintf(buf, "Mitigation: execute trampolines\n"); if (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) return sprintf(buf, "Mitigation: limited branch prediction\n"); diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index e1c54d28713a..48d48b6187c0 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,8 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 
2012, 2017 - * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + * Copyright IBM Corp. 2012, 2019 + * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -26,7 +26,7 @@ static enum cpumf_ctr_set get_counter_set(u64 event) set = CPUMF_CTR_SET_USER; else if (event < 128) set = CPUMF_CTR_SET_CRYPTO; - else if (event < 256) + else if (event < 288) set = CPUMF_CTR_SET_EXT; else if (event >= 448 && event < 496) set = CPUMF_CTR_SET_MT_DIAG; @@ -50,12 +50,19 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_CRYPTO: + if ((cpuhw->info.csvn >= 1 && cpuhw->info.csvn <= 5 && + hwc->config > 79) || + (cpuhw->info.csvn >= 6 && hwc->config > 83)) + err = -EOPNOTSUPP; + break; case CPUMF_CTR_SET_EXT: if (cpuhw->info.csvn < 1) err = -EOPNOTSUPP; if ((cpuhw->info.csvn == 1 && hwc->config > 159) || (cpuhw->info.csvn == 2 && hwc->config > 175) || - (cpuhw->info.csvn > 2 && hwc->config > 255)) + (cpuhw->info.csvn >= 3 && cpuhw->info.csvn <= 5 + && hwc->config > 255) || + (cpuhw->info.csvn >= 6 && hwc->config > 287)) err = -EOPNOTSUPP; break; case CPUMF_CTR_SET_MT_DIAG: diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index b6854812d2ed..d4e031f7b9c8 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -306,15 +306,20 @@ static size_t cf_diag_ctrset_size(enum cpumf_ctr_set ctrset, ctrset_size = 2; break; case CPUMF_CTR_SET_CRYPTO: - ctrset_size = 16; + if (info->csvn >= 1 && info->csvn <= 5) + ctrset_size = 16; + else if (info->csvn == 6) + ctrset_size = 20; break; case CPUMF_CTR_SET_EXT: if (info->csvn == 1) ctrset_size = 32; else if (info->csvn == 2) ctrset_size = 48; - else if (info->csvn >= 3) + else if (info->csvn >= 3 && info->csvn <= 5) ctrset_size = 128; + else if (info->csvn == 6) + ctrset_size = 160; break; case CPUMF_CTR_SET_MT_DIAG: if (info->csvn > 3) diff --git a/arch/s390/kernel/perf_cpum_cf_events.c b/arch/s390/kernel/perf_cpum_cf_events.c index b45238c89728..34cc96449b30 100644 --- a/arch/s390/kernel/perf_cpum_cf_events.c +++ b/arch/s390/kernel/perf_cpum_cf_events.c @@ -31,22 +31,26 @@ CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_CPU_CYCLES, 0x0020); CPUMF_EVENT_ATTR(cf_fvn3, PROBLEM_STATE_INSTRUCTIONS, 0x0021); CPUMF_EVENT_ATTR(cf_fvn3, L1D_DIR_WRITES, 0x0004); CPUMF_EVENT_ATTR(cf_fvn3, L1D_PENALTY_CYCLES, 0x0005); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_FUNCTIONS, 0x0040); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_CYCLES, 0x0041); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS, 0x0042); -CPUMF_EVENT_ATTR(cf_svn_generic, PRNG_BLOCKED_CYCLES, 0x0043); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_FUNCTIONS, 0x0044); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_CYCLES, 0x0045); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS, 0x0046); -CPUMF_EVENT_ATTR(cf_svn_generic, SHA_BLOCKED_CYCLES, 0x0047); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_FUNCTIONS, 0x0048); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_CYCLES, 0x0049); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS, 0x004a); -CPUMF_EVENT_ATTR(cf_svn_generic, DEA_BLOCKED_CYCLES, 0x004b); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_FUNCTIONS, 0x004c); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_CYCLES, 0x004d); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS, 0x004e); -CPUMF_EVENT_ATTR(cf_svn_generic, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_FUNCTIONS, 0x0040); +CPUMF_EVENT_ATTR(cf_svn_12345, 
PRNG_CYCLES, 0x0041); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS, 0x0042); +CPUMF_EVENT_ATTR(cf_svn_12345, PRNG_BLOCKED_CYCLES, 0x0043); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_FUNCTIONS, 0x0044); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_CYCLES, 0x0045); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS, 0x0046); +CPUMF_EVENT_ATTR(cf_svn_12345, SHA_BLOCKED_CYCLES, 0x0047); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_FUNCTIONS, 0x0048); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_CYCLES, 0x0049); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS, 0x004a); +CPUMF_EVENT_ATTR(cf_svn_12345, DEA_BLOCKED_CYCLES, 0x004b); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_FUNCTIONS, 0x004c); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_CYCLES, 0x004d); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS, 0x004e); +CPUMF_EVENT_ATTR(cf_svn_12345, AES_BLOCKED_CYCLES, 0x004f); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_FUNCTION_COUNT, 0x0050); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_CYCLES_COUNT, 0x0051); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT, 0x0052); +CPUMF_EVENT_ATTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT, 0x0053); CPUMF_EVENT_ATTR(cf_z10, L1I_L2_SOURCED_WRITES, 0x0080); CPUMF_EVENT_ATTR(cf_z10, L1D_L2_SOURCED_WRITES, 0x0081); CPUMF_EVENT_ATTR(cf_z10, L1I_L3_LOCAL_WRITES, 0x0082); @@ -262,23 +266,47 @@ static struct attribute *cpumcf_fvn3_pmu_event_attr[] __initdata = { NULL, }; -static struct attribute *cpumcf_svn_generic_pmu_event_attr[] __initdata = { - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, PRNG_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, SHA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, DEA_BLOCKED_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, AES_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, AES_CYCLES), - CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_FUNCTIONS), - CPUMF_EVENT_PTR(cf_svn_generic, AES_BLOCKED_CYCLES), +static struct attribute *cpumcf_svn_12345_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + NULL, +}; + +static struct attribute *cpumcf_svn_6_pmu_event_attr[] __initdata = { + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, PRNG_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_CYCLES), + 
CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, SHA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, DEA_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_CYCLES), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_FUNCTIONS), + CPUMF_EVENT_PTR(cf_svn_12345, AES_BLOCKED_CYCLES), + CPUMF_EVENT_PTR(cf_svn_6, ECC_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_CYCLES_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_FUNCTION_COUNT), + CPUMF_EVENT_PTR(cf_svn_6, ECC_BLOCKED_CYCLES_COUNT), NULL, }; @@ -562,7 +590,18 @@ __init const struct attribute_group **cpumf_cf_event_group(void) default: cfvn = none; } - csvn = cpumcf_svn_generic_pmu_event_attr; + + /* Determine version specific crypto set */ + switch (ci.csvn) { + case 1 ... 5: + csvn = cpumcf_svn_12345_pmu_event_attr; + break; + case 6: + csvn = cpumcf_svn_6_pmu_event_attr; + break; + default: + csvn = none; + } /* Determine model-specific counter set(s) */ get_cpu_id(&cpu_id); diff --git a/arch/s390/kernel/perf_event.c b/arch/s390/kernel/perf_event.c index 0d770e513abf..fcb6c2e92b07 100644 --- a/arch/s390/kernel/perf_event.c +++ b/arch/s390/kernel/perf_event.c @@ -21,6 +21,7 @@ #include <asm/lowcore.h> #include <asm/processor.h> #include <asm/sysinfo.h> +#include <asm/unwind.h> const char *perf_pmu_name(void) { @@ -219,20 +220,13 @@ static int __init service_level_perf_register(void) } arch_initcall(service_level_perf_register); -static int __perf_callchain_kernel(void *data, unsigned long address, int reliable) -{ - struct perf_callchain_entry_ctx *entry = data; - - perf_callchain_store(entry, address); - return 0; -} - void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { - if (user_mode(regs)) - return; - dump_trace(__perf_callchain_kernel, entry, NULL, regs->gprs[15]); + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) + perf_callchain_store(entry, state.ip); } /* Perf definitions for PMU event attributes in sysfs */ diff --git a/arch/s390/kernel/pgm_check.S b/arch/s390/kernel/pgm_check.S index 3e62aae34ea3..59dee9d3bebf 100644 --- a/arch/s390/kernel/pgm_check.S +++ b/arch/s390/kernel/pgm_check.S @@ -7,7 +7,7 @@ #include <linux/linkage.h> -#define PGM_CHECK(handler) .long handler +#define PGM_CHECK(handler) .quad handler #define PGM_CHECK_DEFAULT PGM_CHECK(default_trap_handler) /* diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 6e758bb6cd29..63873aa6693f 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -37,6 +37,7 @@ #include <asm/irq.h> #include <asm/nmi.h> #include <asm/smp.h> +#include <asm/stacktrace.h> #include <asm/switch_to.h> #include <asm/runtime_instr.h> #include "entry.h" diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index 6fe2e1875058..5de13307b703 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -109,7 +109,8 @@ static void show_cpu_summary(struct seq_file *m, void *v) { static const char *hwcap_str[] = { "esan3", "zarch", "stfle", "msa", "ldisp", "eimm", "dfp", - "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs" + "edat", "etf3eh", "highgprs", "te", "vx", "vxd", "vxe", "gs", + "vxe2", "vxp", "sort", "dflt" }; static const char * const int_hwcap_str[] = { "sie" diff --git a/arch/s390/kernel/reipl.S 
b/arch/s390/kernel/reipl.S index 7f14adf512c6..4a22163962eb 100644 --- a/arch/s390/kernel/reipl.S +++ b/arch/s390/kernel/reipl.S @@ -73,6 +73,7 @@ ENTRY(store_status) lgr %r9,%r2 lgr %r2,%r3 BR_EX %r9 +ENDPROC(store_status) .section .bss .align 8 diff --git a/arch/s390/kernel/relocate_kernel.S b/arch/s390/kernel/relocate_kernel.S index c97c2d40fe15..fe396673e8a6 100644 --- a/arch/s390/kernel/relocate_kernel.S +++ b/arch/s390/kernel/relocate_kernel.S @@ -58,11 +58,15 @@ ENTRY(relocate_kernel) j .base .done: sgr %r0,%r0 # clear register r0 + cghi %r3,0 + je .diag la %r4,load_psw-.base(%r13) # load psw-address into the register o %r3,4(%r4) # or load address into psw st %r3,4(%r4) mvc 0(8,%r0),0(%r4) # copy psw to absolute address 0 + .diag: diag %r0,%r0,0x308 +ENDPROC(relocate_kernel) .align 8 load_psw: diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 2c642af526ce..f8544d517430 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -50,6 +50,7 @@ #include <linux/compat.h> #include <linux/start_kernel.h> +#include <asm/boot_data.h> #include <asm/ipl.h> #include <asm/facility.h> #include <asm/smp.h> @@ -65,11 +66,13 @@ #include <asm/diag.h> #include <asm/os_info.h> #include <asm/sclp.h> +#include <asm/stacktrace.h> #include <asm/sysinfo.h> #include <asm/numa.h> #include <asm/alternative.h> #include <asm/nospec-branch.h> #include <asm/mem_detect.h> +#include <asm/uv.h> #include "entry.h" /* @@ -89,12 +92,25 @@ char elf_platform[ELF_PLATFORM_SIZE]; unsigned long int_hwcap = 0; +#ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST +int __bootdata_preserved(prot_virt_guest); +#endif + int __bootdata(noexec_disabled); int __bootdata(memory_end_set); unsigned long __bootdata(memory_end); unsigned long __bootdata(max_physmem_end); struct mem_detect_info __bootdata(mem_detect); +struct exception_table_entry *__bootdata_preserved(__start_dma_ex_table); +struct exception_table_entry *__bootdata_preserved(__stop_dma_ex_table); +unsigned long __bootdata_preserved(__swsusp_reset_dma); +unsigned long __bootdata_preserved(__stext_dma); +unsigned long __bootdata_preserved(__etext_dma); +unsigned long __bootdata_preserved(__sdma); +unsigned long __bootdata_preserved(__edma); +unsigned long __bootdata_preserved(__kaslr_offset); + unsigned long VMALLOC_START; EXPORT_SYMBOL(VMALLOC_START); @@ -736,6 +752,15 @@ static void __init reserve_initrd(void) #endif } +/* + * Reserve the memory area used to pass the certificate lists + */ +static void __init reserve_certificate_list(void) +{ + if (ipl_cert_list_addr) + memblock_reserve(ipl_cert_list_addr, ipl_cert_list_size); +} + static void __init reserve_mem_detect_info(void) { unsigned long start, size; @@ -814,9 +839,10 @@ static void __init reserve_kernel(void) { unsigned long start_pfn = PFN_UP(__pa(_end)); - memblock_reserve(0, PARMAREA_END); + memblock_reserve(0, HEAD_END); memblock_reserve((unsigned long)_stext, PFN_PHYS(start_pfn) - (unsigned long)_stext); + memblock_reserve(__sdma, __edma - __sdma); } static void __init setup_memory(void) @@ -914,7 +940,15 @@ static int __init setup_hwcaps(void) elf_hwcap |= HWCAP_S390_VXRS_EXT; if (test_facility(135)) elf_hwcap |= HWCAP_S390_VXRS_BCD; + if (test_facility(148)) + elf_hwcap |= HWCAP_S390_VXRS_EXT2; + if (test_facility(152)) + elf_hwcap |= HWCAP_S390_VXRS_PDE; } + if (test_facility(150)) + elf_hwcap |= HWCAP_S390_SORT; + if (test_facility(151)) + elf_hwcap |= HWCAP_S390_DFLT; /* * Guarded storage support HWCAP_S390_GS is bit 12. 
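The hwcap hunk above maps the new facility bits (148, 152, 150, 151) to ELF
capability flags that userspace can read from the auxiliary vector. A minimal
probe might look like the sketch below; the HWCAP_S390_* bit positions are
assumed from the hwcap_str[] ordering in processor.c (vxe2 = 15, vxp = 16,
sort = 17, dflt = 18), not taken from this diff, and should be cross-checked
against arch/s390/include/asm/elf.h before being relied on.

	#include <stdio.h>
	#include <sys/auxv.h>

	/* Assumed bit positions; verify against asm/elf.h. */
	#define HWCAP_S390_VXRS_EXT2	(1UL << 15)	/* facility 148, "vxe2" */
	#define HWCAP_S390_VXRS_PDE	(1UL << 16)	/* facility 152, "vxp"  */
	#define HWCAP_S390_SORT		(1UL << 17)	/* facility 150, "sort" */
	#define HWCAP_S390_DFLT		(1UL << 18)	/* facility 151, "dflt" */

	int main(void)
	{
		unsigned long hwcap = getauxval(AT_HWCAP);

		printf("vxe2 %d vxp %d sort %d dflt %d\n",
		       !!(hwcap & HWCAP_S390_VXRS_EXT2),
		       !!(hwcap & HWCAP_S390_VXRS_PDE),
		       !!(hwcap & HWCAP_S390_SORT),
		       !!(hwcap & HWCAP_S390_DFLT));
		return 0;
	}
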
@@ -1023,6 +1057,38 @@ static void __init setup_control_program_code(void) } /* + * Print the component list from the IPL report + */ +static void __init log_component_list(void) +{ + struct ipl_rb_component_entry *ptr, *end; + char *str; + + if (!early_ipl_comp_list_addr) + return; + if (ipl_block.hdr.flags & IPL_PL_FLAG_IPLSR) + pr_info("Linux is running with Secure-IPL enabled\n"); + else + pr_info("Linux is running with Secure-IPL disabled\n"); + ptr = (void *) early_ipl_comp_list_addr; + end = (void *) ptr + early_ipl_comp_list_size; + pr_info("The IPL report contains the following components:\n"); + while (ptr < end) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_SIGNED) { + if (ptr->flags & IPL_RB_COMPONENT_FLAG_VERIFIED) + str = "signed, verified"; + else + str = "signed, verification failed"; + } else { + str = "not signed"; + } + pr_info("%016llx - %016llx (%s)\n", + ptr->addr, ptr->addr + ptr->len, str); + ptr++; + } +} + +/* * Setup function called from init/main.c just after the banner * was printed. */ @@ -1042,6 +1108,8 @@ void __init setup_arch(char **cmdline_p) else pr_info("Linux is running as a guest in 64-bit mode\n"); + log_component_list(); + /* Have one command line that is parsed and saved in /proc/cmdline */ /* boot_command_line has been already set up in early.c */ *cmdline_p = boot_command_line; @@ -1073,6 +1141,7 @@ void __init setup_arch(char **cmdline_p) reserve_oldmem(); reserve_kernel(); reserve_initrd(); + reserve_certificate_list(); reserve_mem_detect_info(); memblock_allow_resize(); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index bd197baf1dc3..35fafa2b91a8 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -53,6 +53,7 @@ #include <asm/sigp.h> #include <asm/idle.h> #include <asm/nmi.h> +#include <asm/stacktrace.h> #include <asm/topology.h> #include "entry.h" @@ -689,7 +690,7 @@ void __init smp_save_dump_cpus(void) smp_save_cpu_regs(sa, addr, is_boot_cpu, page); } memblock_free(page, PAGE_SIZE); - diag308_reset(); + diag_dma_ops.diag308_reset(); pcpu_set_smt(0); } #endif /* CONFIG_CRASH_DUMP */ diff --git a/arch/s390/kernel/stacktrace.c b/arch/s390/kernel/stacktrace.c index 460dcfba7d4e..f6a620f854e1 100644 --- a/arch/s390/kernel/stacktrace.c +++ b/arch/s390/kernel/stacktrace.c @@ -11,65 +11,52 @@ #include <linux/stacktrace.h> #include <linux/kallsyms.h> #include <linux/export.h> - -static int __save_address(void *data, unsigned long address, int nosched) -{ - struct stack_trace *trace = data; - - if (nosched && in_sched_functions(address)) - return 0; - if (trace->skip > 0) { - trace->skip--; - return 0; - } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = address; - return 0; - } - return 1; -} - -static int save_address(void *data, unsigned long address, int reliable) -{ - return __save_address(data, address, 0); -} - -static int save_address_nosched(void *data, unsigned long address, int reliable) -{ - return __save_address(data, address, 1); -} +#include <asm/stacktrace.h> +#include <asm/unwind.h> void save_stack_trace(struct stack_trace *trace) { - unsigned long sp; - - sp = current_stack_pointer(); - dump_trace(save_address, trace, NULL, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + struct unwind_state state; + + unwind_for_each_frame(&state, current, NULL, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } } 
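/*
 * The same unwind_for_each_frame() loop drives all three save_stack_trace
 * variants; they differ only in the starting context handed to the unwinder
 * (current, a given task, or a pt_regs snapshot) and in whether scheduler
 * functions are skipped.
 */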
EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { - unsigned long sp; - - sp = tsk->thread.ksp; - if (tsk == current) - sp = current_stack_pointer(); - dump_trace(save_address_nosched, trace, tsk, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + struct unwind_state state; + + unwind_for_each_frame(&state, tsk, NULL, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (in_sched_functions(state.ip)) + continue; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - unsigned long sp; - - sp = kernel_stack_pointer(regs); - dump_trace(save_address, trace, NULL, sp); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) { + if (trace->nr_entries >= trace->max_entries) + break; + if (trace->skip > 0) + trace->skip--; + else + trace->entries[trace->nr_entries++] = state.ip; + } } EXPORT_SYMBOL_GPL(save_stack_trace_regs); diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 993100c31d65..19a3c427801a 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -108,6 +108,7 @@ ENTRY(swsusp_arch_suspend) lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 BR_EX %r14 +ENDPROC(swsusp_arch_suspend) /* * Restore saved memory image to correct place and restore register context. @@ -154,20 +155,13 @@ ENTRY(swsusp_arch_resume) ptlb /* flush tlb */ /* Reset System */ - larl %r1,restart_entry - larl %r2,.Lrestart_diag308_psw - og %r1,0(%r2) - stg %r1,0(%r0) larl %r1,.Lnew_pgm_check_psw epsw %r2,%r3 stm %r2,%r3,0(%r1) mvc __LC_PGM_NEW_PSW(16,%r0),0(%r1) - lghi %r0,0 - diag %r0,%r0,0x308 -restart_entry: - lhi %r1,1 - sigp %r1,%r0,SIGP_SET_ARCHITECTURE - sam64 + larl %r1,__swsusp_reset_dma + lg %r1,0(%r1) + BASR_EX %r14,%r1 #ifdef CONFIG_SMP larl %r1,smp_cpu_mt_shift icm %r1,15,0(%r1) @@ -267,6 +261,7 @@ restore_registers: lmg %r6,%r15,STACK_FRAME_OVERHEAD + __SF_GPRS(%r15) lghi %r2,0 BR_EX %r14 +ENDPROC(swsusp_arch_resume) .section .data..nosave,"aw",@progbits .align 8 @@ -275,8 +270,6 @@ restore_registers: .Lpanic_string: .asciz "Resume not possible because suspend CPU is no longer available\n" .align 8 -.Lrestart_diag308_psw: - .long 0x00080000,0x80000000 .Lrestart_suspend_psw: .quad 0x0000000180000000,restart_suspend .Lnew_pgm_check_psw: diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 02579f95f391..061418f787c3 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -426,3 +426,7 @@ 421 32 rt_sigtimedwait_time64 - compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 - sys_futex 423 32 sched_rr_get_interval_time64 - sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register sys_io_uring_register diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 8003b38c1688..82e81a9f7112 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -49,7 +49,7 @@ void do_report_trap(struct pt_regs *regs, int si_signo, int 
si_code, char *str) report_user_fault(regs, si_signo, 0); } else { const struct exception_table_entry *fixup; - fixup = search_exception_tables(regs->psw.addr); + fixup = s390_search_extables(regs->psw.addr); if (fixup) regs->psw.addr = extable_fixup(fixup); else { @@ -263,5 +263,6 @@ NOKPROBE_SYMBOL(kernel_stack_overflow); void __init trap_init(void) { + sort_extable(__start_dma_ex_table, __stop_dma_ex_table); local_mcck_enable(); } diff --git a/arch/s390/kernel/unwind_bc.c b/arch/s390/kernel/unwind_bc.c new file mode 100644 index 000000000000..57fd4e902f1f --- /dev/null +++ b/arch/s390/kernel/unwind_bc.c @@ -0,0 +1,155 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/task_stack.h> +#include <linux/interrupt.h> +#include <asm/sections.h> +#include <asm/ptrace.h> +#include <asm/bitops.h> +#include <asm/stacktrace.h> +#include <asm/unwind.h> + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + return __kernel_text_address(state->ip) ? state->ip : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool outside_of_stack(struct unwind_state *state, unsigned long sp) +{ + return (sp <= state->sp) || + (sp + sizeof(struct stack_frame) > state->stack_info.end); +} + +static bool update_stack_info(struct unwind_state *state, unsigned long sp) +{ + struct stack_info *info = &state->stack_info; + unsigned long *mask = &state->stack_mask; + + /* New stack pointer leaves the current stack */ + if (get_stack_info(sp, state->task, info, mask) != 0 || + !on_stack(info, sp, sizeof(struct stack_frame))) + /* 'sp' does not point to a valid stack */ + return false; + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + struct stack_frame *sf; + struct pt_regs *regs; + unsigned long sp, ip; + bool reliable; + + regs = state->regs; + if (unlikely(regs)) { + sp = READ_ONCE_TASK_STACK(state->task, regs->gprs[15]); + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = false; + regs = NULL; + } else { + sf = (struct stack_frame *) state->sp; + sp = READ_ONCE_TASK_STACK(state->task, sf->back_chain); + if (likely(sp)) { + /* Non-zero back-chain points to the previous frame */ + if (unlikely(outside_of_stack(state, sp))) { + if (!update_stack_info(state, sp)) + goto out_err; + } + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = true; + } else { + /* No back-chain, look for a pt_regs structure */ + sp = state->sp + STACK_FRAME_OVERHEAD; + if (!on_stack(info, sp, sizeof(struct pt_regs))) + goto out_stop; + regs = (struct pt_regs *) sp; + if (user_mode(regs)) + goto out_stop; + ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + reliable = true; + } + } + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Decode any ftrace redirection */ + if (ip == (unsigned long) return_to_handler) + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + ip, (void *) sp); +#endif + + /* Update unwind state */ + state->sp = sp; + state->ip = ip; + state->regs = regs; + state->reliable = reliable; + return true; + +out_err: + state->error = true; +out_stop: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct 
task_struct *task, + struct pt_regs *regs, unsigned long sp) +{ + struct stack_info *info = &state->stack_info; + unsigned long *mask = &state->stack_mask; + struct stack_frame *sf; + unsigned long ip; + bool reliable; + + memset(state, 0, sizeof(*state)); + state->task = task; + state->regs = regs; + + /* Don't even attempt to start from user mode regs: */ + if (regs && user_mode(regs)) { + info->type = STACK_TYPE_UNKNOWN; + return; + } + + /* Get current stack pointer and initialize stack info */ + if (get_stack_info(sp, task, info, mask) != 0 || + !on_stack(info, sp, sizeof(struct stack_frame))) { + /* Something is wrong with the stack pointer */ + info->type = STACK_TYPE_UNKNOWN; + state->error = true; + return; + } + + /* Get the instruction pointer from pt_regs or the stack frame */ + if (regs) { + ip = READ_ONCE_TASK_STACK(state->task, regs->psw.addr); + reliable = true; + } else { + sf = (struct stack_frame *) sp; + ip = READ_ONCE_TASK_STACK(state->task, sf->gprs[8]); + reliable = false; + } + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* Decode any ftrace redirection */ + if (ip == (unsigned long) return_to_handler) + ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + ip, NULL); +#endif + + /* Update unwind state */ + state->sp = sp; + state->ip = ip; + state->reliable = reliable; +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index e7920a68a12e..243d8b1185bf 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -29,7 +29,7 @@ #include <asm/vdso.h> #include <asm/facility.h> -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO extern char vdso32_start, vdso32_end; static void *vdso32_kbase = &vdso32_start; static unsigned int vdso32_pages; @@ -55,7 +55,7 @@ static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, vdso_pagelist = vdso64_pagelist; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO if (vma->vm_mm->context.compat_mm) { vdso_pagelist = vdso32_pagelist; vdso_pages = vdso32_pages; @@ -76,7 +76,7 @@ static int vdso_mremap(const struct vm_special_mapping *sm, unsigned long vdso_pages; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO if (vma->vm_mm->context.compat_mm) vdso_pages = vdso32_pages; #endif @@ -223,7 +223,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return 0; vdso_pages = vdso64_pages; -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO mm->context.compat_mm = is_compat_task(); if (mm->context.compat_mm) vdso_pages = vdso32_pages; @@ -280,7 +280,7 @@ static int __init vdso_init(void) int i; vdso_init_data(vdso_data); -#ifdef CONFIG_COMPAT +#ifdef CONFIG_COMPAT_VDSO /* Calculate the size of the 32 bit vDSO */ vdso32_pages = ((&vdso32_end - &vdso32_start + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index e76309fbbcb3..aee9ffbccb54 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS_31 += -m31 -s KBUILD_CFLAGS_31 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_31 += -m31 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_31 += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) + -Wl,--hash-style=both $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_31) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_31) diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 
f849ac61c5da..bec19e7e6e1c 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS_64 += -m64 -s KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=both) + -Wl,--hash-style=both $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index 8429ab079715..49d55327de0b 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -72,6 +72,7 @@ SECTIONS __end_ro_after_init = .; RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE) + BOOT_DATA_PRESERVED _edata = .; /* End of data section */ @@ -143,6 +144,18 @@ SECTIONS INIT_DATA_SECTION(0x100) PERCPU_SECTION(0x100) + + .dynsym ALIGN(8) : { + __dynsym_start = .; + *(.dynsym) + __dynsym_end = .; + } + .rela.dyn ALIGN(8) : { + __rela_dyn_start = .; + *(.rela*) + __rela_dyn_end = .; + } + . = ALIGN(PAGE_SIZE); __init_end = .; /* freed after init ends here */ @@ -161,6 +174,12 @@ SECTIONS QUAD(__bss_stop - __bss_start) /* bss_size */ QUAD(__boot_data_start) /* bootdata_off */ QUAD(__boot_data_end - __boot_data_start) /* bootdata_size */ + QUAD(__boot_data_preserved_start) /* bootdata_preserved_off */ + QUAD(__boot_data_preserved_end - + __boot_data_preserved_start) /* bootdata_preserved_size */ + QUAD(__dynsym_start) /* dynsym_start */ + QUAD(__rela_dyn_start) /* rela_dyn_start */ + QUAD(__rela_dyn_end) /* rela_dyn_end */ } :NONE /* Debugging sections. */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index a69a0911ed0e..c475ca49cfc6 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -37,7 +37,7 @@ static inline u64 get_vtimer(void) { u64 timer; - asm volatile("stpt %0" : "=m" (timer)); + asm volatile("stpt %0" : "=Q" (timer)); return timer; } @@ -48,7 +48,7 @@ static inline void set_vtimer(u64 expires) asm volatile( " stpt %0\n" /* Store current cpu timer value */ " spt %1" /* Set new value imm. 
afterwards */ - : "=m" (timer) : "m" (expires)); + : "=Q" (timer) : "Q" (expires)); S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; S390_lowcore.last_update_timer = expires; } @@ -135,8 +135,8 @@ static int do_account_vtime(struct task_struct *tsk) #else " stck %1" /* Store current tod clock value */ #endif - : "=m" (S390_lowcore.last_update_timer), - "=m" (S390_lowcore.last_update_clock)); + : "=Q" (S390_lowcore.last_update_timer), + "=Q" (S390_lowcore.last_update_clock)); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 82162867f378..37503ae62486 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -3194,7 +3194,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_s390_gisc_unregister); -static void gib_alert_irq_handler(struct airq_struct *airq) +static void gib_alert_irq_handler(struct airq_struct *airq, bool floating) { inc_irq_stat(IRQIO_GAL); process_gib_alert_list(); diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index 53008da05190..dc0874f2e203 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -178,6 +178,7 @@ ENTRY(__memset\bits) BR_EX %r14 .L__memset_mvc\bits: mvc \bytes(1,%r1),0(%r1) +ENDPROC(__memset\bits) .endm __MEMSET 16,2,sth diff --git a/arch/s390/mm/Makefile b/arch/s390/mm/Makefile index f5880bfd1b0c..3175413186b9 100644 --- a/arch/s390/mm/Makefile +++ b/arch/s390/mm/Makefile @@ -4,7 +4,7 @@ # obj-y := init.o fault.o extmem.o mmap.o vmem.o maccess.o -obj-y += page-states.o gup.o pageattr.o pgtable.o pgalloc.o +obj-y += page-states.o pageattr.o pgtable.o pgalloc.o obj-$(CONFIG_CMM) += cmm.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 11613362c4e7..c220399ae196 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -247,12 +247,24 @@ static noinline void do_sigsegv(struct pt_regs *regs, int si_code) current); } +const struct exception_table_entry *s390_search_extables(unsigned long addr) +{ + const struct exception_table_entry *fixup; + + fixup = search_extable(__start_dma_ex_table, + __stop_dma_ex_table - __start_dma_ex_table, + addr); + if (!fixup) + fixup = search_exception_tables(addr); + return fixup; +} + static noinline void do_no_context(struct pt_regs *regs) { const struct exception_table_entry *fixup; /* Are we prepared to handle this kernel fault? */ - fixup = search_exception_tables(regs->psw.addr); + fixup = s390_search_extables(regs->psw.addr); if (fixup) { regs->psw.addr = extable_fixup(fixup); return; diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c deleted file mode 100644 index 2809d11c7a28..000000000000 --- a/arch/s390/mm/gup.c +++ /dev/null @@ -1,300 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Lockless get_user_pages_fast for s390 - * - * Copyright IBM Corp. 2010 - * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> - */ -#include <linux/sched.h> -#include <linux/mm.h> -#include <linux/hugetlb.h> -#include <linux/vmstat.h> -#include <linux/pagemap.h> -#include <linux/rwsem.h> -#include <asm/pgtable.h> - -/* - * The performance critical leaf functions are made noinline otherwise gcc - * inlines everything into a single function which results in too much - * register pressure. - */ -static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - pte_t *ptep, pte; - - mask = (write ? 
_PAGE_PROTECT : 0) | _PAGE_INVALID | _PAGE_SPECIAL; - - ptep = ((pte_t *) pmd_deref(pmd)) + pte_index(addr); - do { - pte = *ptep; - barrier(); - /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_protnone(pte)) - return 0; - if ((pte_val(pte) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pte_pfn(pte))); - page = pte_page(pte); - head = compound_head(page); - if (!page_cache_get_speculative(head)) - return 0; - if (unlikely(pte_val(pte) != pte_val(*ptep))) { - put_page(head); - return 0; - } - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - - } while (ptep++, addr += PAGE_SIZE, addr != end); - - return 1; -} - -static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - int refs; - - mask = (write ? _SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; - if ((pmd_val(pmd) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); - - refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON(compound_head(page) != head); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pmd_val(pmd) != pmd_val(*pmdp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - - -static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pmd_t *pmdp, pmd; - - pmdp = (pmd_t *) pudp; - if ((pud_val(pud) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R3) - pmdp = (pmd_t *) pud_deref(pud); - pmdp += pmd_index(addr); - do { - pmd = *pmdp; - barrier(); - next = pmd_addr_end(addr, end); - if (pmd_none(pmd)) - return 0; - if (unlikely(pmd_large(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) - return 0; - if (!gup_huge_pmd(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } else if (!gup_pte_range(pmdp, pmd, addr, next, - write, pages, nr)) - return 0; - } while (pmdp++, addr = next, addr != end); - - return 1; -} - -static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - struct page *head, *page; - unsigned long mask; - int refs; - - mask = (write ? 
_REGION_ENTRY_PROTECT : 0) | _REGION_ENTRY_INVALID; - if ((pud_val(pud) & mask) != 0) - return 0; - VM_BUG_ON(!pfn_valid(pud_pfn(pud))); - - refs = 0; - head = pud_page(pud); - page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - do { - VM_BUG_ON_PAGE(compound_head(page) != head, page); - pages[*nr] = page; - (*nr)++; - page++; - refs++; - } while (addr += PAGE_SIZE, addr != end); - - if (!page_cache_add_speculative(head, refs)) { - *nr -= refs; - return 0; - } - - if (unlikely(pud_val(pud) != pud_val(*pudp))) { - *nr -= refs; - while (refs--) - put_page(head); - return 0; - } - - return 1; -} - -static inline int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - pud_t *pudp, pud; - - pudp = (pud_t *) p4dp; - if ((p4d_val(p4d) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R2) - pudp = (pud_t *) p4d_deref(p4d); - pudp += pud_index(addr); - do { - pud = *pudp; - barrier(); - next = pud_addr_end(addr, end); - if (pud_none(pud)) - return 0; - if (unlikely(pud_large(pud))) { - if (!gup_huge_pud(pudp, pud, addr, next, write, pages, - nr)) - return 0; - } else if (!gup_pmd_range(pudp, pud, addr, next, write, pages, - nr)) - return 0; - } while (pudp++, addr = next, addr != end); - - return 1; -} - -static inline int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, - unsigned long end, int write, struct page **pages, int *nr) -{ - unsigned long next; - p4d_t *p4dp, p4d; - - p4dp = (p4d_t *) pgdp; - if ((pgd_val(pgd) & _REGION_ENTRY_TYPE_MASK) == _REGION_ENTRY_TYPE_R1) - p4dp = (p4d_t *) pgd_deref(pgd); - p4dp += p4d_index(addr); - do { - p4d = *p4dp; - barrier(); - next = p4d_addr_end(addr, end); - if (p4d_none(p4d)) - return 0; - if (!gup_pud_range(p4dp, p4d, addr, next, write, pages, nr)) - return 0; - } while (p4dp++, addr = next, addr != end); - - return 1; -} - -/* - * Like get_user_pages_fast() except its IRQ-safe in that it won't fall - * back to the regular GUP. - * Note a difference with get_user_pages_fast: this always returns the - * number of pages pinned, 0 if no pages were pinned. - */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - struct mm_struct *mm = current->mm; - unsigned long addr, len, end; - unsigned long next, flags; - pgd_t *pgdp, pgd; - int nr = 0; - - start &= PAGE_MASK; - addr = start; - len = (unsigned long) nr_pages << PAGE_SHIFT; - end = start + len; - if ((end <= start) || (end > mm->context.asce_limit)) - return 0; - /* - * local_irq_save() doesn't prevent pagetable teardown, but does - * prevent the pagetables from being freed on s390. - * - * So long as we atomically load page table pointers versus teardown, - * we can follow the address down to the the page and take a ref on it. - */ - local_irq_save(flags); - pgdp = pgd_offset(mm, addr); - do { - pgd = *pgdp; - barrier(); - next = pgd_addr_end(addr, end); - if (pgd_none(pgd)) - break; - if (!gup_p4d_range(pgdp, pgd, addr, next, write, pages, &nr)) - break; - } while (pgdp++, addr = next, addr != end); - local_irq_restore(flags); - - return nr; -} - -/** - * get_user_pages_fast() - pin user pages in memory - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. - * - * Attempt to pin user pages in memory without taking mm->mmap_sem. 
- * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) -{ - int nr, ret; - - might_sleep(); - start &= PAGE_MASK; - nr = __get_user_pages_fast(start, nr_pages, write, pages); - if (nr == nr_pages) - return nr; - - /* Try to get the remaining pages with get_user_pages */ - start += nr << PAGE_SHIFT; - pages += nr; - ret = get_user_pages_unlocked(start, nr_pages - nr, pages, - write ? FOLL_WRITE : 0); - /* Have to be a bit careful with return values */ - if (nr > 0) - ret = (ret < 0) ? nr : ret + nr; - return ret; -} diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 3e82f66d5c61..7cf48eefec8f 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -49,6 +49,8 @@ unsigned long empty_zero_page, zero_page_mask; EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(zero_page_mask); +bool initmem_freed; + static void __init setup_zero_pages(void) { unsigned int order; @@ -148,6 +150,7 @@ void __init mem_init(void) void free_initmem(void) { + initmem_freed = true; __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RW | SET_MEMORY_NX); diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 97b3ee53852b..818deeb1ebc3 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -16,6 +16,7 @@ #include <linux/cpu.h> #include <asm/ctl_reg.h> #include <asm/io.h> +#include <asm/stacktrace.h> static notrace long s390_kernel_write_odd(void *dst, const void *src, size_t size) { diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index db6bb2f97a2c..99e06213a22b 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -290,7 +290,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, tlb_remove_table(tlb, table); } -static void __tlb_remove_table(void *_table) +void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 3; void *table = (void *)((unsigned long) _table ^ mask); @@ -316,67 +316,6 @@ static void __tlb_remove_table(void *_table) } } -static void tlb_remove_table_smp_sync(void *arg) -{ - /* Simply deliver the interrupt */ -} - -static void tlb_remove_table_one(void *table) -{ - /* - * This isn't an RCU grace period and hence the page-tables cannot be - * assumed to be actually RCU-freed. - * - * It is however sufficient for software page-table walkers that rely - * on IRQ disabling. See the comment near struct mmu_table_batch. 
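 * (The table-freeing batch machinery deleted below now lives in the
 * generic mmu_gather code - presumably why only __tlb_remove_table
 * survives in this file.)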
- */ - smp_call_function(tlb_remove_table_smp_sync, NULL, 1); - __tlb_remove_table(table); -} - -static void tlb_remove_table_rcu(struct rcu_head *head) -{ - struct mmu_table_batch *batch; - int i; - - batch = container_of(head, struct mmu_table_batch, rcu); - - for (i = 0; i < batch->nr; i++) - __tlb_remove_table(batch->tables[i]); - - free_page((unsigned long)batch); -} - -void tlb_table_flush(struct mmu_gather *tlb) -{ - struct mmu_table_batch **batch = &tlb->batch; - - if (*batch) { - call_rcu(&(*batch)->rcu, tlb_remove_table_rcu); - *batch = NULL; - } -} - -void tlb_remove_table(struct mmu_gather *tlb, void *table) -{ - struct mmu_table_batch **batch = &tlb->batch; - - tlb->mm->context.flush_mm = 1; - if (*batch == NULL) { - *batch = (struct mmu_table_batch *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - if (*batch == NULL) { - __tlb_flush_mm_lazy(tlb->mm); - tlb_remove_table_one(table); - return; - } - (*batch)->nr = 0; - } - (*batch)->tables[(*batch)->nr++] = table; - if ((*batch)->nr == MAX_TABLE_BATCH) - tlb_flush_mmu(tlb); -} - /* * Base infrastructure required to generate basic asces, region, segment, * and page tables that do not make use of enhanced features like EDAT1. diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 8485d6dc2754..9ebd01219812 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -410,6 +410,7 @@ static inline pmd_t pmdp_flush_lazy(struct mm_struct *mm, return old; } +#ifdef CONFIG_PGSTE static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -427,6 +428,7 @@ static pmd_t *pmd_alloc_map(struct mm_struct *mm, unsigned long addr) pmd = pmd_alloc(mm, pud, addr); return pmd; } +#endif pmd_t pmdp_xchg_direct(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t new) diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 0472e27febdf..b403fa14847d 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -413,6 +413,8 @@ void __init vmem_map_init(void) __set_memory((unsigned long)_sinittext, (unsigned long)(_einittext - _sinittext) >> PAGE_SHIFT, SET_MEMORY_RO | SET_MEMORY_X); + __set_memory(__stext_dma, (__etext_dma - __stext_dma) >> PAGE_SHIFT, + SET_MEMORY_RO | SET_MEMORY_X); pr_info("Write protected kernel read-only data: %luk\n", (unsigned long)(__end_rodata - _stext) >> 10); } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 51dd0267d014..5e7c63033159 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -455,7 +455,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) EMIT4(0xb9040000, REG_2, BPF_REG_0); /* Restore registers */ save_restore_regs(jit, REGS_RESTORE, stack_depth); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { jit->r14_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r14 thunk */ if (test_facility(35)) { @@ -473,7 +473,7 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) /* br %r14 */ _EMIT2(0x07fe); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable && + if (__is_defined(CC_USING_EXPOLINE) && !nospec_disable && (jit->seen & SEEN_FUNC)) { jit->r1_thunk_ip = jit->prg; /* Generate __s390_indirect_jump_r1 thunk */ @@ -999,7 +999,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i /* lg %w1,<d(imm)>(%l) */ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L, EMIT_CONST_U64(func)); - if (IS_ENABLED(CC_USING_EXPOLINE) && !nospec_disable) { + if 
(__is_defined(CC_USING_EXPOLINE) && !nospec_disable) { /* brasl %r14,__s390_indirect_jump_r1 */ EMIT6_PCREL_RILB(0xc0050000, REG_14, jit->r1_thunk_ip); } else { diff --git a/arch/s390/oprofile/init.c b/arch/s390/oprofile/init.c index 43d9525c36fc..7441857df51b 100644 --- a/arch/s390/oprofile/init.c +++ b/arch/s390/oprofile/init.c @@ -13,23 +13,17 @@ #include <linux/oprofile.h> #include <linux/init.h> #include <asm/processor.h> - -static int __s390_backtrace(void *data, unsigned long address, int reliable) -{ - unsigned int *depth = data; - - if (*depth == 0) - return 1; - (*depth)--; - oprofile_add_trace(address); - return 0; -} +#include <asm/unwind.h> static void s390_backtrace(struct pt_regs *regs, unsigned int depth) { - if (user_mode(regs)) - return; - dump_trace(__s390_backtrace, &depth, NULL, regs->gprs[15]); + struct unwind_state state; + + unwind_for_each_frame(&state, current, regs, 0) { + if (depth-- == 0) + break; + oprofile_add_trace(state.ip); + } } int __init oprofile_arch_init(struct oprofile_operations *ops) diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index 22d0871291ee..748626a33028 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -3,5 +3,5 @@ # Makefile for the s390 PCI subsystem. # -obj-$(CONFIG_PCI) += pci.o pci_dma.o pci_clp.o pci_sysfs.o \ +obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index dc9bc82c072c..0ebb7c405a25 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -24,11 +24,9 @@ #include <linux/err.h> #include <linux/export.h> #include <linux/delay.h> -#include <linux/irq.h> -#include <linux/kernel_stat.h> #include <linux/seq_file.h> +#include <linux/jump_label.h> #include <linux/pci.h> -#include <linux/msi.h> #include <asm/isc.h> #include <asm/airq.h> @@ -37,30 +35,13 @@ #include <asm/pci_clp.h> #include <asm/pci_dma.h> -#define DEBUG /* enable pr_debug */ - -#define SIC_IRQ_MODE_ALL 0 -#define SIC_IRQ_MODE_SINGLE 1 - -#define ZPCI_NR_DMA_SPACES 1 -#define ZPCI_NR_DEVICES CONFIG_PCI_NR_FUNCTIONS - /* list of all detected zpci devices */ static LIST_HEAD(zpci_list); static DEFINE_SPINLOCK(zpci_list_lock); -static struct irq_chip zpci_irq_chip = { - .name = "zPCI", - .irq_unmask = pci_msi_unmask_irq, - .irq_mask = pci_msi_mask_irq, -}; - static DECLARE_BITMAP(zpci_domain, ZPCI_NR_DEVICES); static DEFINE_SPINLOCK(zpci_domain_lock); -static struct airq_iv *zpci_aisb_iv; -static struct airq_iv *zpci_aibv[ZPCI_NR_DEVICES]; - #define ZPCI_IOMAP_ENTRIES \ min(((unsigned long) ZPCI_NR_DEVICES * PCI_BAR_COUNT / 2), \ ZPCI_IOMAP_MAX_ENTRIES) @@ -70,6 +51,8 @@ static unsigned long *zpci_iomap_bitmap; struct zpci_iomap_entry *zpci_iomap_start; EXPORT_SYMBOL_GPL(zpci_iomap_start); +DEFINE_STATIC_KEY_FALSE(have_mio); + static struct kmem_cache *zdev_fmb_cache; struct zpci_dev *get_zdev_by_fid(u32 fid) @@ -123,39 +106,6 @@ int pci_proc_domain(struct pci_bus *bus) } EXPORT_SYMBOL_GPL(pci_proc_domain); -/* Modify PCI: Register adapter interruptions */ -static int zpci_set_airq(struct zpci_dev *zdev) -{ - u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); - struct zpci_fib fib = {0}; - u8 status; - - fib.isc = PCI_ISC; - fib.sum = 1; /* enable summary notifications */ - fib.noi = airq_iv_end(zdev->aibv); - fib.aibv = (unsigned long) zdev->aibv->vector; - fib.aibvo = 0; /* each zdev has its own interrupt vector */ - fib.aisb = (unsigned long) zpci_aisb_iv->vector + (zdev->aisb/64)*8; - fib.aisbo = 
zdev->aisb & 63; - - return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; -} - -/* Modify PCI: Unregister adapter interruptions */ -static int zpci_clear_airq(struct zpci_dev *zdev) -{ - u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); - struct zpci_fib fib = {0}; - u8 cc, status; - - cc = zpci_mod_fc(req, &fib, &status); - if (cc == 3 || (cc == 1 && status == 24)) - /* Function already gone or IRQs already deregistered. */ - cc = 0; - - return cc ? -EIO : 0; -} - /* Modify PCI: Register I/O address translation parameters */ int zpci_register_ioat(struct zpci_dev *zdev, u8 dmaas, u64 base, u64 limit, u64 iota) @@ -241,7 +191,7 @@ static int zpci_cfg_load(struct zpci_dev *zdev, int offset, u32 *val, u8 len) u64 data; int rc; - rc = zpci_load(&data, req, offset); + rc = __zpci_load(&data, req, offset); if (!rc) { data = le64_to_cpu((__force __le64) data); data >>= (8 - len) * 8; @@ -259,7 +209,7 @@ static int zpci_cfg_store(struct zpci_dev *zdev, int offset, u32 val, u8 len) data <<= (8 - len) * 8; data = (__force u64) cpu_to_le64(data); - rc = zpci_store(data, req, offset); + rc = __zpci_store(data, req, offset); return rc; } @@ -276,18 +226,48 @@ void __iowrite64_copy(void __iomem *to, const void *from, size_t count) zpci_memcpy_toio(to, from, count); } +void __iomem *ioremap(unsigned long ioaddr, unsigned long size) +{ + struct vm_struct *area; + unsigned long offset; + + if (!size) + return NULL; + + if (!static_branch_unlikely(&have_mio)) + return (void __iomem *) ioaddr; + + offset = ioaddr & ~PAGE_MASK; + ioaddr &= PAGE_MASK; + size = PAGE_ALIGN(size + offset); + area = get_vm_area(size, VM_IOREMAP); + if (!area) + return NULL; + + if (ioremap_page_range((unsigned long) area->addr, + (unsigned long) area->addr + size, + ioaddr, PAGE_KERNEL)) { + vunmap(area->addr); + return NULL; + } + return (void __iomem *) ((unsigned long) area->addr + offset); +} +EXPORT_SYMBOL(ioremap); + +void iounmap(volatile void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + vunmap((__force void *) ((unsigned long) addr & PAGE_MASK)); +} +EXPORT_SYMBOL(iounmap); + /* Create a virtual mapping cookie for a PCI BAR */ -void __iomem *pci_iomap_range(struct pci_dev *pdev, - int bar, - unsigned long offset, - unsigned long max) +static void __iomem *pci_iomap_range_fh(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) { struct zpci_dev *zdev = to_zpci(pdev); int idx; - if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) - return NULL; - idx = zdev->bars[bar].map_idx; spin_lock(&zpci_iomap_lock); /* Detect overrun */ @@ -298,6 +278,30 @@ void __iomem *pci_iomap_range(struct pci_dev *pdev, return (void __iomem *) ZPCI_ADDR(idx) + offset; } + +static void __iomem *pci_iomap_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, + unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wt, barsize); + return iova ? 
iova + offset : iova; +} + +void __iomem *pci_iomap_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} EXPORT_SYMBOL(pci_iomap_range); void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) @@ -306,7 +310,37 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) } EXPORT_SYMBOL(pci_iomap); -void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) +static void __iomem *pci_iomap_wc_range_mio(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + unsigned long barsize = pci_resource_len(pdev, bar); + struct zpci_dev *zdev = to_zpci(pdev); + void __iomem *iova; + + iova = ioremap((unsigned long) zdev->bars[bar].mio_wb, barsize); + return iova ? iova + offset : iova; +} + +void __iomem *pci_iomap_wc_range(struct pci_dev *pdev, int bar, + unsigned long offset, unsigned long max) +{ + if (!pci_resource_len(pdev, bar) || bar >= PCI_BAR_COUNT) + return NULL; + + if (static_branch_likely(&have_mio)) + return pci_iomap_wc_range_mio(pdev, bar, offset, max); + else + return pci_iomap_range_fh(pdev, bar, offset, max); +} +EXPORT_SYMBOL(pci_iomap_wc_range); + +void __iomem *pci_iomap_wc(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + return pci_iomap_wc_range(dev, bar, 0, maxlen); +} +EXPORT_SYMBOL(pci_iomap_wc); + +static void pci_iounmap_fh(struct pci_dev *pdev, void __iomem *addr) { unsigned int idx = ZPCI_IDX(addr); @@ -319,6 +353,19 @@ void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) } spin_unlock(&zpci_iomap_lock); } + +static void pci_iounmap_mio(struct pci_dev *pdev, void __iomem *addr) +{ + iounmap(addr); +} + +void pci_iounmap(struct pci_dev *pdev, void __iomem *addr) +{ + if (static_branch_likely(&have_mio)) + pci_iounmap_mio(pdev, addr); + else + pci_iounmap_fh(pdev, addr); +} EXPORT_SYMBOL(pci_iounmap); static int pci_read(struct pci_bus *bus, unsigned int devfn, int where, @@ -354,136 +401,6 @@ static struct pci_ops pci_root_ops = { .write = pci_write, }; -static void zpci_irq_handler(struct airq_struct *airq) -{ - unsigned long si, ai; - struct airq_iv *aibv; - int irqs_on = 0; - - inc_irq_stat(IRQIO_PCI); - for (si = 0;;) { - /* Scan adapter summary indicator bit vector */ - si = airq_iv_scan(zpci_aisb_iv, si, airq_iv_end(zpci_aisb_iv)); - if (si == -1UL) { - if (irqs_on++) - /* End of second scan with interrupts on. */ - break; - /* First scan complete, reenable interrupts. */ - if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC)) - break; - si = 0; - continue; - } - - /* Scan the adapter interrupt vector for this device. 
*/ - aibv = zpci_aibv[si]; - for (ai = 0;;) { - ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); - if (ai == -1UL) - break; - inc_irq_stat(IRQIO_MSI); - airq_iv_lock(aibv, ai); - generic_handle_irq(airq_iv_get_data(aibv, ai)); - airq_iv_unlock(aibv, ai); - } - } -} - -int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) -{ - struct zpci_dev *zdev = to_zpci(pdev); - unsigned int hwirq, msi_vecs; - unsigned long aisb; - struct msi_desc *msi; - struct msi_msg msg; - int rc, irq; - - zdev->aisb = -1UL; - if (type == PCI_CAP_ID_MSI && nvec > 1) - return 1; - msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); - - /* Allocate adapter summary indicator bit */ - aisb = airq_iv_alloc_bit(zpci_aisb_iv); - if (aisb == -1UL) - return -EIO; - zdev->aisb = aisb; - - /* Create adapter interrupt vector */ - zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); - if (!zdev->aibv) - return -ENOMEM; - - /* Wire up shortcut pointer */ - zpci_aibv[aisb] = zdev->aibv; - - /* Request MSI interrupts */ - hwirq = 0; - for_each_pci_msi_entry(msi, pdev) { - if (hwirq >= msi_vecs) - break; - irq = irq_alloc_desc(0); /* Alloc irq on node 0 */ - if (irq < 0) - return -ENOMEM; - rc = irq_set_msi_desc(irq, msi); - if (rc) - return rc; - irq_set_chip_and_handler(irq, &zpci_irq_chip, - handle_simple_irq); - msg.data = hwirq; - msg.address_lo = zdev->msi_addr & 0xffffffff; - msg.address_hi = zdev->msi_addr >> 32; - pci_write_msi_msg(irq, &msg); - airq_iv_set_data(zdev->aibv, hwirq, irq); - hwirq++; - } - - /* Enable adapter interrupts */ - rc = zpci_set_airq(zdev); - if (rc) - return rc; - - return (msi_vecs == nvec) ? 0 : msi_vecs; -} - -void arch_teardown_msi_irqs(struct pci_dev *pdev) -{ - struct zpci_dev *zdev = to_zpci(pdev); - struct msi_desc *msi; - int rc; - - /* Disable adapter interrupts */ - rc = zpci_clear_airq(zdev); - if (rc) - return; - - /* Release MSI interrupts */ - for_each_pci_msi_entry(msi, pdev) { - if (!msi->irq) - continue; - if (msi->msi_attrib.is_msix) - __pci_msix_desc_mask_irq(msi, 1); - else - __pci_msi_desc_mask_irq(msi, 1, 1); - irq_set_msi_desc(msi->irq, NULL); - irq_free_desc(msi->irq); - msi->msg.address_lo = 0; - msi->msg.address_hi = 0; - msi->msg.data = 0; - msi->irq = 0; - } - - if (zdev->aisb != -1UL) { - zpci_aibv[zdev->aisb] = NULL; - airq_iv_free_bit(zpci_aisb_iv, zdev->aisb); - zdev->aisb = -1UL; - } - if (zdev->aibv) { - airq_iv_release(zdev->aibv); - zdev->aibv = NULL; - } -} - #ifdef CONFIG_PCI_IOV static struct resource iov_res = { .name = "PCI IOV res", @@ -495,6 +412,7 @@ static struct resource iov_res = { static void zpci_map_resources(struct pci_dev *pdev) { + struct zpci_dev *zdev = to_zpci(pdev); resource_size_t len; int i; @@ -502,8 +420,13 @@ static void zpci_map_resources(struct pci_dev *pdev) len = pci_resource_len(pdev, i); if (!len) continue; - pdev->resource[i].start = - (resource_size_t __force) pci_iomap(pdev, i, 0); + + if (static_branch_likely(&have_mio)) + pdev->resource[i].start = + (resource_size_t __force) zdev->bars[i].mio_wb; + else + pdev->resource[i].start = + (resource_size_t __force) pci_iomap(pdev, i, 0); pdev->resource[i].end = pdev->resource[i].start + len - 1; } @@ -524,6 +447,9 @@ static void zpci_unmap_resources(struct pci_dev *pdev) resource_size_t len; int i; + if (static_branch_likely(&have_mio)) + return; + for (i = 0; i < PCI_BAR_COUNT; i++) { len = pci_resource_len(pdev, i); if (!len) @@ -533,41 +459,6 @@ static void zpci_unmap_resources(struct pci_dev *pdev) } } -static struct airq_struct zpci_airq = { - 
.handler = zpci_irq_handler, - .isc = PCI_ISC, -}; - -static int __init zpci_irq_init(void) -{ - int rc; - - rc = register_adapter_interrupt(&zpci_airq); - if (rc) - goto out; - /* Set summary to 1 to be called every time for the ISC. */ - *zpci_airq.lsi_ptr = 1; - - rc = -ENOMEM; - zpci_aisb_iv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); - if (!zpci_aisb_iv) - goto out_airq; - - zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, NULL, PCI_ISC); - return 0; - -out_airq: - unregister_adapter_interrupt(&zpci_airq); -out: - return rc; -} - -static void zpci_irq_exit(void) -{ - airq_iv_release(zpci_aisb_iv); - unregister_adapter_interrupt(&zpci_airq); -} - static int zpci_alloc_iomap(struct zpci_dev *zdev) { unsigned long entry; @@ -958,7 +849,9 @@ static void zpci_mem_exit(void) kmem_cache_destroy(zdev_fmb_cache); } -static unsigned int s390_pci_probe = 1; +static unsigned int s390_pci_probe __initdata = 1; +static unsigned int s390_pci_no_mio __initdata; +unsigned int s390_pci_force_floating __initdata; static unsigned int s390_pci_initialized; char * __init pcibios_setup(char *str) @@ -967,6 +860,14 @@ char * __init pcibios_setup(char *str) s390_pci_probe = 0; return NULL; } + if (!strcmp(str, "nomio")) { + s390_pci_no_mio = 1; + return NULL; + } + if (!strcmp(str, "force_floating")) { + s390_pci_force_floating = 1; + return NULL; + } return str; } @@ -985,6 +886,9 @@ static int __init pci_base_init(void) if (!test_facility(69) || !test_facility(71)) return 0; + if (test_facility(153) && !s390_pci_no_mio) + static_branch_enable(&have_mio); + rc = zpci_debug_init(); if (rc) goto out; diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index eeb7450db18c..3a36b07a5571 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -163,7 +163,14 @@ static int clp_store_query_pci_fn(struct zpci_dev *zdev, memcpy(zdev->util_str, response->util_str, sizeof(zdev->util_str)); } + zdev->mio_capable = response->mio_addr_avail; + for (i = 0; i < PCI_BAR_COUNT; i++) { + if (!(response->mio_valid & (1 << (PCI_BAR_COUNT - i - 1)))) + continue; + zdev->bars[i].mio_wb = (void __iomem *) response->addr[i].wb; + zdev->bars[i].mio_wt = (void __iomem *) response->addr[i].wt; + } return 0; } @@ -279,11 +286,18 @@ int clp_enable_fh(struct zpci_dev *zdev, u8 nr_dma_as) int rc; rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_PCI_FN); - if (!rc) - /* Success -> store enabled handle in zdev */ - zdev->fh = fh; + zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + if (rc) + goto out; - zpci_dbg(3, "ena fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc); + zdev->fh = fh; + if (zdev->mio_capable) { + rc = clp_set_pci_fn(&fh, nr_dma_as, CLP_SET_ENABLE_MIO); + zpci_dbg(3, "ena mio fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); + if (rc) + clp_disable_fh(zdev); + } +out: return rc; } @@ -296,11 +310,10 @@ int clp_disable_fh(struct zpci_dev *zdev) return 0; rc = clp_set_pci_fn(&fh, 0, CLP_SET_DISABLE_PCI_FN); + zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, fh, rc); if (!rc) - /* Success -> store disabled handle in zdev */ zdev->fh = fh; - zpci_dbg(3, "dis fid:%x, fh:%x, rc:%d\n", zdev->fid, zdev->fh, rc); return rc; } diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index f069929e8211..02f9505c99a8 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -8,9 +8,11 @@ #include <linux/export.h> #include <linux/errno.h> #include <linux/delay.h> +#include <linux/jump_label.h> #include <asm/facility.h> #include <asm/pci_insn.h> #include <asm/pci_debug.h> +#include 
<asm/pci_io.h> #include <asm/processor.h> #define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ @@ -96,13 +98,15 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } /* Set Interruption Controls */ -int zpci_set_irq_ctrl(u16 ctl, char *unused, u8 isc) +int __zpci_set_irq_ctrl(u16 ctl, u8 isc, union zpci_sic_iib *iib) { if (!test_facility(72)) return -EIO; - asm volatile ( - " .insn rsy,0xeb00000000d1,%[ctl],%[isc],%[u]\n" - : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [u] "Q" (*unused)); + + asm volatile( + ".insn rsy,0xeb00000000d1,%[ctl],%[isc],%[iib]\n" + : : [ctl] "d" (ctl), [isc] "d" (isc << 27), [iib] "Q" (*iib)); + return 0; } @@ -140,7 +144,7 @@ static inline int __pcilg(u64 *data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_load(u64 *data, u64 req, u64 offset) +int __zpci_load(u64 *data, u64 req, u64 offset) { u8 status; int cc; @@ -156,6 +160,52 @@ int zpci_load(u64 *data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } +EXPORT_SYMBOL_GPL(__zpci_load); + +static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, + unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + + return __zpci_load(data, req, ZPCI_OFFSET(addr)); +} + +static inline int __pcilg_mio(u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + register u64 addr asm("2") = ioaddr; + register u64 r3 asm("3") = len; + int cc = -ENXIO; + u64 __data; + + asm volatile ( + " .insn rre,0xb9d60000,%[data],%[ioaddr]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [data] "=d" (__data), "+d" (r3) + : [ioaddr] "d" (addr) + : "cc"); + *status = r3 >> 24 & 0xff; + *data = __data; + return cc; +} + +int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_load_fh(data, addr, len); + + cc = __pcilg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) addr); + + return (cc > 0) ? -EIO : cc; +} EXPORT_SYMBOL_GPL(zpci_load); /* PCI Store */ @@ -178,7 +228,7 @@ static inline int __pcistg(u64 data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_store(u64 data, u64 req, u64 offset) +int __zpci_store(u64 data, u64 req, u64 offset) { u8 status; int cc; @@ -194,6 +244,50 @@ int zpci_store(u64 data, u64 req, u64 offset) return (cc > 0) ? 
-EIO : cc; } +EXPORT_SYMBOL_GPL(__zpci_store); + +static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, + unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + + return __zpci_store(data, req, ZPCI_OFFSET(addr)); +} + +static inline int __pcistg_mio(u64 data, u64 ioaddr, u64 len, u8 *status) +{ + register u64 addr asm("2") = ioaddr; + register u64 r3 asm("3") = len; + int cc = -ENXIO; + + asm volatile ( + " .insn rre,0xb9d40000,%[data],%[ioaddr]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), "+d" (r3) + : [data] "d" (data), [ioaddr] "d" (addr) + : "cc"); + *status = r3 >> 24 & 0xff; + return cc; +} + +int zpci_store(const volatile void __iomem *addr, u64 data, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_store_fh(addr, data, len); + + cc = __pcistg_mio(data, (__force u64) addr, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) addr); + + return (cc > 0) ? -EIO : cc; +} EXPORT_SYMBOL_GPL(zpci_store); /* PCI Store Block */ @@ -214,7 +308,7 @@ static inline int __pcistb(const u64 *data, u64 req, u64 offset, u8 *status) return cc; } -int zpci_store_block(const u64 *data, u64 req, u64 offset) +int __zpci_store_block(const u64 *data, u64 req, u64 offset) { u8 status; int cc; @@ -230,4 +324,63 @@ int zpci_store_block(const u64 *data, u64 req, u64 offset) return (cc > 0) ? -EIO : cc; } -EXPORT_SYMBOL_GPL(zpci_store_block); +EXPORT_SYMBOL_GPL(__zpci_store_block); + +static inline int zpci_write_block_fh(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(dst)]; + u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 offset = ZPCI_OFFSET(dst); + + return __zpci_store_block(src, req, offset); +} + +static inline int __pcistb_mio(const u64 *data, u64 ioaddr, u64 len, u8 *status) +{ + int cc = -ENXIO; + + asm volatile ( + " .insn rsy,0xeb00000000d4,%[len],%[ioaddr],%[data]\n" + "0: ipm %[cc]\n" + " srl %[cc],28\n" + "1:\n" + EX_TABLE(0b, 1b) + : [cc] "+d" (cc), [len] "+d" (len) + : [ioaddr] "d" (ioaddr), [data] "Q" (*data) + : "cc"); + *status = len >> 24 & 0xff; + return cc; +} + +int zpci_write_block(volatile void __iomem *dst, + const void *src, unsigned long len) +{ + u8 status; + int cc; + + if (!static_branch_unlikely(&have_mio)) + return zpci_write_block_fh(dst, src, len); + + cc = __pcistb_mio(src, (__force u64) dst, len, &status); + if (cc) + zpci_err_insn(cc, status, 0, (__force u64) dst); + + return (cc > 0) ? 
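Note the signature change: the exported zpci_load()/zpci_store() now take the mapped I/O address plus access length, while the old request/offset form lives on as __zpci_load()/__zpci_store() for the config-space accessors above. A hedged sketch of how a 32-bit MMIO read can sit on top of the new primitive (the wrapper name and error convention are illustrative, not from the patch):

#include <linux/types.h>

/* from the patch: dispatches to PCILG or its MIO form internally */
int zpci_load(u64 *data, const volatile void __iomem *addr, unsigned long len);

static u32 zpci_read32(const volatile void __iomem *addr)
{
	u64 data;

	if (zpci_load(&data, addr, 4))
		data = 0xffffffffUL;	/* error: read back as all-ones */
	return data;
}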
-EIO : cc; +} +EXPORT_SYMBOL_GPL(zpci_write_block); + +static inline void __pciwb_mio(void) +{ + unsigned long unused = 0; + + asm volatile (".insn rre,0xb9d50000,%[op],%[op]\n" + : [op] "+d" (unused)); +} + +void zpci_barrier(void) +{ + if (static_branch_likely(&have_mio)) + __pciwb_mio(); +} +EXPORT_SYMBOL_GPL(zpci_barrier); diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c new file mode 100644 index 000000000000..d80616ae8dd8 --- /dev/null +++ b/arch/s390/pci/pci_irq.c @@ -0,0 +1,486 @@ +// SPDX-License-Identifier: GPL-2.0 +#define KMSG_COMPONENT "zpci" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/irq.h> +#include <linux/kernel_stat.h> +#include <linux/pci.h> +#include <linux/msi.h> +#include <linux/smp.h> + +#include <asm/isc.h> +#include <asm/airq.h> + +static enum {FLOATING, DIRECTED} irq_delivery; + +#define SIC_IRQ_MODE_ALL 0 +#define SIC_IRQ_MODE_SINGLE 1 +#define SIC_IRQ_MODE_DIRECT 4 +#define SIC_IRQ_MODE_D_ALL 16 +#define SIC_IRQ_MODE_D_SINGLE 17 +#define SIC_IRQ_MODE_SET_CPU 18 + +/* + * summary bit vector + * FLOATING - summary bit per function + * DIRECTED - summary bit per cpu (only used in fallback path) + */ +static struct airq_iv *zpci_sbv; + +/* + * interrupt bit vectors + * FLOATING - interrupt bit vector per function + * DIRECTED - interrupt bit vector per cpu + */ +static struct airq_iv **zpci_ibv; + +/* Modify PCI: Register adapter interruptions */ +static int zpci_set_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT); + struct zpci_fib fib = {0}; + u8 status; + + fib.fmt0.isc = PCI_ISC; + fib.fmt0.sum = 1; /* enable summary notifications */ + fib.fmt0.noi = airq_iv_end(zdev->aibv); + fib.fmt0.aibv = (unsigned long) zdev->aibv->vector; + fib.fmt0.aibvo = 0; /* each zdev has its own interrupt vector */ + fib.fmt0.aisb = (unsigned long) zpci_sbv->vector + (zdev->aisb/64)*8; + fib.fmt0.aisbo = zdev->aisb & 63; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister adapter interruptions */ +static int zpci_clear_airq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT); + struct zpci_fib fib = {0}; + u8 cc, status; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? -EIO : 0; +} + +/* Modify PCI: Register CPU directed interruptions */ +static int zpci_set_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_REG_INT_D); + struct zpci_fib fib = {0}; + u8 status; + + fib.fmt = 1; + fib.fmt1.noi = zdev->msi_nr_irqs; + fib.fmt1.dibvo = zdev->msi_first_bit; + + return zpci_mod_fc(req, &fib, &status) ? -EIO : 0; +} + +/* Modify PCI: Unregister CPU directed interruptions */ +static int zpci_clear_directed_irq(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_DEREG_INT_D); + struct zpci_fib fib = {0}; + u8 cc, status; + + fib.fmt = 1; + cc = zpci_mod_fc(req, &fib, &status); + if (cc == 3 || (cc == 1 && status == 24)) + /* Function already gone or IRQs already deregistered. */ + cc = 0; + + return cc ? 
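zpci_barrier() above is deliberately asymmetric: with MIO, stores become posted and PCIWB drains them; on the legacy path PCISTG already completes stores in order, so the barrier folds to nothing. One plausible use, sketched with a hypothetical device lock and register offset:

#include <linux/io.h>
#include <linux/spinlock.h>

void zpci_barrier(void);	/* from the patch */

static void push_ctl(void __iomem *base, spinlock_t *lock, u32 val)
{
	spin_lock(lock);
	writel(val, base + 0x10);	/* hypothetical control register */
	zpci_barrier();			/* drain posted MIO writes before unlock */
	spin_unlock(lock);
}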
-EIO : 0; +} + +static int zpci_set_irq_affinity(struct irq_data *data, const struct cpumask *dest, + bool force) +{ + struct msi_desc *entry = irq_get_msi_desc(data->irq); + struct msi_msg msg = entry->msg; + + msg.address_lo &= 0xff0000ff; + msg.address_lo |= (cpumask_first(dest) << 8); + pci_write_msi_msg(data->irq, &msg); + + return IRQ_SET_MASK_OK; +} + +static struct irq_chip zpci_irq_chip = { + .name = "PCI-MSI", + .irq_unmask = pci_msi_unmask_irq, + .irq_mask = pci_msi_mask_irq, + .irq_set_affinity = zpci_set_irq_affinity, +}; + +static void zpci_handle_cpu_local_irq(bool rescan) +{ + struct airq_iv *dibv = zpci_ibv[smp_processor_id()]; + unsigned long bit; + int irqs_on = 0; + + for (bit = 0;;) { + /* Scan the directed IRQ bit vector */ + bit = airq_iv_scan(dibv, bit, airq_iv_end(dibv)); + if (bit == -1UL) { + if (!rescan || irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC)) + break; + bit = 0; + continue; + } + inc_irq_stat(IRQIO_MSI); + generic_handle_irq(airq_iv_get_data(dibv, bit)); + } +} + +struct cpu_irq_data { + call_single_data_t csd; + atomic_t scheduled; +}; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_irq_data, irq_data); + +static void zpci_handle_remote_irq(void *data) +{ + atomic_t *scheduled = data; + + do { + zpci_handle_cpu_local_irq(false); + } while (atomic_dec_return(scheduled)); +} + +static void zpci_handle_fallback_irq(void) +{ + struct cpu_irq_data *cpu_data; + unsigned long cpu; + int irqs_on = 0; + + for (cpu = 0;;) { + cpu = airq_iv_scan(zpci_sbv, cpu, airq_iv_end(zpci_sbv)); + if (cpu == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + break; + cpu = 0; + continue; + } + cpu_data = &per_cpu(irq_data, cpu); + if (atomic_inc_return(&cpu_data->scheduled) > 1) + continue; + + cpu_data->csd.func = zpci_handle_remote_irq; + cpu_data->csd.info = &cpu_data->scheduled; + cpu_data->csd.flags = 0; + smp_call_function_single_async(cpu, &cpu_data->csd); + } +} + +static void zpci_directed_irq_handler(struct airq_struct *airq, bool floating) +{ + if (floating) { + inc_irq_stat(IRQIO_PCF); + zpci_handle_fallback_irq(); + } else { + inc_irq_stat(IRQIO_PCD); + zpci_handle_cpu_local_irq(true); + } +} + +static void zpci_floating_irq_handler(struct airq_struct *airq, bool floating) +{ + unsigned long si, ai; + struct airq_iv *aibv; + int irqs_on = 0; + + inc_irq_stat(IRQIO_PCF); + for (si = 0;;) { + /* Scan adapter summary indicator bit vector */ + si = airq_iv_scan(zpci_sbv, si, airq_iv_end(zpci_sbv)); + if (si == -1UL) { + if (irqs_on++) + /* End of second scan with interrupts on. */ + break; + /* First scan complete, reenable interrupts. */ + if (zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC)) + break; + si = 0; + continue; + } + + /* Scan the adapter interrupt vector for this device. 
*/ + aibv = zpci_ibv[si]; + for (ai = 0;;) { + ai = airq_iv_scan(aibv, ai, airq_iv_end(aibv)); + if (ai == -1UL) + break; + inc_irq_stat(IRQIO_MSI); + airq_iv_lock(aibv, ai); + generic_handle_irq(airq_iv_get_data(aibv, ai)); + airq_iv_unlock(aibv, ai); + } + } +} + +int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct zpci_dev *zdev = to_zpci(pdev); + unsigned int hwirq, msi_vecs, cpu; + unsigned long bit; + struct msi_desc *msi; + struct msi_msg msg; + int rc, irq; + + zdev->aisb = -1UL; + zdev->msi_first_bit = -1U; + if (type == PCI_CAP_ID_MSI && nvec > 1) + return 1; + msi_vecs = min_t(unsigned int, nvec, zdev->max_msi); + + if (irq_delivery == DIRECTED) { + /* Allocate cpu vector bits */ + bit = airq_iv_alloc(zpci_ibv[0], msi_vecs); + if (bit == -1UL) + return -EIO; + } else { + /* Allocate adapter summary indicator bit */ + bit = airq_iv_alloc_bit(zpci_sbv); + if (bit == -1UL) + return -EIO; + zdev->aisb = bit; + + /* Create adapter interrupt vector */ + zdev->aibv = airq_iv_create(msi_vecs, AIRQ_IV_DATA | AIRQ_IV_BITLOCK); + if (!zdev->aibv) + return -ENOMEM; + + /* Wire up shortcut pointer */ + zpci_ibv[bit] = zdev->aibv; + /* Each function has its own interrupt vector */ + bit = 0; + } + + /* Request MSI interrupts */ + hwirq = bit; + for_each_pci_msi_entry(msi, pdev) { + rc = -EIO; + if (hwirq - bit >= msi_vecs) + break; + irq = __irq_alloc_descs(-1, 0, 1, 0, THIS_MODULE, msi->affinity); + if (irq < 0) + return -ENOMEM; + rc = irq_set_msi_desc(irq, msi); + if (rc) + return rc; + irq_set_chip_and_handler(irq, &zpci_irq_chip, + handle_percpu_irq); + msg.data = hwirq; + if (irq_delivery == DIRECTED) { + msg.address_lo = zdev->msi_addr & 0xff0000ff; + msg.address_lo |= msi->affinity ? + (cpumask_first(&msi->affinity->mask) << 8) : 0; + for_each_possible_cpu(cpu) { + airq_iv_set_data(zpci_ibv[cpu], hwirq, irq); + } + } else { + msg.address_lo = zdev->msi_addr & 0xffffffff; + airq_iv_set_data(zdev->aibv, hwirq, irq); + } + msg.address_hi = zdev->msi_addr >> 32; + pci_write_msi_msg(irq, &msg); + hwirq++; + } + + zdev->msi_first_bit = bit; + zdev->msi_nr_irqs = msi_vecs; + + if (irq_delivery == DIRECTED) + rc = zpci_set_directed_irq(zdev); + else + rc = zpci_set_airq(zdev); + if (rc) + return rc; + + return (msi_vecs == nvec) ? 
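In directed mode the target CPU rides in the MSI address itself: the 0xff0000ff mask used by zpci_set_irq_affinity() and arch_setup_msi_irqs() above clears bits 8-23, and the CPU number is shifted into that field, so retargeting an interrupt is just pci_write_msi_msg() with a patched address_lo. The encoding, distilled (the macro name is made up here):

#include <linux/types.h>

#define MSI_ADDR_CPU_MASK	0x00ffff00U	/* illustrative name */

static u32 msi_addr_set_cpu(u32 address_lo, unsigned int cpu)
{
	address_lo &= ~MSI_ADDR_CPU_MASK;	/* same as &= 0xff0000ff */
	return address_lo | (cpu << 8);
}

static unsigned int msi_addr_get_cpu(u32 address_lo)
{
	return (address_lo & MSI_ADDR_CPU_MASK) >> 8;
}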
0 : msi_vecs; +} + +void arch_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + struct msi_desc *msi; + int rc; + + /* Disable interrupts */ + if (irq_delivery == DIRECTED) + rc = zpci_clear_directed_irq(zdev); + else + rc = zpci_clear_airq(zdev); + if (rc) + return; + + /* Release MSI interrupts */ + for_each_pci_msi_entry(msi, pdev) { + if (!msi->irq) + continue; + if (msi->msi_attrib.is_msix) + __pci_msix_desc_mask_irq(msi, 1); + else + __pci_msi_desc_mask_irq(msi, 1, 1); + irq_set_msi_desc(msi->irq, NULL); + irq_free_desc(msi->irq); + msi->msg.address_lo = 0; + msi->msg.address_hi = 0; + msi->msg.data = 0; + msi->irq = 0; + } + + if (zdev->aisb != -1UL) { + zpci_ibv[zdev->aisb] = NULL; + airq_iv_free_bit(zpci_sbv, zdev->aisb); + zdev->aisb = -1UL; + } + if (zdev->aibv) { + airq_iv_release(zdev->aibv); + zdev->aibv = NULL; + } + + if ((irq_delivery == DIRECTED) && zdev->msi_first_bit != -1U) + airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); +} + +static struct airq_struct zpci_airq = { + .handler = zpci_floating_irq_handler, + .isc = PCI_ISC, +}; + +static void __init cpu_enable_directed_irq(void *unused) +{ + union zpci_sic_iib iib = {{0}}; + + iib.cdiib.dibv_addr = (u64) zpci_ibv[smp_processor_id()]->vector; + + __zpci_set_irq_ctrl(SIC_IRQ_MODE_SET_CPU, 0, &iib); + zpci_set_irq_ctrl(SIC_IRQ_MODE_D_SINGLE, PCI_ISC); +} + +static int __init zpci_directed_irq_init(void) +{ + union zpci_sic_iib iib = {{0}}; + unsigned int cpu; + + zpci_sbv = airq_iv_create(num_possible_cpus(), 0); + if (!zpci_sbv) + return -ENOMEM; + + iib.diib.isc = PCI_ISC; + iib.diib.nr_cpus = num_possible_cpus(); + iib.diib.disb_addr = (u64) zpci_sbv->vector; + __zpci_set_irq_ctrl(SIC_IRQ_MODE_DIRECT, 0, &iib); + + zpci_ibv = kcalloc(num_possible_cpus(), sizeof(*zpci_ibv), + GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + /* + * Per CPU IRQ vectors look the same but bit-allocation + * is only done on the first vector. + */ + zpci_ibv[cpu] = airq_iv_create(cache_line_size() * BITS_PER_BYTE, + AIRQ_IV_DATA | + AIRQ_IV_CACHELINE | + (!cpu ? AIRQ_IV_ALLOC : 0)); + if (!zpci_ibv[cpu]) + return -ENOMEM; + } + on_each_cpu(cpu_enable_directed_irq, NULL, 1); + + zpci_irq_chip.irq_set_affinity = zpci_set_irq_affinity; + + return 0; +} + +static int __init zpci_floating_irq_init(void) +{ + zpci_ibv = kcalloc(ZPCI_NR_DEVICES, sizeof(*zpci_ibv), GFP_KERNEL); + if (!zpci_ibv) + return -ENOMEM; + + zpci_sbv = airq_iv_create(ZPCI_NR_DEVICES, AIRQ_IV_ALLOC); + if (!zpci_sbv) + goto out_free; + + return 0; + +out_free: + kfree(zpci_ibv); + return -ENOMEM; +} + +int __init zpci_irq_init(void) +{ + int rc; + + irq_delivery = sclp.has_dirq ? DIRECTED : FLOATING; + if (s390_pci_force_floating) + irq_delivery = FLOATING; + + if (irq_delivery == DIRECTED) + zpci_airq.handler = zpci_directed_irq_handler; + + rc = register_adapter_interrupt(&zpci_airq); + if (rc) + goto out; + /* Set summary to 1 to be called every time for the ISC. */ + *zpci_airq.lsi_ptr = 1; + + switch (irq_delivery) { + case FLOATING: + rc = zpci_floating_irq_init(); + break; + case DIRECTED: + rc = zpci_directed_irq_init(); + break; + } + + if (rc) + goto out_airq; + + /* + * Enable floating IRQs (with suppression after one IRQ). When using + * directed IRQs this enables the fallback path. 
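When a directed interrupt has to fall back to floating delivery (the target CPU was not polling), zpci_handle_fallback_irq() above forwards it with an async IPI and uses an atomic counter so at most one kick per CPU is ever in flight. The idiom, distilled into a sketch (the real remote handler drains the per-CPU interrupt bit vector):

#include <linux/atomic.h>
#include <linux/smp.h>

struct kick {
	call_single_data_t csd;
	atomic_t scheduled;
};

static void remote_work(void *info)
{
	atomic_t *scheduled = info;

	do {
		/* drain this CPU's interrupt bit vector here */
	} while (atomic_dec_return(scheduled));
}

static void kick_cpu(int cpu, struct kick *k)
{
	if (atomic_inc_return(&k->scheduled) > 1)
		return;	/* a kick is already pending; its loop catches this one */

	k->csd.func = remote_work;
	k->csd.info = &k->scheduled;
	k->csd.flags = 0;
	smp_call_function_single_async(cpu, &k->csd);
}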
+ */ + zpci_set_irq_ctrl(SIC_IRQ_MODE_SINGLE, PCI_ISC); + + return 0; +out_airq: + unregister_adapter_interrupt(&zpci_airq); +out: + return rc; +} + +void __init zpci_irq_exit(void) +{ + unsigned int cpu; + + if (irq_delivery == DIRECTED) { + for_each_possible_cpu(cpu) { + airq_iv_release(zpci_ibv[cpu]); + } + } + kfree(zpci_ibv); + if (zpci_sbv) + airq_iv_release(zpci_sbv); + unregister_adapter_interrupt(&zpci_airq); +} diff --git a/arch/s390/purgatory/Makefile b/arch/s390/purgatory/Makefile index ce6a3f75065b..dc1ae4ff79d7 100644 --- a/arch/s390/purgatory/Makefile +++ b/arch/s390/purgatory/Makefile @@ -4,7 +4,7 @@ OBJECT_FILES_NON_STANDARD := y purgatory-y := head.o purgatory.o string.o sha256.o mem.o -targets += $(purgatory-y) purgatory.ro kexec-purgatory.c +targets += $(purgatory-y) purgatory.lds purgatory purgatory.ro PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) $(obj)/sha256.o: $(srctree)/lib/sha256.c FORCE @@ -16,22 +16,26 @@ $(obj)/mem.o: $(srctree)/arch/s390/lib/mem.S FORCE $(obj)/string.o: $(srctree)/arch/s390/lib/string.c FORCE $(call if_changed_rule,cc_o_c) -LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -LDFLAGS_purgatory.ro += -z nodefaultlib KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes KBUILD_CFLAGS += -Wno-pointer-sign -Wno-sign-compare KBUILD_CFLAGS += -fno-zero-initialized-in-bss -fno-builtin -ffreestanding KBUILD_CFLAGS += -c -MD -Os -m64 -msoft-float -fno-common +KBUILD_CFLAGS += $(CLANG_FLAGS) KBUILD_CFLAGS += $(call cc-option,-fno-PIE) KBUILD_AFLAGS := $(filter-out -DCC_USING_EXPOLINE,$(KBUILD_AFLAGS)) -$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE +LDFLAGS_purgatory := -r --no-undefined -nostdlib -z nodefaultlib -T +$(obj)/purgatory: $(obj)/purgatory.lds $(PURGATORY_OBJS) FORCE $(call if_changed,ld) -quiet_cmd_bin2c = BIN2C $@ - cmd_bin2c = $(objtree)/scripts/bin2c kexec_purgatory < $< > $@ +OBJCOPYFLAGS_purgatory.ro := -O elf64-s390 +OBJCOPYFLAGS_purgatory.ro += --remove-section='*debug*' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.comment' +OBJCOPYFLAGS_purgatory.ro += --remove-section='.note.*' +$(obj)/purgatory.ro: $(obj)/purgatory FORCE + $(call if_changed,objcopy) -$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE - $(call if_changed,bin2c) +$(obj)/kexec-purgatory.o: $(obj)/kexec-purgatory.S $(obj)/purgatory.ro FORCE + $(call if_changed_rule,as_o_S) obj-$(CONFIG_ARCH_HAS_KEXEC_PURGATORY) += kexec-purgatory.o diff --git a/arch/s390/purgatory/kexec-purgatory.S b/arch/s390/purgatory/kexec-purgatory.S new file mode 100644 index 000000000000..8293753100ae --- /dev/null +++ b/arch/s390/purgatory/kexec-purgatory.S @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + + .section .rodata, "a" + + .align 8 +kexec_purgatory: + .globl kexec_purgatory + .incbin "arch/s390/purgatory/purgatory.ro" +.Lkexec_purgatory_end: + + .align 8 +kexec_purgatory_size: + .globl kexec_purgatory_size + .quad .Lkexec_purgatory_end - kexec_purgatory diff --git a/arch/s390/purgatory/purgatory.lds.S b/arch/s390/purgatory/purgatory.lds.S new file mode 100644 index 000000000000..482eb4fbcef1 --- /dev/null +++ b/arch/s390/purgatory/purgatory.lds.S @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm-generic/vmlinux.lds.h> + +OUTPUT_FORMAT("elf64-s390", "elf64-s390", "elf64-s390") +OUTPUT_ARCH(s390:64-bit) + +ENTRY(purgatory_start) + +SECTIONS +{ + . = 0; + .head.text : { + _head = . ; + HEAD_TEXT + _ehead = . ; + } + .text : { + _text = .; /* Text */ + *(.text) + *(.text.*) + _etext = . 
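With bin2c gone, the assembler pulls the purgatory blob in via .incbin and exports it as two plain symbols, while objcopy strips debug, comment and note sections from the embedded image first. The consuming side keeps the same view of those symbols; roughly, and from memory (the declarations live in the kexec_file loader, not in this patch):

#include <linux/types.h>

/* sketch of how the embedded blob is consumed by kexec_file */
extern const unsigned char kexec_purgatory[];
extern size_t kexec_purgatory_size;

static const void *purgatory_image(size_t *len)
{
	*len = kexec_purgatory_size;
	return kexec_purgatory;
}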
; + } + .rodata : { + _rodata = . ; + *(.rodata) /* read-only data */ + *(.rodata.*) + _erodata = . ; + } + .data : { + _data = . ; + *(.data) + *(.data.*) + _edata = . ; + } + + . = ALIGN(256); + .bss : { + _bss = . ; + *(.bss) + *(.bss.*) + *(COMMON) + . = ALIGN(8); /* For convenience during zeroing */ + _ebss = .; + } + _end = .; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.eh_frame) + *(*__ksymtab*) + *(___kcrctab*) + } +} diff --git a/arch/s390/scripts/Makefile.chkbss b/arch/s390/scripts/Makefile.chkbss index cd7e8f4419f5..884a9caff5fb 100644 --- a/arch/s390/scripts/Makefile.chkbss +++ b/arch/s390/scripts/Makefile.chkbss @@ -11,7 +11,8 @@ chkbss: $(addprefix $(obj)/, $(chkbss-files)) quiet_cmd_chkbss = CHKBSS $< cmd_chkbss = \ - if ! $(OBJDUMP) -j .bss -w -h $< | awk 'END { if ($$3) exit 1 }'; then \ + if $(OBJDUMP) -h $< | grep -q "\.bss" && \ + ! $(OBJDUMP) -j .bss -w -h $< | awk 'END { if ($$3) exit 1 }'; then \ echo "error: $< .bss section is not empty" >&2; exit 1; \ fi; \ touch $@; diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt index 1cbed82cd17b..64638b764d1c 100644 --- a/arch/s390/tools/opcodes.txt +++ b/arch/s390/tools/opcodes.txt @@ -1,3 +1,5 @@ +0000 illegal E +0002 brkpt E 0101 pr E 0102 upt E 0104 ptff E @@ -257,6 +259,7 @@ b258 bsg RRE_RR b25a bsa RRE_RR b25d clst RRE_RR b25e srst RRE_RR +b25f chsc RRE_R0 b263 cmpsc RRE_RR b274 siga S_RD b276 xsch S_00 @@ -277,6 +280,9 @@ b29d lfpc S_RD b2a5 tre RRE_RR b2a6 cu21 RRF_U0RR b2a7 cu12 RRF_U0RR +b2ad nqap RRE_RR +b2ae dqap RRE_RR +b2af pqap RRE_RR b2b0 stfle S_RD b2b1 stfl S_RD b2b2 lpswe S_RD @@ -290,6 +296,7 @@ b2e5 epctr RRE_RR b2e8 ppa RRF_U0RR b2ec etnd RRE_R0 b2ed ecpga RRE_RR +b2f0 iucv RRE_RR b2f8 tend S_00 b2fa niai IE_UU b2fc tabort S_RD @@ -559,12 +566,15 @@ b998 alcr RRE_RR b999 slbr RRE_RR b99a epair RRE_R0 b99b esair RRE_R0 +b99c eqbs RRF_U0RR b99d esea RRE_R0 b99e pti RRE_RR b99f ssair RRE_R0 +b9a0 clp RRF_U0RR b9a1 tpei RRE_RR b9a2 ptf RRE_R0 b9aa lptea RRF_RURR2 +b9ab essa RRF_U0RR b9ac irbm RRE_RR b9ae rrbm RRE_RR b9af pfmf RRE_RR @@ -1039,6 +1049,7 @@ eb7a agsi SIY_IRD eb7e algsi SIY_IRD eb80 icmh RSY_RURD eb81 icmy RSY_RURD +eb8a sqbs RSY_RDRU eb8e mvclu RSY_RRRD eb8f clclu RSY_RRRD eb90 stmy RSY_RRRD diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b1c91ea9a958..0be08d586d40 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -90,12 +90,6 @@ config ARCH_DEFCONFIG default "arch/sh/configs/shx3_defconfig" if SUPERH32 default "arch/sh/configs/cayman_defconfig" if SUPERH64 -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config GENERIC_BUG def_bool y depends on BUG && SUPERH32 diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index 34e5414c5563..f402aa741bf3 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -806,7 +806,6 @@ static struct spi_board_info spi_bus[] = { .platform_data = &mmc_spi_info, .max_speed_hz = 5000000, .mode = SPI_MODE_0, - .controller_data = (void *) GPIO_PTM4, }, }; @@ -838,6 +837,14 @@ static struct platform_device msiof0_device = { .resource = msiof0_resources, }; +static struct gpiod_lookup_table msiof_gpio_table = { + .dev_id = "spi_sh_msiof.0", + .table = { + GPIO_LOOKUP("sh7724_pfc", GPIO_PTM4, "cs", GPIO_ACTIVE_HIGH), + { }, + }, +}; + #endif /* FSI */ @@ -1296,12 +1303,11 @@ static int __init arch_setup(void) gpio_request(GPIO_FN_MSIOF0_TXD, NULL); gpio_request(GPIO_FN_MSIOF0_RXD, NULL); 
gpio_request(GPIO_FN_MSIOF0_TSCK, NULL); - gpio_request(GPIO_PTM4, NULL); /* software CS control of TSYNC pin */ - gpio_direction_output(GPIO_PTM4, 1); /* active low CS */ gpio_request(GPIO_PTB6, NULL); /* 3.3V power control */ gpio_direction_output(GPIO_PTB6, 0); /* disable power by default */ gpiod_add_lookup_table(&mmc_spi_gpio_table); + gpiod_add_lookup_table(&msiof_gpio_table); spi_register_board_info(spi_bus, ARRAY_SIZE(spi_bus)); #endif diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index 7bf2cb680d32..73fff39a0122 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -17,7 +17,6 @@ generic-y += mm-arch-hooks.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h -generic-y += rwsem.h generic-y += serial.h generic-y += sizes.h generic-y += trace_clock.h diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 4f7f235f15f8..c28e37a344ad 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -229,9 +229,6 @@ __BUILD_IOPORT_STRING(q, u64) #define IO_SPACE_LIMIT 0xffffffff -/* synco on SH-4A, otherwise a nop */ -#define mmiowb() wmb() - /* We really want to try and get these to memcpy etc */ void memcpy_fromio(void *, const volatile void __iomem *, unsigned long); void memcpy_toio(volatile void __iomem *, const void *, unsigned long); diff --git a/arch/sh/include/asm/mmiowb.h b/arch/sh/include/asm/mmiowb.h new file mode 100644 index 000000000000..535d59735f1d --- /dev/null +++ b/arch/sh/include/asm/mmiowb.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_SH_MMIOWB_H +#define __ASM_SH_MMIOWB_H + +#include <asm/barrier.h> + +/* synco on SH-4A, otherwise a nop */ +#define mmiowb() wmb() + +#include <asm-generic/mmiowb.h> + +#endif /* __ASM_SH_MMIOWB_H */ diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 8ad73cb31121..b56f908b1395 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -70,6 +70,15 @@ do { \ tlb_remove_page((tlb), (pte)); \ } while (0) +#if CONFIG_PGTABLE_LEVELS > 2 +#define __pmd_free_tlb(tlb, pmdp, addr) \ +do { \ + struct page *page = virt_to_page(pmdp); \ + pgtable_pmd_page_dtor(page); \ + tlb_remove_page((tlb), page); \ +} while (0); +#endif + static inline void check_pgt_cache(void) { quicklist_trim(QUICK_PT, NULL, 25, 16); diff --git a/arch/sh/include/asm/spinlock-llsc.h b/arch/sh/include/asm/spinlock-llsc.h index 786ee0fde3b0..7fd929cd2e7a 100644 --- a/arch/sh/include/asm/spinlock-llsc.h +++ b/arch/sh/include/asm/spinlock-llsc.h @@ -47,6 +47,8 @@ static inline void arch_spin_unlock(arch_spinlock_t *lock) { unsigned long tmp; + /* This could be optimised with ARCH_HAS_MMIOWB */ + mmiowb(); __asm__ __volatile__ ( "mov #1, %0 ! arch_spin_unlock \n\t" "mov.l %0, @%1 \n\t" diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 77abe192fb43..bc77f3dd4261 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -11,133 +11,8 @@ #ifdef CONFIG_MMU #include <linux/swap.h> -#include <asm/pgalloc.h> -#include <asm/tlbflush.h> -#include <asm/mmu_context.h> -/* - * TLB handling. This allows us to remove pages from the page - * tables, and efficiently handle the TLB issues. 
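The mmiowb() definition moves out of io.h into its own asm/mmiowb.h above so the locking code can issue it: arch_spin_unlock() now calls mmiowb() itself (its new comment notes ARCH_HAS_MMIOWB would let the barrier be elided when no MMIO write is pending). Drivers lose the manual barrier; before and after, as a sketch with a hypothetical register and lock:

#include <linux/io.h>
#include <linux/spinlock.h>

static void poke_old(void __iomem *regs, spinlock_t *lock, u32 val)
{
	spin_lock(lock);
	writel(val, regs);
	mmiowb();		/* ordering vs. unlock was the driver's job */
	spin_unlock(lock);
}

static void poke_new(void __iomem *regs, spinlock_t *lock, u32 val)
{
	spin_lock(lock);
	writel(val, regs);
	spin_unlock(lock);	/* unlock now orders the MMIO write */
}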
- */ -struct mmu_gather { - struct mm_struct *mm; - unsigned int fullmm; - unsigned long start, end; -}; - -static inline void init_tlb_gather(struct mmu_gather *tlb) -{ - tlb->start = TASK_SIZE; - tlb->end = 0; - - if (tlb->fullmm) { - tlb->start = 0; - tlb->end = TASK_SIZE; - } -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - - init_tlb_gather(tlb); -} - -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (tlb->fullmm || force) - flush_tlb_mm(tlb->mm); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -static inline void -tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) -{ - if (tlb->start > address) - tlb->start = address; - if (tlb->end < address + PAGE_SIZE) - tlb->end = address + PAGE_SIZE; -} - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -/* - * In the case of tlb vma handling, we can optimise these away in the - * case where we're doing a full MM flush. When we're doing a munmap, - * the vmas are adjusted to only cover the region to be torn down. - */ -static inline void -tlb_start_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm) - flush_cache_range(vma, vma->vm_start, vma->vm_end); -} - -static inline void -tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) -{ - if (!tlb->fullmm && tlb->end) { - flush_tlb_range(vma, tlb->start, tlb->end); - init_tlb_gather(tlb); - } -} - -static inline void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ -} - -static inline void tlb_flush_mmu_free(struct mmu_gather *tlb) -{ -} - -static inline void tlb_flush_mmu(struct mmu_gather *tlb) -{ -} - -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - __tlb_remove_page(tlb, page); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) -#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) -#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) - -#define tlb_migrate_finish(mm) do { } while (0) +#include <asm-generic/tlb.h> #if defined(CONFIG_CPU_SH4) || defined(CONFIG_SUPERH64) extern void tlb_wire_entry(struct vm_area_struct *, unsigned long, pte_t); @@ -157,11 +32,6 @@ static inline void tlb_unwire_entry(void) #else /* CONFIG_MMU */ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, pte, address) do { } while (0) -#define tlb_flush(tlb) do { } while (0) - #include <asm-generic/tlb.h> #endif /* CONFIG_MMU */ diff --git a/arch/sh/kernel/stacktrace.c b/arch/sh/kernel/stacktrace.c index 
f3cb2cccb262..2950b19ad077 100644 --- a/arch/sh/kernel/stacktrace.c +++ b/arch/sh/kernel/stacktrace.c @@ -49,8 +49,6 @@ void save_stack_trace(struct stack_trace *trace) unsigned long *sp = (unsigned long *)current_stack_pointer; unwind_stack(current, NULL, sp, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace); @@ -84,7 +82,5 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) unsigned long *sp = (unsigned long *)tsk->thread.sp; unwind_stack(current, NULL, sp, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index bfda678576e4..480b057556ee 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -426,3 +426,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 40f8f4f73fe8..f6421c9ce5d3 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -63,6 +63,7 @@ config SPARC64 select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP + select HAVE_RCU_TABLE_NO_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_MEMBLOCK_NODE_MAP select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE @@ -191,14 +192,6 @@ config NR_CPUS source "kernel/Kconfig.hz" -config RWSEM_GENERIC_SPINLOCK - bool - default y if SPARC32 - -config RWSEM_XCHGADD_ALGORITHM - bool - default y if SPARC64 - config GENERIC_HWEIGHT bool default y diff --git a/arch/sparc/crypto/des_glue.c b/arch/sparc/crypto/des_glue.c index 4884315daff4..453a4cf5492a 100644 --- a/arch/sparc/crypto/des_glue.c +++ b/arch/sparc/crypto/des_glue.c @@ -201,18 +201,15 @@ static int des3_ede_set_key(struct crypto_tfm *tfm, const u8 *key, unsigned int keylen) { struct des3_ede_sparc64_ctx *dctx = crypto_tfm_ctx(tfm); - const u32 *K = (const u32 *)key; u32 *flags = &tfm->crt_flags; u64 k1[DES_EXPKEY_WORDS / 2]; u64 k2[DES_EXPKEY_WORDS / 2]; u64 k3[DES_EXPKEY_WORDS / 2]; + int err; - if (unlikely(!((K[0] ^ K[2]) | (K[1] ^ K[3])) || - !((K[2] ^ K[4]) | (K[3] ^ K[5]))) && - (*flags & CRYPTO_TFM_REQ_FORBID_WEAK_KEYS)) { - *flags |= CRYPTO_TFM_RES_WEAK_KEY; - return -EINVAL; - } + err = __des3_verify_key(flags, key); + if (unlikely(err)) + return err; des_sparc64_key_expand((const u32 *)key, k1); key += DES_KEY_SIZE; diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index a22cfd5c0ee8..95c44380b1d6 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -15,10 +15,10 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += msi.h generic-y += preempt.h -generic-y += rwsem.h generic-y += serial.h generic-y += trace_clock.h generic-y += word-at-a-time.h diff --git a/arch/sparc/include/asm/io_64.h b/arch/sparc/include/asm/io_64.h index b162c23ae8c2..688911051b44 100644 --- a/arch/sparc/include/asm/io_64.h +++ b/arch/sparc/include/asm/io_64.h @@ -396,8 +396,6 
@@ static inline void memcpy_toio(volatile void __iomem *dst, const void *src, } } -#define mmiowb() - #ifdef __KERNEL__ /* On sparc64 we have the whole physical IO address space accessible diff --git a/arch/sparc/include/asm/tlb_32.h b/arch/sparc/include/asm/tlb_32.h index 343cea19e573..5cd28a8793e3 100644 --- a/arch/sparc/include/asm/tlb_32.h +++ b/arch/sparc/include/asm/tlb_32.h @@ -2,24 +2,6 @@ #ifndef _SPARC_TLB_H #define _SPARC_TLB_H -#define tlb_start_vma(tlb, vma) \ -do { \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define tlb_end_vma(tlb, vma) \ -do { \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ -} while (0) - -#define __tlb_remove_tlb_entry(tlb, pte, address) \ - do { } while (0) - -#define tlb_flush(tlb) \ -do { \ - flush_tlb_mm((tlb)->mm); \ -} while (0) - #include <asm-generic/tlb.h> #endif /* _SPARC_TLB_H */ diff --git a/arch/sparc/kernel/ds.c b/arch/sparc/kernel/ds.c index f87265afb175..cad08ccce625 100644 --- a/arch/sparc/kernel/ds.c +++ b/arch/sparc/kernel/ds.c @@ -876,7 +876,7 @@ void ldom_power_off(void) static void ds_conn_reset(struct ds_info *dp) { - printk(KERN_ERR "ds-%llu: ds_conn_reset() from %pf\n", + printk(KERN_ERR "ds-%llu: ds_conn_reset() from %ps\n", dp->id, __builtin_return_address(0)); } diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index a8af6023c126..14b93c5564e3 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -73,6 +73,11 @@ static inline void iommu_batch_start(struct device *dev, unsigned long prot, uns p->npages = 0; } +static inline bool iommu_use_atu(struct iommu *iommu, u64 mask) +{ + return iommu->atu && mask > DMA_BIT_MASK(32); +} + /* Interrupts must be disabled. */ static long iommu_batch_flush(struct iommu_batch *p, u64 mask) { @@ -92,7 +97,7 @@ static long iommu_batch_flush(struct iommu_batch *p, u64 mask) prot &= (HV_PCI_MAP_ATTR_READ | HV_PCI_MAP_ATTR_WRITE); while (npages != 0) { - if (mask <= DMA_BIT_MASK(32) || !pbm->iommu->atu) { + if (!iommu_use_atu(pbm->iommu, mask)) { num = pci_sun4v_iommu_map(devhandle, HV_PCI_TSBID(0, entry), npages, @@ -179,7 +184,6 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, unsigned long flags, order, first_page, npages, n; unsigned long prot = 0; struct iommu *iommu; - struct atu *atu; struct iommu_map_table *tbl; struct page *page; void *ret; @@ -205,13 +209,11 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, memset((char *)first_page, 0, PAGE_SIZE << order); iommu = dev->archdata.iommu; - atu = iommu->atu; - mask = dev->coherent_dma_mask; - if (mask <= DMA_BIT_MASK(32) || !atu) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else - tbl = &atu->tbl; + tbl = &iommu->atu->tbl; entry = iommu_tbl_range_alloc(dev, tbl, npages, NULL, (unsigned long)(-1), 0); @@ -333,7 +335,7 @@ static void dma_4v_free_coherent(struct device *dev, size_t size, void *cpu, atu = iommu->atu; devhandle = pbm->devhandle; - if (dvma <= DMA_BIT_MASK(32)) { + if (!iommu_use_atu(iommu, dvma)) { tbl = &iommu->tbl; iotsb_num = 0; /* we don't care for legacy iommu */ } else { @@ -374,7 +376,7 @@ static dma_addr_t dma_4v_map_page(struct device *dev, struct page *page, npages >>= IO_PAGE_SHIFT; mask = *dev->dma_mask; - if (mask <= DMA_BIT_MASK(32)) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else tbl = &atu->tbl; @@ -510,7 +512,7 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, IO_PAGE_SIZE) >> IO_PAGE_SHIFT; mask = *dev->dma_mask; - if (mask <= 
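The new iommu_use_atu() helper collapses four slightly different open-coded tests into one; notably dma_4v_free_coherent() used to consult only the DMA address rather than whether an ATU exists at all. The predicate, distilled into a sketch with a minimal stand-in type:

#include <linux/dma-mapping.h>

struct atu;			/* opaque here */
struct sun4v_iommu {		/* illustrative stand-in for sparc's struct iommu */
	struct atu *atu;
};

static bool iommu_use_atu(struct sun4v_iommu *iommu, u64 mask)
{
	/* the ATU may be absent, and only helps masks wider than 32 bits */
	return iommu->atu && mask > DMA_BIT_MASK(32);
}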
DMA_BIT_MASK(32)) + if (!iommu_use_atu(iommu, mask)) tbl = &iommu->tbl; else tbl = &atu->tbl; diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index b9a5a04b2d2c..a1dd24307b00 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -469,3 +469,7 @@ 421 32 rt_sigtimedwait_time64 sys_rt_sigtimedwait compat_sys_rt_sigtimedwait_time64 422 32 futex_time64 sys_futex sys_futex 423 32 sched_rr_get_interval_time64 sys_sched_rr_get_interval sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/um/drivers/harddog_kern.c b/arch/um/drivers/harddog_kern.c index 6d381279b362..000cb69ba0bc 100644 --- a/arch/um/drivers/harddog_kern.c +++ b/arch/um/drivers/harddog_kern.c @@ -85,7 +85,7 @@ static int harddog_open(struct inode *inode, struct file *file) timer_alive = 1; spin_unlock(&lock); mutex_unlock(&harddog_mutex); - return nonseekable_open(inode, file); + return stream_open(inode, file); err: spin_unlock(&lock); mutex_unlock(&harddog_mutex); diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 00bcbe2326d9..b506ad06aefc 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += irq_work.h generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += param.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index dce6db147f24..70ee60383900 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -2,162 +2,8 @@ #ifndef __UM_TLB_H #define __UM_TLB_H -#include <linux/pagemap.h> -#include <linux/swap.h> -#include <asm/percpu.h> -#include <asm/pgalloc.h> #include <asm/tlbflush.h> - -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - -/* struct mmu_gather is an opaque type used by the mm code for passing around - * any data needed by arch specific code for tlb_remove_page. - */ -struct mmu_gather { - struct mm_struct *mm; - unsigned int need_flush; /* Really unmapped some ptes? 
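The harddog open switch from nonseekable_open() to stream_open() above keeps the "no llseek" behaviour but also marks the file FMODE_STREAM, so reads and writes no longer serialize on f_pos. The pattern for any pure stream character device, sketched with a hypothetical driver name:

#include <linux/fs.h>

static int mydev_open(struct inode *inode, struct file *file)
{
	/* stream-like fd: seeking fails, no f_pos locking on I/O */
	return stream_open(inode, file);
}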
*/ - unsigned long start; - unsigned long end; - unsigned int fullmm; /* non-zero means full mm flush */ -}; - -static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, - unsigned long address) -{ - if (tlb->start > address) - tlb->start = address; - if (tlb->end < address + PAGE_SIZE) - tlb->end = address + PAGE_SIZE; -} - -static inline void init_tlb_gather(struct mmu_gather *tlb) -{ - tlb->need_flush = 0; - - tlb->start = TASK_SIZE; - tlb->end = 0; - - if (tlb->fullmm) { - tlb->start = 0; - tlb->end = TASK_SIZE; - } -} - -static inline void -arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - tlb->mm = mm; - tlb->start = start; - tlb->end = end; - tlb->fullmm = !(start | (end+1)); - - init_tlb_gather(tlb); -} - -extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, - unsigned long end); - -static inline void -tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) -{ - flush_tlb_mm_range(tlb->mm, tlb->start, tlb->end); -} - -static inline void -tlb_flush_mmu_free(struct mmu_gather *tlb) -{ - init_tlb_gather(tlb); -} - -static inline void -tlb_flush_mmu(struct mmu_gather *tlb) -{ - if (!tlb->need_flush) - return; - - tlb_flush_mmu_tlbonly(tlb); - tlb_flush_mmu_free(tlb); -} - -/* arch_tlb_finish_mmu - * Called at the end of the shootdown operation to free up any resources - * that were required. - */ -static inline void -arch_tlb_finish_mmu(struct mmu_gather *tlb, - unsigned long start, unsigned long end, bool force) -{ - if (force) { - tlb->start = start; - tlb->end = end; - tlb->need_flush = 1; - } - tlb_flush_mmu(tlb); - - /* keep the page table cache within bounds */ - check_pgt_cache(); -} - -/* tlb_remove_page - * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), - * while handling the additional races in SMP caused by other CPUs - * caching valid mappings in their TLBs. - */ -static inline int __tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - tlb->need_flush = 1; - free_page_and_swap_cache(page); - return false; /* avoid calling tlb_flush_mmu */ -} - -static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) -{ - __tlb_remove_page(tlb, page); -} - -static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return __tlb_remove_page(tlb, page); -} - -static inline void tlb_remove_page_size(struct mmu_gather *tlb, - struct page *page, int page_size) -{ - return tlb_remove_page(tlb, page); -} - -/** - * tlb_remove_tlb_entry - remember a pte unmapping for later tlb invalidation. - * - * Record the fact that pte's were really umapped in ->need_flush, so we can - * later optimise away the tlb invalidate. This helps when userspace is - * unmapping already-unmapped pages, which happens quite a lot. 
- */ -#define tlb_remove_tlb_entry(tlb, ptep, address) \ - do { \ - tlb->need_flush = 1; \ - __tlb_remove_tlb_entry(tlb, ptep, address); \ - } while (0) - -#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ - tlb_remove_tlb_entry(tlb, ptep, address) - -#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change -static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, - unsigned int page_size) -{ -} - -#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) - -#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) - -#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) - -#define tlb_migrate_finish(mm) do {} while (0) +#include <asm-generic/cacheflush.h> +#include <asm-generic/tlb.h> #endif diff --git a/arch/um/kernel/stacktrace.c b/arch/um/kernel/stacktrace.c index ebe7bcf62684..bd95e020d509 100644 --- a/arch/um/kernel/stacktrace.c +++ b/arch/um/kernel/stacktrace.c @@ -63,8 +63,6 @@ static const struct stacktrace_ops dump_ops = { static void __save_stack_trace(struct task_struct *tsk, struct stack_trace *trace) { dump_trace(tsk, &dump_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/um/kernel/sysrq.c b/arch/um/kernel/sysrq.c index 6b995e870d55..05585eef11d9 100644 --- a/arch/um/kernel/sysrq.c +++ b/arch/um/kernel/sysrq.c @@ -20,7 +20,7 @@ static void _print_addr(void *data, unsigned long address, int reliable) { - pr_info(" [<%08lx>] %s%pF\n", address, reliable ? "" : "? ", + pr_info(" [<%08lx>] %s%pS\n", address, reliable ? "" : "? ", (void *)address); } diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 817d82608712..2445dfcf6444 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -20,6 +20,7 @@ config UNICORE32 select GENERIC_IOMAP select MODULES_USE_ELF_REL select NEED_DMA_MAP_STATE + select MMU_GATHER_NO_RANGE if MMU help UniCore-32 is 32-bit Instruction Set Architecture, including a series of low-power-consumption RISC chip @@ -38,12 +39,6 @@ config STACKTRACE_SUPPORT config LOCKDEP_SUPPORT def_bool y -config RWSEM_GENERIC_SPINLOCK - def_bool y - -config RWSEM_XCHGADD_ALGORITHM - bool - config ARCH_HAS_ILOG2_U32 bool diff --git a/arch/unicore32/include/asm/Kbuild b/arch/unicore32/include/asm/Kbuild index d77d953c04c1..b301a0b3c0b2 100644 --- a/arch/unicore32/include/asm/Kbuild +++ b/arch/unicore32/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += kvm_para.h generic-y += local.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += module.h generic-y += parport.h generic-y += percpu.h diff --git a/arch/unicore32/include/asm/tlb.h b/arch/unicore32/include/asm/tlb.h index 9cca15cdae94..00a8477333f6 100644 --- a/arch/unicore32/include/asm/tlb.h +++ b/arch/unicore32/include/asm/tlb.h @@ -12,10 +12,9 @@ #ifndef __UNICORE_TLB_H__ #define __UNICORE_TLB_H__ -#define tlb_start_vma(tlb, vma) do { } while (0) -#define tlb_end_vma(tlb, vma) do { } while (0) -#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) +/* + * unicore32 lacks an efficient flush_tlb_range(), use flush_tlb_mm(). 
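sh, um and unicore32 all shed their private struct mmu_gather in this series in favour of the common one in asm-generic/tlb.h. An architecture without a usable flush_tlb_range(), as the unicore32 comment here says, selects MMU_GATHER_NO_RANGE and gets a whole-mm flush instead; the generic fallback amounts to roughly the following (paraphrased from asm-generic/tlb.h, matching what the removed per-arch code did by hand):

/* with CONFIG_MMU_GATHER_NO_RANGE, range tracking is compiled away */
static inline void tlb_flush(struct mmu_gather *tlb)
{
	if (tlb->end)			/* anything unmapped at all? */
		flush_tlb_mm(tlb->mm);
}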
+ */ #define __pte_free_tlb(tlb, pte, addr) \ do { \ diff --git a/arch/unicore32/kernel/stacktrace.c b/arch/unicore32/kernel/stacktrace.c index 9976e767d51c..e37da8c6837b 100644 --- a/arch/unicore32/kernel/stacktrace.c +++ b/arch/unicore32/kernel/stacktrace.c @@ -120,8 +120,6 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) } walk_stackframe(&frame, save_trace, &data); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7a70fb58b2d0..e7212731cffb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -14,6 +14,7 @@ config X86_32 select ARCH_WANT_IPC_PARSE_VERSION select CLKSRC_I8253 select CLONE_BACKWARDS + select HAVE_DEBUG_STACKOVERFLOW select MODULES_USE_ELF_REL select OLD_SIGACTION @@ -28,7 +29,6 @@ config X86_64 select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE select SWIOTLB - select X86_DEV_DMA_OPS select ARCH_HAS_SYSCALL_WRAPPER # @@ -64,6 +64,7 @@ config X86 select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 && X86_MCE select ARCH_HAS_SET_MEMORY + select ARCH_HAS_SET_DIRECT_MAP select ARCH_HAS_STRICT_KERNEL_RWX select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE @@ -73,6 +74,7 @@ config X86 select ARCH_MIGHT_HAVE_ACPI_PDC if ACPI select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO + select ARCH_STACKWALK select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 @@ -137,7 +139,6 @@ config X86 select HAVE_COPY_THREAD_TLS select HAVE_C_RECORDMCOUNT select HAVE_DEBUG_KMEMLEAK - select HAVE_DEBUG_STACKOVERFLOW select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS @@ -182,7 +183,6 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_RCU_TABLE_FREE if PARAVIRT - select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API @@ -267,9 +267,6 @@ config ARCH_MAY_HAVE_PC_FDC def_bool y depends on ISA_DMA_API -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_CALIBRATE_DELAY def_bool y @@ -702,8 +699,6 @@ config STA2X11 bool "STA2X11 Companion Chip Support" depends on X86_32_NON_STANDARD && PCI select ARCH_HAS_PHYS_TO_DMA - select X86_DEV_DMA_OPS - select X86_DMA_REMAP select SWIOTLB select MFD_STA2X11 select GPIOLIB @@ -782,14 +777,6 @@ config PARAVIRT_SPINLOCKS If you are unsure how to answer this question, answer Y. -config QUEUED_LOCK_STAT - bool "Paravirt queued spinlock statistics" - depends on PARAVIRT_SPINLOCKS && DEBUG_FS - ---help--- - Enable the collection of statistical data on the slowpath - behavior of paravirtualized queued spinlocks and report - them on debugfs. - source "arch/x86/xen/Kconfig" config KVM_GUEST @@ -1329,8 +1316,16 @@ config MICROCODE_AMD processors will be enabled. config MICROCODE_OLD_INTERFACE - def_bool y + bool "Ancient loading interface (DEPRECATED)" + default n depends on MICROCODE + ---help--- + DO NOT USE THIS! This is the ancient /dev/cpu/microcode interface + which was used by userspace tools like iucode_tool and microcode.ctl. + It is inadequate because it runs too late to be able to properly + load microcode on a machine and it needs special tools. 
Instead, you + should've switched to the early loading method with the initrd or + builtin microcode by now: Documentation/x86/microcode.txt config X86_MSR tristate "/dev/cpu/*/msr - Model-specific register support" @@ -1498,7 +1493,7 @@ config X86_CPA_STATISTICS depends on DEBUG_FS ---help--- Expose statistics about the Change Page Attribute mechanims, which - helps to determine the effectivness of preserving large and huge + helps to determine the effectiveness of preserving large and huge page mappings when mapping protections are changed. config ARCH_HAS_MEM_ENCRYPT @@ -1605,12 +1600,9 @@ config ARCH_FLATMEM_ENABLE depends on X86_32 && !NUMA config ARCH_DISCONTIGMEM_ENABLE - def_bool y - depends on NUMA && X86_32 - -config ARCH_DISCONTIGMEM_DEFAULT - def_bool y + def_bool n depends on NUMA && X86_32 + depends on BROKEN config ARCH_SPARSEMEM_ENABLE def_bool y @@ -1619,8 +1611,7 @@ config ARCH_SPARSEMEM_ENABLE select SPARSEMEM_VMEMMAP_ENABLE if X86_64 config ARCH_SPARSEMEM_DEFAULT - def_bool y - depends on X86_64 + def_bool X86_64 || (NUMA && X86_32) config ARCH_SELECT_MEMORY_MODEL def_bool y @@ -2877,11 +2868,6 @@ config HAVE_ATOMIC_IOMAP config X86_DEV_DMA_OPS bool - depends on X86_64 || STA2X11 - -config X86_DMA_REMAP - bool - depends on STA2X11 config HAVE_GENERIC_GUP def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a587805c6687..56e748a7679f 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -47,7 +47,7 @@ export REALMODE_CFLAGS export BITS ifdef CONFIG_X86_NEED_RELOCS - LDFLAGS_vmlinux := --emit-relocs + LDFLAGS_vmlinux := --emit-relocs --discard-none endif # diff --git a/arch/x86/boot/compressed/acpi.c b/arch/x86/boot/compressed/acpi.c index 0ef4ad55b29b..ad84239e595e 100644 --- a/arch/x86/boot/compressed/acpi.c +++ b/arch/x86/boot/compressed/acpi.c @@ -276,7 +276,7 @@ static unsigned long get_acpi_srat_table(void) if (acpi_table) { header = (struct acpi_table_header *)acpi_table; - if (ACPI_COMPARE_NAME(header->signature, ACPI_SIG_SRAT)) + if (ACPI_COMPARE_NAMESEG(header->signature, ACPI_SIG_SRAT)) return acpi_table; } entry += size; diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index c0d6c560df69..5a237e8dbf8d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -352,7 +352,7 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap, boot_params->hdr.loadflags &= ~KASLR_FLAG; /* Save RSDP address for later use. 
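
The ACPI_COMPARE_NAME -> ACPI_COMPARE_NAMESEG change above follows an
ACPICA rename; both spellings compare a fixed 4-byte ACPI name segment
such as "SRAT". The semantics are approximately (a simplified sketch,
not the exact ACPICA macro):

	#define ACPI_NAMESEG_SIZE	4
	#define ACPI_COMPARE_NAMESEG(a, b) \
		(!strncmp((const char *)(a), (const char *)(b), \
			  ACPI_NAMESEG_SIZE))
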
*/ - boot_params->acpi_rsdp_addr = get_rsdp_addr(); + /* boot_params->acpi_rsdp_addr = get_rsdp_addr(); */ sanitize_boot_params(boot_params); diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9f908112bbb9..2b2481acc661 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -25,18 +25,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_SMP=y CONFIG_X86_GENERIC=y CONFIG_HPET_TIMER=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 1d3badfda09e..e8829abf063a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -24,18 +24,6 @@ CONFIG_JUMP_LABEL=y CONFIG_MODULES=y CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y -CONFIG_PARTITION_ADVANCED=y -CONFIG_OSF_PARTITION=y -CONFIG_AMIGA_PARTITION=y -CONFIG_MAC_PARTITION=y -CONFIG_BSD_DISKLABEL=y -CONFIG_MINIX_SUBPARTITION=y -CONFIG_SOLARIS_X86_PARTITION=y -CONFIG_UNIXWARE_DISKLABEL=y -CONFIG_SGI_PARTITION=y -CONFIG_SUN_PARTITION=y -CONFIG_KARMA_PARTITION=y -CONFIG_EFI_PARTITION=y CONFIG_SMP=y CONFIG_CALGARY_IOMMU=y CONFIG_NR_CPUS=64 diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c index 3ea71b871813..bdeee1b830be 100644 --- a/arch/x86/crypto/aegis128-aesni-glue.c +++ b/arch/x86/crypto/aegis128-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. */ -#include <crypto/cryptd.h> #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> @@ -242,131 +242,35 @@ static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis128_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis128_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis128_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis128_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis128_aesni_init_tfm(struct crypto_aead *aead) -{ - struct 
cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis128-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis128_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis128_aesni_alg[] = { - { - .setkey = crypto_aegis128_aesni_setkey, - .setauthsize = crypto_aegis128_aesni_setauthsize, - .encrypt = crypto_aegis128_aesni_encrypt, - .decrypt = crypto_aegis128_aesni_decrypt, - .init = crypto_aegis128_aesni_init_tfm, - .exit = crypto_aegis128_aesni_exit_tfm, - - .ivsize = AEGIS128_NONCE_SIZE, - .maxauthsize = AEGIS128_MAX_AUTH_SIZE, - .chunksize = AEGIS128_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis128", - .cra_driver_name = "__aegis128-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis128_aesni_setkey, - .setauthsize = cryptd_aegis128_aesni_setauthsize, - .encrypt = cryptd_aegis128_aesni_encrypt, - .decrypt = cryptd_aegis128_aesni_decrypt, - .init = cryptd_aegis128_aesni_init_tfm, - .exit = cryptd_aegis128_aesni_exit_tfm, - - .ivsize = AEGIS128_NONCE_SIZE, - .maxauthsize = AEGIS128_MAX_AUTH_SIZE, - .chunksize = AEGIS128_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis128", - .cra_driver_name = "aegis128-aesni", - - .cra_module = THIS_MODULE, - } +static struct aead_alg crypto_aegis128_aesni_alg = { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, + .init = crypto_aegis128_aesni_init_tfm, + .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, + .chunksize = AEGIS128_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis128", + .cra_driver_name = "__aegis128-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis128_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis128_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis128_aesni_alg, - ARRAY_SIZE(crypto_aegis128_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis128_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis128_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis128_aesni_alg, - ARRAY_SIZE(crypto_aegis128_aesni_alg)); + simd_unregister_aeads(&crypto_aegis128_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis128_aesni_module_init); diff --git a/arch/x86/crypto/aegis128l-aesni-glue.c b/arch/x86/crypto/aegis128l-aesni-glue.c index 1b1b39c66c5e..80d917f7e467 100644 --- a/arch/x86/crypto/aegis128l-aesni-glue.c +++ 
b/arch/x86/crypto/aegis128l-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. */ -#include <crypto/cryptd.h> #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> @@ -242,131 +242,35 @@ static void crypto_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis128l_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis128l_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis128l_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis128l_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis128l_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis128l-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis128l_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis128l_aesni_alg[] = { - { - .setkey = crypto_aegis128l_aesni_setkey, - .setauthsize = crypto_aegis128l_aesni_setauthsize, - .encrypt = crypto_aegis128l_aesni_encrypt, - .decrypt = crypto_aegis128l_aesni_decrypt, - .init = crypto_aegis128l_aesni_init_tfm, - .exit = crypto_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis128l", - .cra_driver_name = "__aegis128l-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis128l_aesni_setkey, - .setauthsize = cryptd_aegis128l_aesni_setauthsize, - .encrypt = cryptd_aegis128l_aesni_encrypt, - .decrypt = cryptd_aegis128l_aesni_decrypt, - .init = cryptd_aegis128l_aesni_init_tfm, - .exit = cryptd_aegis128l_aesni_exit_tfm, - - .ivsize = AEGIS128L_NONCE_SIZE, - .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, - .chunksize = AEGIS128L_BLOCK_SIZE, - - .base = { - .cra_flags = 
CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - .cra_priority = 400, - - .cra_name = "aegis128l", - .cra_driver_name = "aegis128l-aesni", - - .cra_module = THIS_MODULE, - } +static struct aead_alg crypto_aegis128l_aesni_alg = { + .setkey = crypto_aegis128l_aesni_setkey, + .setauthsize = crypto_aegis128l_aesni_setauthsize, + .encrypt = crypto_aegis128l_aesni_encrypt, + .decrypt = crypto_aegis128l_aesni_decrypt, + .init = crypto_aegis128l_aesni_init_tfm, + .exit = crypto_aegis128l_aesni_exit_tfm, + + .ivsize = AEGIS128L_NONCE_SIZE, + .maxauthsize = AEGIS128L_MAX_AUTH_SIZE, + .chunksize = AEGIS128L_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis128l", + .cra_driver_name = "__aegis128l-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis128l_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis128l_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis128l_aesni_alg, - ARRAY_SIZE(crypto_aegis128l_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis128l_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis128l_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis128l_aesni_alg, - ARRAY_SIZE(crypto_aegis128l_aesni_alg)); + simd_unregister_aeads(&crypto_aegis128l_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis128l_aesni_module_init); diff --git a/arch/x86/crypto/aegis256-aesni-glue.c b/arch/x86/crypto/aegis256-aesni-glue.c index 6227ca3220a0..716eecb66bd5 100644 --- a/arch/x86/crypto/aegis256-aesni-glue.c +++ b/arch/x86/crypto/aegis256-aesni-glue.c @@ -11,8 +11,8 @@ * any later version. 
*/ -#include <crypto/cryptd.h> #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <crypto/scatterwalk.h> #include <linux/module.h> @@ -242,131 +242,35 @@ static void crypto_aegis256_aesni_exit_tfm(struct crypto_aead *aead) { } -static int cryptd_aegis256_aesni_setkey(struct crypto_aead *aead, - const u8 *key, unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} - -static int cryptd_aegis256_aesni_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - -static int cryptd_aegis256_aesni_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} - -static int cryptd_aegis256_aesni_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} - -static int cryptd_aegis256_aesni_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__aegis256-aesni", CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void cryptd_aegis256_aesni_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg crypto_aegis256_aesni_alg[] = { - { - .setkey = crypto_aegis256_aesni_setkey, - .setauthsize = crypto_aegis256_aesni_setauthsize, - .encrypt = crypto_aegis256_aesni_encrypt, - .decrypt = crypto_aegis256_aesni_decrypt, - .init = crypto_aegis256_aesni_init_tfm, - .exit = crypto_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_INTERNAL, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct aegis_ctx) + - __alignof__(struct aegis_ctx), - .cra_alignmask = 0, - - .cra_name = "__aegis256", - .cra_driver_name = "__aegis256-aesni", - - .cra_module = THIS_MODULE, - } - }, { - .setkey = cryptd_aegis256_aesni_setkey, - .setauthsize = cryptd_aegis256_aesni_setauthsize, - .encrypt = cryptd_aegis256_aesni_encrypt, - .decrypt = cryptd_aegis256_aesni_decrypt, - .init = cryptd_aegis256_aesni_init_tfm, - .exit = cryptd_aegis256_aesni_exit_tfm, - - .ivsize = AEGIS256_NONCE_SIZE, - .maxauthsize = AEGIS256_MAX_AUTH_SIZE, - .chunksize = AEGIS256_BLOCK_SIZE, - - .base = { - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_alignmask = 0, - - 
.cra_priority = 400, - - .cra_name = "aegis256", - .cra_driver_name = "aegis256-aesni", - - .cra_module = THIS_MODULE, - } +static struct aead_alg crypto_aegis256_aesni_alg = { + .setkey = crypto_aegis256_aesni_setkey, + .setauthsize = crypto_aegis256_aesni_setauthsize, + .encrypt = crypto_aegis256_aesni_encrypt, + .decrypt = crypto_aegis256_aesni_decrypt, + .init = crypto_aegis256_aesni_init_tfm, + .exit = crypto_aegis256_aesni_exit_tfm, + + .ivsize = AEGIS256_NONCE_SIZE, + .maxauthsize = AEGIS256_MAX_AUTH_SIZE, + .chunksize = AEGIS256_BLOCK_SIZE, + + .base = { + .cra_flags = CRYPTO_ALG_INTERNAL, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct aegis_ctx) + + __alignof__(struct aegis_ctx), + .cra_alignmask = 0, + .cra_priority = 400, + + .cra_name = "__aegis256", + .cra_driver_name = "__aegis256-aesni", + + .cra_module = THIS_MODULE, } }; +static struct simd_aead_alg *simd_alg; + static int __init crypto_aegis256_aesni_module_init(void) { if (!boot_cpu_has(X86_FEATURE_XMM2) || @@ -374,14 +278,13 @@ static int __init crypto_aegis256_aesni_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_aegis256_aesni_alg, - ARRAY_SIZE(crypto_aegis256_aesni_alg)); + return simd_register_aeads_compat(&crypto_aegis256_aesni_alg, 1, + &simd_alg); } static void __exit crypto_aegis256_aesni_module_exit(void) { - crypto_unregister_aeads(crypto_aegis256_aesni_alg, - ARRAY_SIZE(crypto_aegis256_aesni_alg)); + simd_unregister_aeads(&crypto_aegis256_aesni_alg, 1, &simd_alg); } module_init(crypto_aegis256_aesni_module_init); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 1e3d2102033a..21c246799aa5 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -25,14 +25,13 @@ #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/aes.h> -#include <crypto/cryptd.h> #include <crypto/ctr.h> #include <crypto/b128ops.h> #include <crypto/gcm.h> #include <crypto/xts.h> #include <asm/cpu_device_id.h> -#include <asm/fpu/api.h> #include <asm/crypto/aes.h> +#include <asm/simd.h> #include <crypto/scatterwalk.h> #include <crypto/internal/aead.h> #include <crypto/internal/simd.h> @@ -333,7 +332,7 @@ static int aes_set_key_common(struct crypto_tfm *tfm, void *raw_ctx, return -EINVAL; } - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) err = crypto_aes_expand_key(ctx, in_key, key_len); else { kernel_fpu_begin(); @@ -354,7 +353,7 @@ static void aes_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) crypto_aes_encrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -367,7 +366,7 @@ static void aes_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { struct crypto_aes_ctx *ctx = aes_ctx(crypto_tfm_ctx(tfm)); - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) crypto_aes_decrypt_x86(ctx, dst, src); else { kernel_fpu_begin(); @@ -643,29 +642,6 @@ static int xts_decrypt(struct skcipher_request *req) aes_ctx(ctx->raw_crypt_ctx)); } -static int rfc4106_init(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} - -static void 
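
The aegis128, aegis128l and aegis256 glue conversions above (and the
aesni and morus conversions that follow) all apply the same recipe:
delete the hand-written cryptd wrappers, register only the internal
"__"-prefixed CRYPTO_ALG_INTERNAL algorithm, and let the generic
crypto/simd.c helper synthesize the user-visible async wrapper. In
outline (the my_* names are illustrative; the simd_* calls are the API
used in the hunks above):

	static struct aead_alg my_aead_alg = {
		/* .base.cra_name = "__my-alg",
		 * .base.cra_flags = CRYPTO_ALG_INTERNAL, ...
		 */
	};
	static struct simd_aead_alg *simd_alg;

	static int __init my_module_init(void)
	{
		return simd_register_aeads_compat(&my_aead_alg, 1, &simd_alg);
	}

	static void __exit my_module_exit(void)
	{
		simd_unregister_aeads(&my_aead_alg, 1, &simd_alg);
	}
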
rfc4106_exit(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - static int rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len) { @@ -710,15 +686,8 @@ static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len); } -static int gcmaes_wrapper_set_key(struct crypto_aead *parent, const u8 *key, - unsigned int key_len) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(parent); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, key_len); -} - +/* This is the Integrity Check Value (aka the authentication tag) length and can + * be 8, 12 or 16 bytes long. */ static int common_rfc4106_set_authsize(struct crypto_aead *aead, unsigned int authsize) { @@ -734,17 +703,6 @@ static int common_rfc4106_set_authsize(struct crypto_aead *aead, return 0; } -/* This is the Integrity Check Value (aka the authentication tag length and can - * be 8, 12 or 16 bytes long. */ -static int gcmaes_wrapper_set_authsize(struct crypto_aead *parent, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(parent); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} - static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, unsigned int authsize) { @@ -964,38 +922,6 @@ static int helper_rfc4106_decrypt(struct aead_request *req) return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, aes_ctx); } - -static int gcmaes_wrapper_encrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(tfm); - struct cryptd_aead *cryptd_tfm = *ctx; - - tfm = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - tfm = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, tfm); - - return crypto_aead_encrypt(req); -} - -static int gcmaes_wrapper_decrypt(struct aead_request *req) -{ - struct crypto_aead *tfm = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(tfm); - struct cryptd_aead *cryptd_tfm = *ctx; - - tfm = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - tfm = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, tfm); - - return crypto_aead_decrypt(req); -} #endif static struct crypto_alg aesni_algs[] = { { @@ -1148,31 +1074,7 @@ static int generic_gcmaes_decrypt(struct aead_request *req) aes_ctx); } -static int generic_gcmaes_init(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_tfm = cryptd_alloc_aead("__driver-generic-gcm-aes-aesni", - CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - - return 0; -} - -static void generic_gcmaes_exit(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} - -static struct aead_alg aesni_aead_algs[] = { { +static struct aead_alg aesni_aeads[] = { { .setkey = common_rfc4106_set_key, .setauthsize = common_rfc4106_set_authsize, .encrypt = helper_rfc4106_encrypt, @@ -1180,8 +1082,9 @@ static struct aead_alg aesni_aead_algs[] = { { .ivsize = GCM_RFC4106_IV_SIZE, .maxauthsize = 16, .base = { - .cra_name = "__gcm-aes-aesni", - .cra_driver_name = 
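
The ICV comment being moved above refers to the RFC 4106 rule that the
GCM authentication tag may only be 8, 12 or 16 bytes long; the retained
common_rfc4106_set_authsize() enforces essentially this (the helper
name below is illustrative):

	static int rfc4106_icv_len_ok(unsigned int authsize)
	{
		switch (authsize) {
		case 8:
		case 12:
		case 16:
			return 0;	/* valid RFC 4106 ICV length */
		default:
			return -EINVAL;
		}
	}
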
"__driver-gcm-aes-aesni", + .cra_name = "__rfc4106(gcm(aes))", + .cra_driver_name = "__rfc4106-gcm-aesni", + .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), @@ -1189,24 +1092,6 @@ static struct aead_alg aesni_aead_algs[] = { { .cra_module = THIS_MODULE, }, }, { - .init = rfc4106_init, - .exit = rfc4106_exit, - .setkey = gcmaes_wrapper_set_key, - .setauthsize = gcmaes_wrapper_set_authsize, - .encrypt = gcmaes_wrapper_encrypt, - .decrypt = gcmaes_wrapper_decrypt, - .ivsize = GCM_RFC4106_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "rfc4106(gcm(aes))", - .cra_driver_name = "rfc4106-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_module = THIS_MODULE, - }, -}, { .setkey = generic_gcmaes_set_key, .setauthsize = generic_gcmaes_set_authsize, .encrypt = generic_gcmaes_encrypt, @@ -1214,38 +1099,21 @@ static struct aead_alg aesni_aead_algs[] = { { .ivsize = GCM_AES_IV_SIZE, .maxauthsize = 16, .base = { - .cra_name = "__generic-gcm-aes-aesni", - .cra_driver_name = "__driver-generic-gcm-aes-aesni", - .cra_priority = 0, + .cra_name = "__gcm(aes)", + .cra_driver_name = "__generic-gcm-aesni", + .cra_priority = 400, .cra_flags = CRYPTO_ALG_INTERNAL, .cra_blocksize = 1, .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), .cra_alignmask = AESNI_ALIGN - 1, .cra_module = THIS_MODULE, }, -}, { - .init = generic_gcmaes_init, - .exit = generic_gcmaes_exit, - .setkey = gcmaes_wrapper_set_key, - .setauthsize = gcmaes_wrapper_set_authsize, - .encrypt = gcmaes_wrapper_encrypt, - .decrypt = gcmaes_wrapper_decrypt, - .ivsize = GCM_AES_IV_SIZE, - .maxauthsize = 16, - .base = { - .cra_name = "gcm(aes)", - .cra_driver_name = "generic-gcm-aesni", - .cra_priority = 400, - .cra_flags = CRYPTO_ALG_ASYNC, - .cra_blocksize = 1, - .cra_ctxsize = sizeof(struct cryptd_aead *), - .cra_module = THIS_MODULE, - }, } }; #else -static struct aead_alg aesni_aead_algs[0]; +static struct aead_alg aesni_aeads[0]; #endif +static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; static const struct x86_cpu_id aesni_cpu_id[] = { X86_FEATURE_MATCH(X86_FEATURE_AES), @@ -1253,23 +1121,9 @@ static const struct x86_cpu_id aesni_cpu_id[] = { }; MODULE_DEVICE_TABLE(x86cpu, aesni_cpu_id); -static void aesni_free_simds(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(aesni_simd_skciphers) && - aesni_simd_skciphers[i]; i++) - simd_skcipher_free(aesni_simd_skciphers[i]); -} - static int __init aesni_init(void) { - struct simd_skcipher_alg *simd; - const char *basename; - const char *algname; - const char *drvname; int err; - int i; if (!x86_match_cpu(aesni_cpu_id)) return -ENODEV; @@ -1304,36 +1158,22 @@ static int __init aesni_init(void) if (err) return err; - err = crypto_register_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + err = simd_register_skciphers_compat(aesni_skciphers, + ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); if (err) goto unregister_algs; - err = crypto_register_aeads(aesni_aead_algs, - ARRAY_SIZE(aesni_aead_algs)); + err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); if (err) goto unregister_skciphers; - for (i = 0; i < ARRAY_SIZE(aesni_skciphers); i++) { - algname = aesni_skciphers[i].base.cra_name + 2; - drvname = aesni_skciphers[i].base.cra_driver_name + 2; - basename = aesni_skciphers[i].base.cra_driver_name; - simd = simd_skcipher_create_compat(algname, drvname, 
basename); - err = PTR_ERR(simd); - if (IS_ERR(simd)) - goto unregister_simds; - - aesni_simd_skciphers[i] = simd; - } - return 0; -unregister_simds: - aesni_free_simds(); - crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); unregister_skciphers: - crypto_unregister_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); unregister_algs: crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); return err; @@ -1341,10 +1181,10 @@ unregister_algs: static void __exit aesni_exit(void) { - aesni_free_simds(); - crypto_unregister_aeads(aesni_aead_algs, ARRAY_SIZE(aesni_aead_algs)); - crypto_unregister_skciphers(aesni_skciphers, - ARRAY_SIZE(aesni_skciphers)); + simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), + aesni_simd_aeads); + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); crypto_unregister_algs(aesni_algs, ARRAY_SIZE(aesni_algs)); } diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c index 45c1c4143176..4967ad620775 100644 --- a/arch/x86/crypto/chacha_glue.c +++ b/arch/x86/crypto/chacha_glue.c @@ -12,10 +12,10 @@ #include <crypto/algapi.h> #include <crypto/chacha.h> +#include <crypto/internal/simd.h> #include <crypto/internal/skcipher.h> #include <linux/kernel.h> #include <linux/module.h> -#include <asm/fpu/api.h> #include <asm/simd.h> #define CHACHA_STATE_ALIGN 16 @@ -170,7 +170,7 @@ static int chacha_simd(struct skcipher_request *req) struct skcipher_walk walk; int err; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_chacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); @@ -193,7 +193,7 @@ static int xchacha_simd(struct skcipher_request *req) u8 real_iv[16]; int err; - if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable()) + if (req->cryptlen <= CHACHA_BLOCK_SIZE || !crypto_simd_usable()) return crypto_xchacha_crypt(req); err = skcipher_walk_virt(&walk, req, true); diff --git a/arch/x86/crypto/crc32-pclmul_glue.c b/arch/x86/crypto/crc32-pclmul_glue.c index c8d9cdacbf10..cb4ab6645106 100644 --- a/arch/x86/crypto/crc32-pclmul_glue.c +++ b/arch/x86/crypto/crc32-pclmul_glue.c @@ -32,10 +32,11 @@ #include <linux/kernel.h> #include <linux/crc32.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <asm/cpufeatures.h> #include <asm/cpu_device_id.h> -#include <asm/fpu/api.h> +#include <asm/simd.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -54,7 +55,7 @@ static u32 __attribute__((pure)) unsigned int iremainder; unsigned int prealign; - if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !irq_fpu_usable()) + if (len < PCLMUL_MIN_LEN + SCALE_F_MASK || !crypto_simd_usable()) return crc32_le(crc, p, len); if ((long)p & SCALE_F_MASK) { diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 5773e1161072..a58fe217c856 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -29,10 +29,11 @@ #include <linux/string.h> #include <linux/kernel.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <asm/cpufeatures.h> #include <asm/cpu_device_id.h> -#include <asm/fpu/internal.h> +#include <asm/simd.h> #define CHKSUM_BLOCK_SIZE 1 #define CHKSUM_DIGEST_SIZE 4 @@ -177,7 +178,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use 
faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -189,7 +190,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && crypto_simd_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); diff --git a/arch/x86/crypto/crct10dif-pclmul_glue.c b/arch/x86/crypto/crct10dif-pclmul_glue.c index 0e785c0b2354..3c81e15b0873 100644 --- a/arch/x86/crypto/crct10dif-pclmul_glue.c +++ b/arch/x86/crypto/crct10dif-pclmul_glue.c @@ -26,12 +26,13 @@ #include <linux/module.h> #include <linux/crc-t10dif.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/string.h> #include <linux/kernel.h> -#include <asm/fpu/api.h> #include <asm/cpufeatures.h> #include <asm/cpu_device_id.h> +#include <asm/simd.h> asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); @@ -53,7 +54,7 @@ static int chksum_update(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - if (length >= 16 && irq_fpu_usable()) { + if (length >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); ctx->crc = crc_t10dif_pcl(ctx->crc, data, length); kernel_fpu_end(); @@ -70,15 +71,14 @@ static int chksum_final(struct shash_desc *desc, u8 *out) return 0; } -static int __chksum_finup(__u16 *crcp, const u8 *data, unsigned int len, - u8 *out) +static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out) { - if (len >= 16 && irq_fpu_usable()) { + if (len >= 16 && crypto_simd_usable()) { kernel_fpu_begin(); - *(__u16 *)out = crc_t10dif_pcl(*crcp, data, len); + *(__u16 *)out = crc_t10dif_pcl(crc, data, len); kernel_fpu_end(); } else - *(__u16 *)out = crc_t10dif_generic(*crcp, data, len); + *(__u16 *)out = crc_t10dif_generic(crc, data, len); return 0; } @@ -87,15 +87,13 @@ static int chksum_finup(struct shash_desc *desc, const u8 *data, { struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - return __chksum_finup(&ctx->crc, data, len, out); + return __chksum_finup(ctx->crc, data, len, out); } static int chksum_digest(struct shash_desc *desc, const u8 *data, unsigned int length, u8 *out) { - struct chksum_desc_ctx *ctx = shash_desc_ctx(desc); - - return __chksum_finup(&ctx->crc, data, length, out); + return __chksum_finup(0, data, length, out); } static struct shash_alg alg = { diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c index 3582ae885ee1..e3f3e6fd9d65 100644 --- a/arch/x86/crypto/ghash-clmulni-intel_glue.c +++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c @@ -19,8 +19,9 @@ #include <crypto/cryptd.h> #include <crypto/gf128mul.h> #include <crypto/internal/hash.h> -#include <asm/fpu/api.h> +#include <crypto/internal/simd.h> #include <asm/cpu_device_id.h> +#include <asm/simd.h> #define GHASH_BLOCK_SIZE 16 #define GHASH_DIGEST_SIZE 16 @@ -171,7 +172,6 @@ static int ghash_async_init(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return crypto_shash_init(desc); } @@ -182,7 +182,7 @@ static int 
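
The crc32, crc32c and crct10dif conversions above show the common shape
of this series: gate the SIMD fast path on crypto_simd_usable(), which
is essentially may_use_simd() plus a fuzz-testing override, instead of
the x86-only irq_fpu_usable(), and keep a scalar fallback with a
breakeven threshold so small inputs skip the kernel_fpu_begin()/end()
cost. A generic sketch (MY_BREAKEVEN and the my_* helpers are
illustrative):

	static int my_update(struct shash_desc *desc, const u8 *data,
			     unsigned int len)
	{
		struct my_ctx *ctx = shash_desc_ctx(desc);

		if (len < MY_BREAKEVEN || !crypto_simd_usable())
			return my_generic_update(ctx, data, len);

		kernel_fpu_begin();
		my_simd_update(ctx, data, len);
		kernel_fpu_end();
		return 0;
	}
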
ghash_async_update(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -200,7 +200,7 @@ static int ghash_async_final(struct ahash_request *req) struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -241,7 +241,7 @@ static int ghash_async_digest(struct ahash_request *req) struct ahash_request *cryptd_req = ahash_request_ctx(req); struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm; - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (in_atomic() && cryptd_ahash_queued(cryptd_tfm))) { memcpy(cryptd_req, req, sizeof(*req)); ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base); @@ -251,7 +251,6 @@ static int ghash_async_digest(struct ahash_request *req) struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm); desc->tfm = child; - desc->flags = req->base.flags; return shash_ahash_digest(req, desc); } } diff --git a/arch/x86/crypto/morus1280-avx2-glue.c b/arch/x86/crypto/morus1280-avx2-glue.c index 6634907d6ccd..679627a2a824 100644 --- a/arch/x86/crypto/morus1280-avx2-glue.c +++ b/arch/x86/crypto/morus1280-avx2-glue.c @@ -12,6 +12,7 @@ */ #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> #include <crypto/morus1280_glue.h> #include <linux/module.h> #include <asm/fpu/api.h> @@ -35,7 +36,9 @@ asmlinkage void crypto_morus1280_avx2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus1280_avx2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS1280_DECLARE_ALGS(avx2, "morus1280-avx2", 400); +MORUS1280_DECLARE_ALG(avx2, "morus1280-avx2", 400); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus1280_avx2_module_init(void) { @@ -44,14 +47,13 @@ static int __init crypto_morus1280_avx2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus1280_avx2_algs, - ARRAY_SIZE(crypto_morus1280_avx2_algs)); + return simd_register_aeads_compat(&crypto_morus1280_avx2_alg, 1, + &simd_alg); } static void __exit crypto_morus1280_avx2_module_exit(void) { - crypto_unregister_aeads(crypto_morus1280_avx2_algs, - ARRAY_SIZE(crypto_morus1280_avx2_algs)); + simd_unregister_aeads(&crypto_morus1280_avx2_alg, 1, &simd_alg); } module_init(crypto_morus1280_avx2_module_init); diff --git a/arch/x86/crypto/morus1280-sse2-glue.c b/arch/x86/crypto/morus1280-sse2-glue.c index f40244eaf14d..c35c0638d0bb 100644 --- a/arch/x86/crypto/morus1280-sse2-glue.c +++ b/arch/x86/crypto/morus1280-sse2-glue.c @@ -12,6 +12,7 @@ */ #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> #include <crypto/morus1280_glue.h> #include <linux/module.h> #include <asm/fpu/api.h> @@ -35,7 +36,9 @@ asmlinkage void crypto_morus1280_sse2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus1280_sse2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS1280_DECLARE_ALGS(sse2, "morus1280-sse2", 350); +MORUS1280_DECLARE_ALG(sse2, "morus1280-sse2", 350); + +static struct simd_aead_alg *simd_alg; static int __init 
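
The ghash_async_*() hunks above keep cryptd as a backstop rather than
removing it: a request runs inline when SIMD is usable, and is pushed
through the cryptd queue when it is not, or when the queue is already
non-empty in atomic context, so that request ordering is preserved. In
outline (queue_to_cryptd() and run_inline() are illustrative
stand-ins):

	static int my_async_update(struct ahash_request *req,
				   struct cryptd_ahash *cryptd_tfm)
	{
		if (!crypto_simd_usable() ||
		    (in_atomic() && cryptd_ahash_queued(cryptd_tfm)))
			return queue_to_cryptd(req); /* defer, keep order */
		return run_inline(req);		     /* FPU usable */
	}
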
crypto_morus1280_sse2_module_init(void) { @@ -43,14 +46,13 @@ static int __init crypto_morus1280_sse2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus1280_sse2_algs, - ARRAY_SIZE(crypto_morus1280_sse2_algs)); + return simd_register_aeads_compat(&crypto_morus1280_sse2_alg, 1, + &simd_alg); } static void __exit crypto_morus1280_sse2_module_exit(void) { - crypto_unregister_aeads(crypto_morus1280_sse2_algs, - ARRAY_SIZE(crypto_morus1280_sse2_algs)); + simd_unregister_aeads(&crypto_morus1280_sse2_alg, 1, &simd_alg); } module_init(crypto_morus1280_sse2_module_init); diff --git a/arch/x86/crypto/morus1280_glue.c b/arch/x86/crypto/morus1280_glue.c index 7e600f8bcdad..30fc1bd98ec3 100644 --- a/arch/x86/crypto/morus1280_glue.c +++ b/arch/x86/crypto/morus1280_glue.c @@ -11,7 +11,6 @@ * any later version. */ -#include <crypto/cryptd.h> #include <crypto/internal/aead.h> #include <crypto/internal/skcipher.h> #include <crypto/morus1280_glue.h> @@ -205,90 +204,6 @@ void crypto_morus1280_glue_init_ops(struct crypto_aead *aead, } EXPORT_SYMBOL_GPL(crypto_morus1280_glue_init_ops); -int cryptd_morus1280_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setkey); - -int cryptd_morus1280_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_setauthsize); - -int cryptd_morus1280_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_encrypt); - -int cryptd_morus1280_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_decrypt); - -int cryptd_morus1280_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_init_tfm); - -void cryptd_morus1280_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = 
crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus1280_glue_exit_tfm); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); MODULE_DESCRIPTION("MORUS-1280 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/morus640-sse2-glue.c b/arch/x86/crypto/morus640-sse2-glue.c index 9afaf8f8565a..32da56b3bdad 100644 --- a/arch/x86/crypto/morus640-sse2-glue.c +++ b/arch/x86/crypto/morus640-sse2-glue.c @@ -12,6 +12,7 @@ */ #include <crypto/internal/aead.h> +#include <crypto/internal/simd.h> #include <crypto/morus640_glue.h> #include <linux/module.h> #include <asm/fpu/api.h> @@ -35,7 +36,9 @@ asmlinkage void crypto_morus640_sse2_dec_tail(void *state, const void *src, asmlinkage void crypto_morus640_sse2_final(void *state, void *tag_xor, u64 assoclen, u64 cryptlen); -MORUS640_DECLARE_ALGS(sse2, "morus640-sse2", 400); +MORUS640_DECLARE_ALG(sse2, "morus640-sse2", 400); + +static struct simd_aead_alg *simd_alg; static int __init crypto_morus640_sse2_module_init(void) { @@ -43,14 +46,13 @@ static int __init crypto_morus640_sse2_module_init(void) !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) return -ENODEV; - return crypto_register_aeads(crypto_morus640_sse2_algs, - ARRAY_SIZE(crypto_morus640_sse2_algs)); + return simd_register_aeads_compat(&crypto_morus640_sse2_alg, 1, + &simd_alg); } static void __exit crypto_morus640_sse2_module_exit(void) { - crypto_unregister_aeads(crypto_morus640_sse2_algs, - ARRAY_SIZE(crypto_morus640_sse2_algs)); + simd_unregister_aeads(&crypto_morus640_sse2_alg, 1, &simd_alg); } module_init(crypto_morus640_sse2_module_init); diff --git a/arch/x86/crypto/morus640_glue.c b/arch/x86/crypto/morus640_glue.c index cb3a81732016..1dea33d84426 100644 --- a/arch/x86/crypto/morus640_glue.c +++ b/arch/x86/crypto/morus640_glue.c @@ -11,7 +11,6 @@ * any later version. 
*/ -#include <crypto/cryptd.h> #include <crypto/internal/aead.h> #include <crypto/internal/skcipher.h> #include <crypto/morus640_glue.h> @@ -200,90 +199,6 @@ void crypto_morus640_glue_init_ops(struct crypto_aead *aead, } EXPORT_SYMBOL_GPL(crypto_morus640_glue_init_ops); -int cryptd_morus640_glue_setkey(struct crypto_aead *aead, const u8 *key, - unsigned int keylen) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setkey(&cryptd_tfm->base, key, keylen); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setkey); - -int cryptd_morus640_glue_setauthsize(struct crypto_aead *aead, - unsigned int authsize) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - return crypto_aead_setauthsize(&cryptd_tfm->base, authsize); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_setauthsize); - -int cryptd_morus640_glue_encrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_encrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_encrypt); - -int cryptd_morus640_glue_decrypt(struct aead_request *req) -{ - struct crypto_aead *aead = crypto_aead_reqtfm(req); - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - struct cryptd_aead *cryptd_tfm = *ctx; - - aead = &cryptd_tfm->base; - if (irq_fpu_usable() && (!in_atomic() || - !cryptd_aead_queued(cryptd_tfm))) - aead = cryptd_aead_child(cryptd_tfm); - - aead_request_set_tfm(req, aead); - - return crypto_aead_decrypt(req); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_decrypt); - -int cryptd_morus640_glue_init_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead *cryptd_tfm; - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - const char *name = crypto_aead_alg(aead)->base.cra_driver_name; - char internal_name[CRYPTO_MAX_ALG_NAME]; - - if (snprintf(internal_name, CRYPTO_MAX_ALG_NAME, "__%s", name) - >= CRYPTO_MAX_ALG_NAME) - return -ENAMETOOLONG; - - cryptd_tfm = cryptd_alloc_aead(internal_name, CRYPTO_ALG_INTERNAL, - CRYPTO_ALG_INTERNAL); - if (IS_ERR(cryptd_tfm)) - return PTR_ERR(cryptd_tfm); - - *ctx = cryptd_tfm; - crypto_aead_set_reqsize(aead, crypto_aead_reqsize(&cryptd_tfm->base)); - return 0; -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_init_tfm); - -void cryptd_morus640_glue_exit_tfm(struct crypto_aead *aead) -{ - struct cryptd_aead **ctx = crypto_aead_ctx(aead); - - cryptd_free_aead(*ctx); -} -EXPORT_SYMBOL_GPL(cryptd_morus640_glue_exit_tfm); - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Ondrej Mosnacek <omosnacek@gmail.com>"); MODULE_DESCRIPTION("MORUS-640 AEAD mode -- glue for x86 optimizations"); diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c index 20d815ea4b6a..f7567cbd35b6 100644 --- a/arch/x86/crypto/nhpoly1305-avx2-glue.c +++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c @@ -7,9 +7,10 @@ */ #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> -#include <asm/fpu/api.h> +#include <asm/simd.h> asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -24,7 +25,7 @@ static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len, static int 
nhpoly1305_avx2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !irq_fpu_usable()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c index ed68d164ce14..a661ede3b5cf 100644 --- a/arch/x86/crypto/nhpoly1305-sse2-glue.c +++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c @@ -7,9 +7,10 @@ */ #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/nhpoly1305.h> #include <linux/module.h> -#include <asm/fpu/api.h> +#include <asm/simd.h> asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len, u8 hash[NH_HASH_BYTES]); @@ -24,7 +25,7 @@ static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len, static int nhpoly1305_sse2_update(struct shash_desc *desc, const u8 *src, unsigned int srclen) { - if (srclen < 64 || !irq_fpu_usable()) + if (srclen < 64 || !crypto_simd_usable()) return crypto_nhpoly1305_update(desc, src, srclen); do { diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index 3b6e70d085da..8457cdd47f75 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2) vpaddq t2,t1,t1 vmovq t1x,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index e6add74d78a5..6f0be7a86964 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx @@ -524,6 +524,12 @@ ENTRY(poly1305_2block_sse2) paddq t2,t1 movq t1,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. 
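
The partial reduction annotated above, written out as portable C over
64-bit accumulators (a sketch of the arithmetic, not kernel code; the
accompanying lea/add changes widen those instructions to 64 bits
precisely because carries such as '(d4 >> 26) * 5' can exceed 32 bits
in the multi-block code):

	static void poly1305_partial_reduce(u64 d0, u64 d1, u64 d2,
					    u64 d3, u64 d4, u32 h[5])
	{
		u64 c;

		c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;
		c = d1 >> 26; d1 &= 0x3ffffff; d2 += c;
		c = d2 >> 26; d2 &= 0x3ffffff; d3 += c;
		c = d3 >> 26; d3 &= 0x3ffffff; d4 += c;
		c = d4 >> 26; d4 &= 0x3ffffff; d0 += c * 5; /* 2^130 == 5 mod p */
		c = d0 >> 26; d0 &= 0x3ffffff; d1 += c;

		/* h0,h2,h3,h4 < 2^26; h1 < 2^26 plus a small carry */
		h[0] = d0; h[1] = d1; h[2] = d2;
		h[3] = d3; h[4] = d4;
	}
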
+ # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -562,16 +568,16 @@ ENTRY(poly1305_2block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 88cc01506c84..6eb65b237b3c 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -11,11 +11,11 @@ #include <crypto/algapi.h> #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <crypto/poly1305.h> #include <linux/crypto.h> #include <linux/kernel.h> #include <linux/module.h> -#include <asm/fpu/api.h> #include <asm/simd.h> struct poly1305_simd_desc_ctx { @@ -126,7 +126,7 @@ static int poly1305_simd_update(struct shash_desc *desc, unsigned int bytes; /* kernel_fpu_begin/end is costly, use fallback for small updates */ - if (srclen <= 288 || !may_use_simd()) + if (srclen <= 288 || !crypto_simd_usable()) return crypto_poly1305_update(desc, src, srclen); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 7391c7de72c7..42f177afc33a 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -22,6 +22,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> @@ -29,7 +30,7 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha1_base.h> -#include <asm/fpu/api.h> +#include <asm/simd.h> typedef void (sha1_transform_fn)(u32 *digest, const char *data, unsigned int rounds); @@ -39,7 +40,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, { struct sha1_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE) return crypto_sha1_update(desc, data, len); @@ -57,7 +58,7 @@ static int sha1_update(struct shash_desc *desc, const u8 *data, static int sha1_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha1_transform_fn *sha1_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha1_finup(desc, data, len, out); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index 773a873d2b28..73867da3cbee 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -30,6 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> @@ -37,8 +38,8 @@ #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha256_base.h> -#include <asm/fpu/api.h> #include <linux/string.h> +#include <asm/simd.h> asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data, u64 rounds); @@ -49,7 +50,7 @@ static int sha256_update(struct shash_desc *desc, const u8 *data, { struct sha256_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE) return crypto_sha256_update(desc, data, len); @@ -67,7 +68,7 @@ static int sha256_update(struct shash_desc *desc, const u8 
*data, static int sha256_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha256_transform_fn *sha256_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha256_finup(desc, data, len, out); kernel_fpu_begin(); diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index f1b811b60ba6..458356a3f124 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -28,16 +28,16 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <crypto/internal/hash.h> +#include <crypto/internal/simd.h> #include <linux/init.h> #include <linux/module.h> #include <linux/mm.h> #include <linux/cryptohash.h> +#include <linux/string.h> #include <linux/types.h> #include <crypto/sha.h> #include <crypto/sha512_base.h> -#include <asm/fpu/api.h> - -#include <linux/string.h> +#include <asm/simd.h> asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data, u64 rounds); @@ -49,7 +49,7 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, { struct sha512_state *sctx = shash_desc_ctx(desc); - if (!irq_fpu_usable() || + if (!crypto_simd_usable() || (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE) return crypto_sha512_update(desc, data, len); @@ -67,7 +67,7 @@ static int sha512_update(struct shash_desc *desc, const u8 *data, static int sha512_finup(struct shash_desc *desc, const u8 *data, unsigned int len, u8 *out, sha512_transform_fn *sha512_xform) { - if (!irq_fpu_usable()) + if (!crypto_simd_usable()) return crypto_sha512_finup(desc, data, len, out); kernel_fpu_begin(); diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 7bc105f47d21..51beb8d29123 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -25,12 +25,13 @@ #include <linux/uprobes.h> #include <linux/livepatch.h> #include <linux/syscalls.h> +#include <linux/uaccess.h> #include <asm/desc.h> #include <asm/traps.h> #include <asm/vdso.h> -#include <linux/uaccess.h> #include <asm/cpufeature.h> +#include <asm/fpu/api.h> #define CREATE_TRACE_POINTS #include <trace/events/syscalls.h> @@ -196,6 +197,13 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) exit_to_usermode_loop(regs, cached_flags); + /* Reload ti->flags; we may have rescheduled above. */ + cached_flags = READ_ONCE(ti->flags); + + fpregs_assert_state_consistent(); + if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + #ifdef CONFIG_COMPAT /* * Compat syscalls set TS_COMPAT. Make sure we clear it before diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index d309f30cf7af..7b23431be5cb 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -650,6 +650,7 @@ ENTRY(__switch_to_asm) pushl %ebx pushl %edi pushl %esi + pushfl /* switch stack */ movl %esp, TASK_threadsp(%eax) @@ -672,6 +673,7 @@ ENTRY(__switch_to_asm) #endif /* restore callee-saved registers */ + popfl popl %esi popl %edi popl %ebx @@ -766,13 +768,12 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) -.Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) jnz restore_all_kernel testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? 
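
The prepare_exit_to_usermode() hunk above is part of the new deferred
FPU restore scheme: kernel code may leave user FPU state in memory and
set TIF_NEED_FPU_LOAD instead of keeping the registers live, and the
state is then reloaded once on the way back to user space. The flow
inside that function, in outline (this mirrors the added lines, shown
only to make the ordering explicit):

	/* ti->flags must be re-read: the loop above may have slept */
	cached_flags = READ_ONCE(ti->flags);
	fpregs_assert_state_consistent();
	if (unlikely(cached_flags & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();	/* load this task's FPU state */
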
jz restore_all_kernel call preempt_schedule_irq - jmp .Lneed_resched + jmp restore_all_kernel END(resume_kernel) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1f0efdb7b629..20e45d9b4e15 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -298,7 +298,7 @@ ENTRY(__switch_to_asm) #ifdef CONFIG_STACKPROTECTOR movq TASK_stack_canary(%rsi), %rbx - movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq %rbx, PER_CPU_VAR(fixed_percpu_data) + stack_canary_offset #endif #ifdef CONFIG_RETPOLINE @@ -430,8 +430,8 @@ END(irq_entries_start) * it before we actually move ourselves to the IRQ stack. */ - movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) - movq PER_CPU_VAR(irq_stack_ptr), %rsp + movq \old_rsp, PER_CPU_VAR(irq_stack_backing_store + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(hardirq_stack_ptr), %rsp #ifdef CONFIG_DEBUG_ENTRY /* @@ -645,10 +645,9 @@ retint_kernel: /* Check if we need preemption */ btl $9, EFLAGS(%rsp) /* were interrupts off? */ jnc 1f -0: cmpl $0, PER_CPU_VAR(__preempt_count) + cmpl $0, PER_CPU_VAR(__preempt_count) jnz 1f call preempt_schedule_irq - jmp 0b 1: #endif /* @@ -841,7 +840,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) /** * idtentry - Generate an IDT entry stub @@ -879,7 +878,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will never switch stacks. This is for * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS. */ -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 @@ -925,13 +924,13 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + subq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) + addq $\ist_offset, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ @@ -1129,7 +1128,7 @@ apicinterrupt3 HYPERV_STIMER0_VECTOR \ hv_stimer0_callback_vector hv_stimer0_vector_handler #endif /* CONFIG_HYPERV */ -idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK +idtentry debug do_debug has_error_code=0 paranoid=1 shift_ist=IST_INDEX_DB ist_offset=DB_STACK_OFFSET idtentry int3 do_int3 has_error_code=0 idtentry stack_segment do_stack_segment has_error_code=1 diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 5bfe2243a08f..42fe42e82baf 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -116,7 +116,7 @@ $(obj)/%-x32.o: $(obj)/%.o FORCE targets += vdsox32.lds $(vobjx32s-y) $(obj)/%.so: OBJCOPYFLAGS := -S -$(obj)/%.so: $(obj)/%.so.dbg +$(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) $(obj)/vdsox32.so.dbg: $(obj)/vdsox32.lds $(vobjx32s) FORCE diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 007b3fe9d727..98c7d12b945c 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -29,12 +29,12 @@ extern int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz); extern time_t __vdso_time(time_t *t); #ifdef CONFIG_PARAVIRT_CLOCK 
-extern u8 pvclock_page +extern u8 pvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif #ifdef CONFIG_HYPERV_TSCPAGE -extern u8 hvclock_page +extern u8 hvclock_page[PAGE_SIZE] __attribute__((visibility("hidden"))); #endif diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index fa847a620f40..a20b134de2a8 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -7,7 +7,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, void *stripped_addr, size_t stripped_len, - FILE *outfile, const char *name) + FILE *outfile, const char *image_name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ @@ -93,11 +93,12 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, int k; ELF(Sym) *sym = raw_addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; - const char *name = raw_addr + GET_LE(&strtab_hdr->sh_offset) + - GET_LE(&sym->st_name); + const char *sym_name = raw_addr + + GET_LE(&strtab_hdr->sh_offset) + + GET_LE(&sym->st_name); for (k = 0; k < NSYMS; k++) { - if (!strcmp(name, required_syms[k].name)) { + if (!strcmp(sym_name, required_syms[k].name)) { if (syms[k]) { fail("duplicate symbol %s\n", required_syms[k].name); @@ -134,7 +135,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, if (syms[sym_vvar_start] % 4096) fail("vvar_begin must be a multiple of 4096\n"); - if (!name) { + if (!image_name) { fwrite(stripped_addr, stripped_len, 1, outfile); return; } @@ -157,7 +158,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, } fprintf(outfile, "\n};\n\n"); - fprintf(outfile, "const struct vdso_image %s = {\n", name); + fprintf(outfile, "const struct vdso_image %s = {\n", image_name); fprintf(outfile, "\t.data = raw_data,\n"); fprintf(outfile, "\t.size = %lu,\n", mapping_size); if (alt_sec) { diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 7d2d7c801dba..f15441b07dad 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -3,10 +3,14 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/delay.h> #include <asm/apicdef.h> +#include <asm/nmi.h> #include "../perf_event.h" +static DEFINE_PER_CPU(unsigned int, perf_nmi_counter); + static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] @@ -112,23 +116,144 @@ static __initconst const u64 amd_hw_cache_event_ids }, }; +static __initconst const u64 amd_hw_cache_event_ids_f17h + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { +[C(L1D)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0040, /* Data Cache Accesses */ + [C(RESULT_MISS)] = 0xc860, /* L2$ access from DC Miss */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0xff5a, /* h/w prefetch DC Fills */ + [C(RESULT_MISS)] = 0, + }, +}, +[C(L1I)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0080, /* Instruction cache fetches */ + [C(RESULT_MISS)] = 0x0081, /* Instruction cache misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(DTLB)] = { + 
[C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0xff45, /* All L2 DTLB accesses */ + [C(RESULT_MISS)] = 0xf045, /* L2 DTLB misses (PT walks) */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, +}, +[C(ITLB)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x0084, /* L1 ITLB misses, L2 ITLB hits */ + [C(RESULT_MISS)] = 0xff85, /* L1 ITLB misses, L2 misses */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(BPU)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0x00c2, /* Retired Branch Instr. */ + [C(RESULT_MISS)] = 0x00c3, /* Retired Mispredicted BI */ + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +[C(NODE)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = 0, + [C(RESULT_MISS)] = 0, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = -1, + [C(RESULT_MISS)] = -1, + }, +}, +}; + /* - * AMD Performance Monitor K7 and later. + * AMD Performance Monitor K7 and later, up to and including Family 16h: */ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, - [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +/* + * AMD Performance Monitor Family 17h and later: + */ +static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; static u64 amd_pmu_event_map(int hw_event) { + if (boot_cpu_data.x86 >= 0x17) + return amd_f17h_perfmon_event_map[hw_event]; + return amd_perfmon_event_map[hw_event]; } @@ -429,6 +554,132 @@ static void amd_pmu_cpu_dead(int cpu) } } +/* + * When a PMC counter overflows, an NMI is used to process the event and + * reset the counter. NMI latency can result in the counter being updated + * before the NMI can run, which can result in what appear to be spurious + * NMIs. This function is intended to wait for the NMI to run and reset + * the counter to avoid possible unhandled NMI messages. + */ +#define OVERFLOW_WAIT_COUNT 50 + +static void amd_pmu_wait_on_overflow(int idx) +{ + unsigned int i; + u64 counter; + + /* + * Wait for the counter to be reset if it has overflowed. This loop + * should exit very, very quickly, but just in case, don't wait + * forever... 
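As a rough illustration (not part of the patch) of why the top-bit test below works: perf arms a counter by writing -period truncated to the counter width, so the top bit is set while the counter is armed and is clear in the window between an overflow and the NMI handler reloading it:

	/* hypothetical numbers, e.g. a 48-bit counter */
	u64 width = 48, period = 100000;
	u64 armed = (0 - period) & ((1ULL << width) - 1);	/* bit 47 set */
	/* After overflow the value wraps to a small positive count with
	 * bit 47 clear, until the NMI handler re-arms the counter;
	 * amd_pmu_wait_on_overflow() polls for bit (width - 1) below. */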
+ */ + for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) { + rdmsrl(x86_pmu_event_addr(idx), counter); + if (counter & (1ULL << (x86_pmu.cntval_bits - 1))) + break; + + /* Might be in IRQ context, so can't sleep */ + udelay(1); + } +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int idx; + + x86_pmu_disable_all(); + + /* + * This shouldn't be called from NMI context, but add a safeguard here + * to return, since if we're in NMI context we can't wait for an NMI + * to reset an overflowed counter value. + */ + if (in_nmi()) + return; + + /* + * Check each counter for overflow and wait for it to be reset by the + * NMI if it has overflowed. This relies on the fact that all active + * counters are always enabled when this function is called and + * ARCH_PERFMON_EVENTSEL_INT is always set. + */ + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + amd_pmu_wait_on_overflow(idx); + } +} + +static void amd_pmu_disable_event(struct perf_event *event) +{ + x86_pmu_disable_event(event); + + /* + * This can be called from NMI context (via x86_pmu_stop). The counter + * may have overflowed, but either way, we'll never see it get reset + * by the NMI if we're already in the NMI. And the NMI latency support + * below will take care of any pending NMI that might have been + * generated by the overflow. + */ + if (in_nmi()) + return; + + amd_pmu_wait_on_overflow(event->hw.idx); +} + +/* + * Because of NMI latency, if multiple PMC counters are active or other sources + * of NMIs are received, the perf NMI handler can handle one or more overflowed + * PMC counters outside of the NMI associated with the PMC overflow. If the NMI + * doesn't arrive at the LAPIC in time to become a pending NMI, then the kernel + * back-to-back NMI support won't be active. This PMC handler needs to take into + * account that this can occur, otherwise this could result in unknown NMI + * messages being issued. Examples of this are PMC overflow while in the NMI + * handler when multiple PMCs are active or PMC overflow while handling some + * other source of an NMI. + * + * Attempt to mitigate this by using the number of active PMCs to determine + * whether to return NMI_HANDLED if the perf NMI handler did not handle/reset + * any PMCs. The per-CPU perf_nmi_counter variable is set to the minimum of the + * number of active PMCs and 2. The value of 2 is used in case an NMI does not + * arrive at the LAPIC in time to be collapsed into an already pending NMI. + */ +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int active, handled; + + /* + * Obtain the active count before calling x86_pmu_handle_irq() since + * it is possible that x86_pmu_handle_irq() may make a counter + * inactive (through x86_pmu_stop). + */ + active = __bitmap_weight(cpuc->active_mask, X86_PMC_IDX_MAX); + + /* Process any counter overflows */ + handled = x86_pmu_handle_irq(regs); + + /* + * If a counter was handled, record the number of possible remaining + * NMIs that can occur.
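A worked example of this accounting (not from the patch): with two active PMCs overflowing back-to-back, the first NMI may find and reset both, so x86_pmu_handle_irq() returns 2 and perf_nmi_counter is set to min(2, active) = 2. When the latent NMI for the second overflow arrives with nothing left to handle, the handler consumes one perf_nmi_counter credit and returns NMI_HANDLED rather than NMI_DONE, avoiding an "unknown NMI" report.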
+ */ + if (handled) { + this_cpu_write(perf_nmi_counter, + min_t(unsigned int, 2, active)); + + return handled; + } + + if (!this_cpu_read(perf_nmi_counter)) + return NMI_DONE; + + this_cpu_dec(perf_nmi_counter); + + return NMI_HANDLED; +} + static struct event_constraint * amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -621,11 +872,11 @@ static ssize_t amd_event_sysfs_show(char *page, u64 config) static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", - .handle_irq = x86_pmu_handle_irq, - .disable_all = x86_pmu_disable_all, + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, .enable_all = x86_pmu_enable_all, .enable = x86_pmu_enable_event, - .disable = x86_pmu_disable_event, + .disable = amd_pmu_disable_event, .hw_config = amd_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_K7_EVNTSEL0, @@ -718,9 +969,10 @@ __init int amd_pmu_init(void) x86_pmu.amd_nb_constraints = 0; } - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); + if (boot_cpu_data.x86 >= 0x17) + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids_f17h, sizeof(hw_cache_event_ids)); + else + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, sizeof(hw_cache_event_ids)); return 0; } @@ -732,7 +984,7 @@ void amd_pmu_enable_virt(void) cpuc->perf_ctr_virt_mask = 0; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_enable_virt); @@ -750,7 +1002,7 @@ void amd_pmu_disable_virt(void) cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY; /* Reload all events */ - x86_pmu_disable_all(); + amd_pmu_disable_all(); x86_pmu_enable_all(0); } EXPORT_SYMBOL_GPL(amd_pmu_disable_virt); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index e2b1447192a8..f315425d8468 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -560,6 +560,21 @@ int x86_pmu_hw_config(struct perf_event *event) return -EINVAL; } + /* sample_regs_user never supports XMM registers */ + if (unlikely(event->attr.sample_regs_user & PEBS_XMM_REGS)) + return -EINVAL; + /* + * Besides the general-purpose registers, XMM registers may + * be collected in PEBS on some platforms, e.g. Icelake + */ + if (unlikely(event->attr.sample_regs_intr & PEBS_XMM_REGS)) { + if (x86_pmu.pebs_no_xmm_regs) + return -EINVAL; + + if (!event->attr.precise_ip) + return -EINVAL; + } + return x86_setup_perfctr(event); } @@ -661,6 +676,10 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } +struct pmu *x86_get_pmu(void) +{ + return &pmu; +} /* * Event scheduler state: * @@ -849,18 +868,43 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) struct event_constraint *c; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; struct perf_event *e; - int i, wmin, wmax, unsched = 0; + int n0, i, wmin, wmax, unsched = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); + /* + * Compute the number of events already present; see x86_pmu_add(), + * validate_group() and x86_pmu_commit_txn(). For the former two + * cpuc->n_events hasn't been updated yet, while for the latter + * cpuc->n_txn contains the number of events added in the current + * transaction.
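A worked example (not from the patch): committing a transaction that added one event on top of three already-scheduled ones gives cpuc->n_events = 4 and cpuc->n_txn = 1, hence n0 = 3; slots 0-2 reuse their cached constraints in the loop below, while slot 3 takes the fresh get_event_constraints() path.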
+ */ + n0 = cpuc->n_events; + if (cpuc->txn_flags & PERF_PMU_TXN_ADD) + n0 -= cpuc->n_txn; + if (x86_pmu.start_scheduling) x86_pmu.start_scheduling(cpuc); for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { - cpuc->event_constraint[i] = NULL; - c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); - cpuc->event_constraint[i] = c; + c = cpuc->event_constraint[i]; + + /* + * Previously scheduled events should have a cached constraint, + * while new events should not have one. + */ + WARN_ON_ONCE((c && i >= n0) || (!c && i < n0)); + + /* + * Request constraints for new events; or for those events that + * have a dynamic constraint -- for those the constraint can + * change due to external factors (sibling state, allow_tfa). + */ + if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { + c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); + cpuc->event_constraint[i] = c; + } wmin = min(wmin, c->weight); wmax = max(wmax, c->weight); @@ -925,25 +969,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (!unsched && assign) { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; - e->hw.flags |= PERF_X86_EVENT_COMMITTED; if (x86_pmu.commit_scheduling) x86_pmu.commit_scheduling(cpuc, i, assign[i]); } } else { - for (i = 0; i < n; i++) { + for (i = n0; i < n; i++) { e = cpuc->event_list[i]; - /* - * do not put_constraint() on comitted events, - * because they are good to go - */ - if ((e->hw.flags & PERF_X86_EVENT_COMMITTED)) - continue; /* * release events that failed scheduling */ if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(cpuc, e); + + cpuc->event_constraint[i] = NULL; } } @@ -1349,8 +1388,9 @@ void x86_pmu_stop(struct perf_event *event, int flags) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) { + if (test_bit(hwc->idx, cpuc->active_mask)) { x86_pmu.disable(event); + __clear_bit(hwc->idx, cpuc->active_mask); cpuc->events[hwc->idx] = NULL; WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); hwc->state |= PERF_HES_STOPPED; @@ -1372,11 +1412,6 @@ static void x86_pmu_del(struct perf_event *event, int flags) int i; /* - * event is descheduled - */ - event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; - - /* * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. @@ -1412,6 +1447,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) cpuc->event_list[i-1] = cpuc->event_list[i]; cpuc->event_constraint[i-1] = cpuc->event_constraint[i]; } + cpuc->event_constraint[i-1] = NULL; --cpuc->n_events; perf_event_update_userpage(event); @@ -1447,16 +1483,8 @@ int x86_pmu_handle_irq(struct pt_regs *regs) apic_write(APIC_LVTPC, APIC_DM_NMI); for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) { - /* - * Though we deactivated the counter some cpus - * might still deliver spurious interrupts still - * in flight. 
Catch them: - */ - if (__test_and_clear_bit(idx, cpuc->running)) - handled++; + if (!test_bit(idx, cpuc->active_mask)) continue; - } event = cpuc->events[idx]; @@ -2031,7 +2059,7 @@ static int validate_event(struct perf_event *event) if (IS_ERR(fake_cpuc)) return PTR_ERR(fake_cpuc); - c = x86_pmu.get_event_constraints(fake_cpuc, -1, event); + c = x86_pmu.get_event_constraints(fake_cpuc, 0, event); if (!c || !c->weight) ret = -EINVAL; @@ -2079,8 +2107,7 @@ static int validate_group(struct perf_event *event) if (n < 0) goto out; - fake_cpuc->n_events = n; - + fake_cpuc->n_events = 0; ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out: @@ -2355,6 +2382,15 @@ void arch_perf_update_userpage(struct perf_event *event, cyc2ns_read_end(); } +/* + * Determine whether the regs were taken from an irq/exception handler rather + * than from perf_arch_fetch_caller_regs(). + */ +static bool perf_hw_regs(struct pt_regs *regs) +{ + return regs->flags & X86_EFLAGS_FIXED; +} + void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { @@ -2366,11 +2402,15 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - if (perf_callchain_store(entry, regs->ip)) - return; + if (perf_hw_regs(regs)) { + if (perf_callchain_store(entry, regs->ip)) + return; + unwind_start(&state, current, regs, NULL); + } else { + unwind_start(&state, current, NULL, (void *)regs->sp); + } - for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); - unwind_next_frame(&state)) { + for (; !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || perf_callchain_store(entry, addr)) return; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 8baa441d8000..ef763f535e3a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -239,6 +239,35 @@ static struct extra_reg intel_skl_extra_regs[] __read_mostly = { EVENT_EXTRA_END }; +static struct event_constraint intel_icl_event_constraints[] = { + FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ + INTEL_UEVENT_CONSTRAINT(0x1c0, 0), /* INST_RETIRED.PREC_DIST */ + FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0400, 3), /* SLOTS */ + INTEL_EVENT_CONSTRAINT_RANGE(0x03, 0x0a, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x1f, 0x28, 0xf), + INTEL_EVENT_CONSTRAINT(0x32, 0xf), /* SW_PREFETCH_ACCESS.* */ + INTEL_EVENT_CONSTRAINT_RANGE(0x48, 0x54, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0x60, 0x8b, 0xf), + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xff), /* CYCLE_ACTIVITY.STALLS_TOTAL */ + INTEL_UEVENT_CONSTRAINT(0x10a3, 0xff), /* CYCLE_ACTIVITY.STALLS_MEM_ANY */ + INTEL_EVENT_CONSTRAINT(0xa3, 0xf), /* CYCLE_ACTIVITY.* */ + INTEL_EVENT_CONSTRAINT_RANGE(0xa8, 0xb0, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xb7, 0xbd, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xd0, 0xe6, 0xf), + INTEL_EVENT_CONSTRAINT_RANGE(0xf0, 0xf4, 0xf), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg intel_icl_extra_regs[] __read_mostly = { + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff9fffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), + INTEL_UEVENT_EXTRA_REG(0x01c6, MSR_PEBS_FRONTEND, 0x7fff17, FE), + EVENT_EXTRA_END +}; + EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); 
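As background for the constraint tables above, an abridged sketch of the structure they fill in (see arch/x86/events/perf_event.h; remaining fields omitted):

	struct event_constraint {
		union {
			unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
			u64		idxmsk64;	/* allowed counters, one bit per PMC */
		};
		u64	code;	/* event code to match               */
		u64	cmask;	/* config bits compared against code */
		int	weight;	/* usable-counter count              */
		/* ... plus overlap, flags, and the size field that lets the
		 * new constraint_match()/INTEL_EVENT_CONSTRAINT_RANGE() pair
		 * accept a contiguous range of event codes. */
	};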
EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); @@ -1827,6 +1856,45 @@ static __initconst const u64 glp_hw_cache_extra_regs }, }; +#define TNT_LOCAL_DRAM BIT_ULL(26) +#define TNT_DEMAND_READ GLM_DEMAND_DATA_RD +#define TNT_DEMAND_WRITE GLM_DEMAND_RFO +#define TNT_LLC_ACCESS GLM_ANY_RESPONSE +#define TNT_SNP_ANY (SNB_SNP_NOT_NEEDED|SNB_SNP_MISS| \ + SNB_NO_FWD|SNB_SNP_FWD|SNB_HITM) +#define TNT_LLC_MISS (TNT_SNP_ANY|SNB_NON_DRAM|TNT_LOCAL_DRAM) + +static __initconst const u64 tnt_hw_cache_extra_regs + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = { + [C(LL)] = { + [C(OP_READ)] = { + [C(RESULT_ACCESS)] = TNT_DEMAND_READ| + TNT_LLC_ACCESS, + [C(RESULT_MISS)] = TNT_DEMAND_READ| + TNT_LLC_MISS, + }, + [C(OP_WRITE)] = { + [C(RESULT_ACCESS)] = TNT_DEMAND_WRITE| + TNT_LLC_ACCESS, + [C(RESULT_MISS)] = TNT_DEMAND_WRITE| + TNT_LLC_MISS, + }, + [C(OP_PREFETCH)] = { + [C(RESULT_ACCESS)] = 0x0, + [C(RESULT_MISS)] = 0x0, + }, + }, +}; + +static struct extra_reg intel_tnt_extra_regs[] __read_mostly = { + /* must define OFFCORE_RSP_X first, see intel_fixup_er() */ + INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0xffffff9fffull, RSP_0), + INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0xffffff9fffull, RSP_1), + EVENT_EXTRA_END +}; + #define KNL_OT_L2_HITE BIT_ULL(19) /* Other Tile L2 Hit */ #define KNL_OT_L2_HITF BIT_ULL(20) /* Other Tile L2 Hit */ #define KNL_MCDRAM_LOCAL BIT_ULL(21) @@ -2015,7 +2083,7 @@ static void intel_tfa_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int /* * We're going to use PMC3, make sure TFA is set before we touch it. */ - if (cntr == 3 && !cpuc->is_fake) + if (cntr == 3) intel_set_tfa(cpuc, true); } @@ -2091,15 +2159,19 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); - if (unlikely(event->attr.precise_ip)) - intel_pmu_pebs_disable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; } x86_pmu_disable_event(event); + + /* + * Needs to be called after x86_pmu_disable_event, + * so we don't trigger the event without PEBS bit set. 
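Spelled out (not from the patch), the window the reordering closes:

	/* the removed order: */
	intel_pmu_pebs_disable(event);	/* PEBS cleared, counter still on... */
	x86_pmu_disable_event(event);	/* ...so it can briefly count, and
					 * raise a PMI, as a plain non-PEBS
					 * event */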
+ */ + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); } static void intel_pmu_del_event(struct perf_event *event) @@ -2145,6 +2217,11 @@ static void intel_pmu_enable_fixed(struct perf_event *event) bits <<= (idx * 4); mask = 0xfULL << (idx * 4); + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip) { + bits |= ICL_FIXED_0_ADAPTIVE << (idx * 4); + mask |= ICL_FIXED_0_ADAPTIVE << (idx * 4); + } + rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; ctrl_val |= bits; @@ -2688,7 +2765,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } @@ -2838,7 +2915,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; - int is_excl, i; + int is_excl, i, w; /* * validating a group does not require @@ -2894,36 +2971,40 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * SHARED : sibling counter measuring non-exclusive event * UNUSED : sibling counter unused */ + w = c->weight; for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { /* * exclusive event in sibling counter * our corresponding counter cannot be used * regardless of our event */ - if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) { __clear_bit(i, c->idxmsk); + w--; + continue; + } /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used */ - if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) { __clear_bit(i, c->idxmsk); + w--; + continue; + } } /* - * recompute actual bit weight for scheduling algorithm - */ - c->weight = hweight64(c->idxmsk64); - - /* * if we return an empty mask, then switch * back to static empty constraint to avoid * the cost of freeing later on */ - if (c->weight == 0) + if (!w) c = &emptyconstraint; + c->weight = w; + return c; } @@ -2931,11 +3012,9 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = NULL; - struct event_constraint *c2; + struct event_constraint *c1, *c2; - if (idx >= 0) /* fake does < 0 */ - c1 = cpuc->event_constraint[idx]; + c1 = cpuc->event_constraint[idx]; /* * first time only @@ -2943,7 +3022,8 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, * - dynamic constraint: handled by intel_get_excl_constraints() */ c2 = __intel_get_event_constraints(cpuc, idx, event); - if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) { + if (c1) { + WARN_ON_ONCE(!(c1->flags & PERF_X86_EVENT_DYNAMIC)); bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX); c1->weight = c2->weight; c2 = c1; @@ -3131,7 +3211,7 @@ static unsigned long intel_pmu_large_pebs_flags(struct perf_event *event) flags &= ~PERF_SAMPLE_TIME; if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; - if (event->attr.sample_regs_user & ~PEBS_REGS) + if (event->attr.sample_regs_user & ~PEBS_GP_REGS) flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } @@ -3185,7 +3265,7 @@ static int intel_pmu_hw_config(struct perf_event *event) return ret; if (event->attr.precise_ip) { - if (!event->attr.freq) { + if 
(!(event->attr.freq || event->attr.wakeup_events)) { event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; if (!(event->attr.sample_type & ~intel_pmu_large_pebs_flags(event))) @@ -3366,6 +3446,12 @@ static struct event_constraint counter0_constraint = static struct event_constraint counter2_constraint = EVENT_CONSTRAINT(0, 0x4, 0); +static struct event_constraint fixed0_constraint = + FIXED_EVENT_CONSTRAINT(0x00c0, 0); + +static struct event_constraint fixed0_counter0_constraint = + INTEL_ALL_EVENT_CONSTRAINT(0, 0x100000001ULL); + static struct event_constraint * hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) @@ -3385,6 +3471,21 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx, } static struct event_constraint * +icl_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + /* + * Fixed counter 0 has less skid. + * Force instruction:ppp in Fixed counter 0 + */ + if ((event->attr.precise_ip == 3) && + constraint_match(&fixed0_constraint, event->hw.config)) + return &fixed0_constraint; + + return hsw_get_event_constraints(cpuc, idx, event); +} + +static struct event_constraint * glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { @@ -3399,6 +3500,29 @@ glp_get_event_constraints(struct cpu_hw_events *cpuc, int idx, return c; } +static struct event_constraint * +tnt_get_event_constraints(struct cpu_hw_events *cpuc, int idx, + struct perf_event *event) +{ + struct event_constraint *c; + + /* + * :ppp means to do reduced skid PEBS, + * which is available on PMC0 and fixed counter 0. + */ + if (event->attr.precise_ip == 3) { + /* Force instruction:ppp on PMC0 and Fixed counter 0 */ + if (constraint_match(&fixed0_constraint, event->hw.config)) + return &fixed0_counter0_constraint; + + return &counter0_constraint; + } + + c = intel_get_event_constraints(cpuc, idx, event); + + return c; +} + static bool allow_tsx_force_abort = true; static struct event_constraint * @@ -3410,7 +3534,7 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, /* * Without TFA we must not use PMC3. 
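One detail worth calling out (editorial): the constraint being modified is first copied per-CPU by dyn_constraint() below, since the static tables are shared; the PERF_X86_EVENT_DYNAMIC flag on that copy is also what makes the reworked x86_schedule_events() in the events/core.c hunk above re-request the constraint whenever allow_tsx_force_abort changes.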
*/ - if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) { + if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) { c = dyn_constraint(cpuc, c, idx); c->idxmsk64 &= ~(1ULL << 3); c->weight--; @@ -3507,6 +3631,8 @@ static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { + cpuc->pebs_record_size = x86_pmu.pebs_record_size; + if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) { cpuc->shared_regs = allocate_shared_regs(cpu); if (!cpuc->shared_regs) @@ -3575,6 +3701,12 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + if (x86_pmu.flags & PMU_FL_TFA) { + WARN_ON_ONCE(cpuc->tfa_shadow); + cpuc->tfa_shadow = ~0ULL; + intel_set_tfa(cpuc, false); + } + if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi); @@ -4108,6 +4240,42 @@ static struct attribute *hsw_tsx_events_attrs[] = { NULL }; +EVENT_ATTR_STR(tx-capacity-read, tx_capacity_read, "event=0x54,umask=0x80"); +EVENT_ATTR_STR(tx-capacity-write, tx_capacity_write, "event=0x54,umask=0x2"); +EVENT_ATTR_STR(el-capacity-read, el_capacity_read, "event=0x54,umask=0x80"); +EVENT_ATTR_STR(el-capacity-write, el_capacity_write, "event=0x54,umask=0x2"); + +static struct attribute *icl_events_attrs[] = { + EVENT_PTR(mem_ld_hsw), + EVENT_PTR(mem_st_hsw), + NULL, +}; + +static struct attribute *icl_tsx_events_attrs[] = { + EVENT_PTR(tx_start), + EVENT_PTR(tx_abort), + EVENT_PTR(tx_commit), + EVENT_PTR(tx_capacity_read), + EVENT_PTR(tx_capacity_write), + EVENT_PTR(tx_conflict), + EVENT_PTR(el_start), + EVENT_PTR(el_abort), + EVENT_PTR(el_commit), + EVENT_PTR(el_capacity_read), + EVENT_PTR(el_capacity_write), + EVENT_PTR(el_conflict), + EVENT_PTR(cycles_t), + EVENT_PTR(cycles_ct), + NULL, +}; + +static __init struct attribute **get_icl_events_attrs(void) +{ + return boot_cpu_has(X86_FEATURE_RTM) ? 
+ merge_attr(icl_events_attrs, icl_tsx_events_attrs) : + icl_events_attrs; +} + static ssize_t freeze_on_smi_show(struct device *cdev, struct device_attribute *attr, char *buf) @@ -4147,6 +4315,50 @@ done: return count; } +static void update_tfa_sched(void *ignored) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + /* + * check if PMC3 is used + * and if so force schedule out for all event types all contexts + */ + if (test_bit(3, cpuc->active_mask)) + perf_pmu_resched(x86_get_pmu()); +} + +static ssize_t show_sysctl_tfa(struct device *cdev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, 40, "%d\n", allow_tsx_force_abort); +} + +static ssize_t set_sysctl_tfa(struct device *cdev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + bool val; + ssize_t ret; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + /* no change */ + if (val == allow_tsx_force_abort) + return count; + + allow_tsx_force_abort = val; + + get_online_cpus(); + on_each_cpu(update_tfa_sched, NULL, 1); + put_online_cpus(); + + return count; +} + + static DEVICE_ATTR_RW(freeze_on_smi); static ssize_t branches_show(struct device *cdev, @@ -4179,7 +4391,9 @@ static struct attribute *intel_pmu_caps_attrs[] = { NULL }; -static DEVICE_BOOL_ATTR(allow_tsx_force_abort, 0644, allow_tsx_force_abort); +static DEVICE_ATTR(allow_tsx_force_abort, 0644, + show_sysctl_tfa, + set_sysctl_tfa); static struct attribute *intel_pmu_attrs[] = { &dev_attr_freeze_on_smi.attr, @@ -4440,6 +4654,32 @@ __init int intel_pmu_init(void) name = "goldmont_plus"; break; + case INTEL_FAM6_ATOM_TREMONT_X: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, glp_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, tnt_hw_cache_extra_regs, + sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_slm_event_constraints; + x86_pmu.extra_regs = intel_tnt_extra_regs; + /* + * It's recommended to use CPU_CLK_UNHALTED.CORE_P + NPEBS + * for precise cycles. 
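Editorial note, assuming the usual big-core behaviour: because pebs_aliases is left NULL here, a precise cycles event is not rewritten to an alternate UOPS_RETIRED-style encoding the way intel_pebs_aliases_snb() and friends remap event 0x003c; sampling CPU_CLK_UNHALTED.CORE_P directly, as recommended above, is the intended substitute on Tremont.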
+ */ + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.lbr_pt_coexist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.get_event_constraints = tnt_get_event_constraints; + extra_attr = slm_format_attr; + pr_cont("Tremont events, "); + name = "Tremont"; + break; + case INTEL_FAM6_WESTMERE: case INTEL_FAM6_WESTMERE_EP: case INTEL_FAM6_WESTMERE_EX: @@ -4688,13 +4928,41 @@ __init int intel_pmu_init(void) x86_pmu.get_event_constraints = tfa_get_event_constraints; x86_pmu.enable_all = intel_tfa_pmu_enable_all; x86_pmu.commit_scheduling = intel_tfa_commit_scheduling; - intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr.attr; + intel_pmu_attrs[1] = &dev_attr_allow_tsx_force_abort.attr; } pr_cont("Skylake events, "); name = "skylake"; break; + case INTEL_FAM6_ICELAKE_MOBILE: + x86_pmu.late_ack = true; + memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs)); + hw_cache_event_ids[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)] = -1; + intel_pmu_lbr_init_skl(); + + x86_pmu.event_constraints = intel_icl_event_constraints; + x86_pmu.pebs_constraints = intel_icl_pebs_event_constraints; + x86_pmu.extra_regs = intel_icl_extra_regs; + x86_pmu.pebs_aliases = NULL; + x86_pmu.pebs_prec_dist = true; + x86_pmu.flags |= PMU_FL_HAS_RSP_1; + x86_pmu.flags |= PMU_FL_NO_HT_SHARING; + + x86_pmu.hw_config = hsw_hw_config; + x86_pmu.get_event_constraints = icl_get_event_constraints; + extra_attr = boot_cpu_has(X86_FEATURE_RTM) ? + hsw_format_attr : nhm_format_attr; + extra_attr = merge_attr(extra_attr, skl_format_attr); + x86_pmu.cpu_events = get_icl_events_attrs(); + x86_pmu.rtm_abort_event = X86_CONFIG(.event=0xca, .umask=0x02); + x86_pmu.lbr_pt_coexist = true; + intel_pmu_pebs_data_source_skl(false); + pr_cont("Icelake events, "); + name = "icelake"; + break; + default: switch (x86_pmu.version) { case 1: diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 94a4b7fc75d0..6072f92cb8ea 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -76,15 +76,15 @@ * Scope: Package (physical package) * MSR_PKG_C8_RESIDENCY: Package C8 Residency Counter. * perf code: 0x04 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C9_RESIDENCY: Package C9 Residency Counter. * perf code: 0x05 - * Available model: HSW ULT,CNL + * Available model: HSW ULT,KBL,CNL * Scope: Package (physical package) * MSR_PKG_C10_RESIDENCY: Package C10 Residency Counter. 
* perf code: 0x06 - * Available model: HSW ULT,GLM,CNL + * Available model: HSW ULT,KBL,GLM,CNL * Scope: Package (physical package) * */ @@ -566,8 +566,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_X, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, snb_cstates), - X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, snb_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_MOBILE, hswult_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_KABYLAKE_DESKTOP, hswult_cstates), X86_CSTATES_MODEL(INTEL_FAM6_CANNONLAKE_MOBILE, cnl_cstates), @@ -578,6 +578,8 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_X, glm_cstates), X86_CSTATES_MODEL(INTEL_FAM6_ATOM_GOLDMONT_PLUS, glm_cstates), + + X86_CSTATES_MODEL(INTEL_FAM6_ICELAKE_MOBILE, snb_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 10c99ce1fead..7a9f5dac5abe 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -849,6 +849,26 @@ struct event_constraint intel_skl_pebs_event_constraints[] = { EVENT_CONSTRAINT_END }; +struct event_constraint intel_icl_pebs_event_constraints[] = { + INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x100000000ULL), /* INST_RETIRED.PREC_DIST */ + INTEL_FLAGS_UEVENT_CONSTRAINT(0x0400, 0x400000000ULL), /* SLOTS */ + + INTEL_PLD_CONSTRAINT(0x1cd, 0xff), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x1d0, 0xf), /* MEM_INST_RETIRED.LOAD */ + INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x2d0, 0xf), /* MEM_INST_RETIRED.STORE */ + + INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(0xd1, 0xd4, 0xf), /* MEM_LOAD_*_RETIRED.* */ + + INTEL_FLAGS_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_INST_RETIRED.* */ + + /* + * Everything else is handled by PMU_FL_PEBS_ALL, because we + * need the full constraints from the main table. 
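In other words (editorial): an Icelake PEBS event that matches nothing in this table is not forced onto a special counter; since PMU_FL_PEBS_ALL is set for the baseline case in intel_ds_init() further down, every counter is PEBS-capable and the main intel_icl_event_constraints table above can be honoured as-is.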
+ */ + + EVENT_CONSTRAINT_END +}; + struct event_constraint *intel_pebs_constraints(struct perf_event *event) { struct event_constraint *c; @@ -858,7 +878,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) if (x86_pmu.pebs_constraints) { for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } @@ -906,17 +926,87 @@ static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) if (cpuc->n_pebs == cpuc->n_large_pebs) { threshold = ds->pebs_absolute_maximum - - reserved * x86_pmu.pebs_record_size; + reserved * cpuc->pebs_record_size; } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + threshold = ds->pebs_buffer_base + cpuc->pebs_record_size; } ds->pebs_interrupt_threshold = threshold; } +static void adaptive_pebs_record_size_update(void) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + u64 pebs_data_cfg = cpuc->pebs_data_cfg; + int sz = sizeof(struct pebs_basic); + + if (pebs_data_cfg & PEBS_DATACFG_MEMINFO) + sz += sizeof(struct pebs_meminfo); + if (pebs_data_cfg & PEBS_DATACFG_GP) + sz += sizeof(struct pebs_gprs); + if (pebs_data_cfg & PEBS_DATACFG_XMMS) + sz += sizeof(struct pebs_xmm); + if (pebs_data_cfg & PEBS_DATACFG_LBRS) + sz += x86_pmu.lbr_nr * sizeof(struct pebs_lbr_entry); + + cpuc->pebs_record_size = sz; +} + +#define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ + PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_TRANSACTION) + +static u64 pebs_update_adaptive_cfg(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + u64 sample_type = attr->sample_type; + u64 pebs_data_cfg = 0; + bool gprs, tsx_weight; + + if (!(sample_type & ~(PERF_SAMPLE_IP|PERF_SAMPLE_TIME)) && + attr->precise_ip > 1) + return pebs_data_cfg; + + if (sample_type & PERF_PEBS_MEMINFO_TYPE) + pebs_data_cfg |= PEBS_DATACFG_MEMINFO; + + /* + * We need GPRs when: + * + user requested them + * + precise_ip < 2 for the non event IP + * + For RTM TSX weight we need GPRs for the abort code. + */ + gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && + (attr->sample_regs_intr & PEBS_GP_REGS); + + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + ((attr->config & INTEL_ARCH_EVENT_MASK) == + x86_pmu.rtm_abort_event); + + if (gprs || (attr->precise_ip < 2) || tsx_weight) + pebs_data_cfg |= PEBS_DATACFG_GP; + + if ((sample_type & PERF_SAMPLE_REGS_INTR) && + (attr->sample_regs_intr & PEBS_XMM_REGS)) + pebs_data_cfg |= PEBS_DATACFG_XMMS; + + if (sample_type & PERF_SAMPLE_BRANCH_STACK) { + /* + * For now always log all LBRs. Could configure this + * later. + */ + pebs_data_cfg |= PEBS_DATACFG_LBRS | + ((x86_pmu.lbr_nr-1) << PEBS_DATACFG_LBR_SHIFT); + } + + return pebs_data_cfg; +} + static void -pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) +pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, + struct perf_event *event, bool add) { + struct pmu *pmu = event->ctx->pmu; /* * Make sure we get updated with the first PEBS * event. It will trigger also during removal, but @@ -933,6 +1023,29 @@ pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) update = true; } + /* + * The PEBS record doesn't shrink on pmu::del(). Doing so would require + * iterating all remaining PEBS events to reconstruct the config. 
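A concrete sequence (not from the patch): event A samples only GP registers, so pebs_data_cfg becomes PEBS_DATACFG_GP; adding event B with XMM sampling ORs in PEBS_DATACFG_XMMS and grows pebs_record_size; deleting B changes neither, and only when n_pebs drops to zero and a first PEBS event is added again does the cpuc->n_pebs == 1 branch below reset the config to the basic record.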
+ */ + if (x86_pmu.intel_cap.pebs_baseline && add) { + u64 pebs_data_cfg; + + /* Clear pebs_data_cfg and pebs_record_size for first PEBS. */ + if (cpuc->n_pebs == 1) { + cpuc->pebs_data_cfg = 0; + cpuc->pebs_record_size = sizeof(struct pebs_basic); + } + + pebs_data_cfg = pebs_update_adaptive_cfg(event); + + /* Update pebs_record_size if new event requires more data. */ + if (pebs_data_cfg & ~cpuc->pebs_data_cfg) { + cpuc->pebs_data_cfg |= pebs_data_cfg; + adaptive_pebs_record_size_update(); + update = true; + } + } + if (update) pebs_update_threshold(cpuc); } @@ -947,7 +1060,7 @@ void intel_pmu_pebs_add(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs++; - pebs_update_state(needed_cb, cpuc, event->ctx->pmu); + pebs_update_state(needed_cb, cpuc, event, true); } void intel_pmu_pebs_enable(struct perf_event *event) @@ -960,11 +1073,19 @@ void intel_pmu_pebs_enable(struct perf_event *event) cpuc->pebs_enabled |= 1ULL << hwc->idx; - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && (x86_pmu.version < 5)) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; + if (x86_pmu.intel_cap.pebs_baseline) { + hwc->config |= ICL_EVENTSEL_ADAPTIVE; + if (cpuc->pebs_data_cfg != cpuc->active_pebs_data_cfg) { + wrmsrl(MSR_PEBS_DATA_CFG, cpuc->pebs_data_cfg); + cpuc->active_pebs_data_cfg = cpuc->pebs_data_cfg; + } + } + /* * Use auto-reload if possible to save a MSR write in the PMI. * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. @@ -991,7 +1112,7 @@ void intel_pmu_pebs_del(struct perf_event *event) if (hwc->flags & PERF_X86_EVENT_LARGE_PEBS) cpuc->n_large_pebs--; - pebs_update_state(needed_cb, cpuc, event->ctx->pmu); + pebs_update_state(needed_cb, cpuc, event, false); } void intel_pmu_pebs_disable(struct perf_event *event) @@ -1004,7 +1125,8 @@ void intel_pmu_pebs_disable(struct perf_event *event) cpuc->pebs_enabled &= ~(1ULL << hwc->idx); - if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + if ((event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) && + (x86_pmu.version < 5)) cpuc->pebs_enabled &= ~(1ULL << (hwc->idx + 32)); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); @@ -1125,34 +1247,57 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) return 0; } -static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs) +static inline u64 intel_get_tsx_weight(u64 tsx_tuning) { - if (pebs->tsx_tuning) { - union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning }; + if (tsx_tuning) { + union hsw_tsx_tuning tsx = { .value = tsx_tuning }; return tsx.cycles_last_block; } return 0; } -static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs) +static inline u64 intel_get_tsx_transaction(u64 tsx_tuning, u64 ax) { - u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; + u64 txn = (tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32; /* For RTM XABORTs also log the abort code from AX */ - if ((txn & PERF_TXN_TRANSACTION) && (pebs->ax & 1)) - txn |= ((pebs->ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; + if ((txn & PERF_TXN_TRANSACTION) && (ax & 1)) + txn |= ((ax >> 24) & 0xff) << PERF_TXN_ABORT_SHIFT; return txn; } -static void setup_pebs_sample_data(struct perf_event *event, - struct pt_regs *iregs, void *__pebs, - struct perf_sample_data *data, - struct pt_regs *regs) +static inline u64 get_pebs_status(void *n) { + if (x86_pmu.intel_cap.pebs_format < 4) + return ((struct pebs_record_nhm 
*)n)->status; + return ((struct pebs_basic *)n)->applicable_counters; +} + #define PERF_X86_EVENT_PEBS_HSW_PREC \ (PERF_X86_EVENT_PEBS_ST_HSW | \ PERF_X86_EVENT_PEBS_LD_HSW | \ PERF_X86_EVENT_PEBS_NA_HSW) + +static u64 get_data_src(struct perf_event *event, u64 aux) +{ + u64 val = PERF_MEM_NA; + int fl = event->hw.flags; + bool fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + + if (fl & PERF_X86_EVENT_PEBS_LDLAT) + val = load_latency_data(aux); + else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) + val = precise_datala_hsw(event, aux); + else if (fst) + val = precise_store_data(aux); + return val; +} + +static void setup_pebs_fixed_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ /* * We cast to the biggest pebs_record but are careful not to * unconditionally access the 'extra' entries. @@ -1160,17 +1305,13 @@ static void setup_pebs_sample_data(struct perf_event *event, struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_record_skl *pebs = __pebs; u64 sample_type; - int fll, fst, dsrc; - int fl = event->hw.flags; + int fll; if (pebs == NULL) return; sample_type = event->attr.sample_type; - dsrc = sample_type & PERF_SAMPLE_DATA_SRC; - - fll = fl & PERF_X86_EVENT_PEBS_LDLAT; - fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); + fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; perf_sample_data_init(data, 0, event->hw.last_period); @@ -1185,16 +1326,8 @@ static void setup_pebs_sample_data(struct perf_event *event, /* * data.data_src encodes the data source */ - if (dsrc) { - u64 val = PERF_MEM_NA; - if (fll) - val = load_latency_data(pebs->dse); - else if (fst && (fl & PERF_X86_EVENT_PEBS_HSW_PREC)) - val = precise_datala_hsw(event, pebs->dse); - else if (fst) - val = precise_store_data(pebs->dse); - data->data_src.val = val; - } + if (sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = get_data_src(event, pebs->dse); /* * We must however always use iregs for the unwinder to stay sane; the @@ -1281,10 +1414,11 @@ static void setup_pebs_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_hsw_weight(pebs); + data->weight = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) - data->txn = intel_hsw_transaction(pebs); + data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, + pebs->ax); } /* @@ -1301,6 +1435,140 @@ static void setup_pebs_sample_data(struct perf_event *event, data->br_stack = &cpuc->lbr_stack; } +static void adaptive_pebs_save_regs(struct pt_regs *regs, + struct pebs_gprs *gprs) +{ + regs->ax = gprs->ax; + regs->bx = gprs->bx; + regs->cx = gprs->cx; + regs->dx = gprs->dx; + regs->si = gprs->si; + regs->di = gprs->di; + regs->bp = gprs->bp; + regs->sp = gprs->sp; +#ifndef CONFIG_X86_32 + regs->r8 = gprs->r8; + regs->r9 = gprs->r9; + regs->r10 = gprs->r10; + regs->r11 = gprs->r11; + regs->r12 = gprs->r12; + regs->r13 = gprs->r13; + regs->r14 = gprs->r14; + regs->r15 = gprs->r15; +#endif +} + +/* + * With adaptive PEBS the layout depends on what fields are configured. 
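Sketched out (editorial; struct names as used below), an adaptive record is consumed in this fixed order, each optional block present only when its PEBS_DATACFG_* bit was set at enable time:

	struct pebs_basic	basic;		/* always; format_size holds the
						 * cfg bits, and bits 48+ the
						 * total record size */
	struct pebs_meminfo	meminfo;	/* PEBS_DATACFG_MEMINFO */
	struct pebs_gprs	gprs;		/* PEBS_DATACFG_GP      */
	struct pebs_xmm		xmm;		/* PEBS_DATACFG_XMMS    */
	struct pebs_lbr_entry	lbr[n];		/* PEBS_DATACFG_LBRS; n comes
						 * from the LBR_SHIFT field  */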
+ */ + +static void setup_pebs_adaptive_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct pebs_basic *basic = __pebs; + void *next_record = basic + 1; + u64 sample_type; + u64 format_size; + struct pebs_meminfo *meminfo = NULL; + struct pebs_gprs *gprs = NULL; + struct x86_perf_regs *perf_regs; + + if (basic == NULL) + return; + + perf_regs = container_of(regs, struct x86_perf_regs, regs); + perf_regs->xmm_regs = NULL; + + sample_type = event->attr.sample_type; + format_size = basic->format_size; + perf_sample_data_init(data, 0, event->hw.last_period); + data->period = event->hw.last_period; + + if (event->attr.use_clockid == 0) + data->time = native_sched_clock_from_tsc(basic->tsc); + + /* + * We must however always use iregs for the unwinder to stay sane; the + * record BP,SP,IP can point into thin air when the record is from a + * previous PMI context or an (I)RET happened between the record and + * PMI. + */ + if (sample_type & PERF_SAMPLE_CALLCHAIN) + data->callchain = perf_callchain(event, iregs); + + *regs = *iregs; + /* The ip in basic is EventingIP */ + set_linear_ip(regs, basic->ip); + regs->flags = PERF_EFLAGS_EXACT; + + /* + * The record for MEMINFO is in front of GP + * But PERF_SAMPLE_TRANSACTION needs gprs->ax. + * Save the pointer here but process later. + */ + if (format_size & PEBS_DATACFG_MEMINFO) { + meminfo = next_record; + next_record = meminfo + 1; + } + + if (format_size & PEBS_DATACFG_GP) { + gprs = next_record; + next_record = gprs + 1; + + if (event->attr.precise_ip < 2) { + set_linear_ip(regs, gprs->ip); + regs->flags &= ~PERF_EFLAGS_EXACT; + } + + if (sample_type & PERF_SAMPLE_REGS_INTR) + adaptive_pebs_save_regs(regs, gprs); + } + + if (format_size & PEBS_DATACFG_MEMINFO) { + if (sample_type & PERF_SAMPLE_WEIGHT) + data->weight = meminfo->latency ?: + intel_get_tsx_weight(meminfo->tsx_tuning); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + data->data_src.val = get_data_src(event, meminfo->aux); + + if (sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) + data->addr = meminfo->address; + + if (sample_type & PERF_SAMPLE_TRANSACTION) + data->txn = intel_get_tsx_transaction(meminfo->tsx_tuning, + gprs ? 
gprs->ax : 0); + } + + if (format_size & PEBS_DATACFG_XMMS) { + struct pebs_xmm *xmm = next_record; + + next_record = xmm + 1; + perf_regs->xmm_regs = xmm->xmm; + } + + if (format_size & PEBS_DATACFG_LBRS) { + struct pebs_lbr *lbr = next_record; + int num_lbr = ((format_size >> PEBS_DATACFG_LBR_SHIFT) + & 0xff) + 1; + next_record = next_record + num_lbr*sizeof(struct pebs_lbr_entry); + + if (has_branch_stack(event)) { + intel_pmu_store_pebs_lbrs(lbr); + data->br_stack = &cpuc->lbr_stack; + } + } + + WARN_ONCE(next_record != __pebs + (format_size >> 48), + "PEBS record size %llu, expected %llu, config %llx\n", + format_size >> 48, + (u64)(next_record - __pebs), + basic->format_size); +} + static inline void * get_next_pebs_record_by_bit(void *base, void *top, int bit) { @@ -1318,19 +1586,19 @@ get_next_pebs_record_by_bit(void *base, void *top, int bit) if (base == NULL) return NULL; - for (at = base; at < top; at += x86_pmu.pebs_record_size) { - struct pebs_record_nhm *p = at; + for (at = base; at < top; at += cpuc->pebs_record_size) { + unsigned long status = get_pebs_status(at); - if (test_bit(bit, (unsigned long *)&p->status)) { + if (test_bit(bit, (unsigned long *)&status)) { /* PEBS v3 has accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) return at; - if (p->status == (1 << bit)) + if (status == (1 << bit)) return at; /* clear non-PEBS bit and re-check */ - pebs_status = p->status & cpuc->pebs_enabled; + pebs_status = status & cpuc->pebs_enabled; pebs_status &= PEBS_COUNTER_MASK; if (pebs_status == (1 << bit)) return at; @@ -1410,11 +1678,18 @@ intel_pmu_save_and_restart_reload(struct perf_event *event, int count) static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *base, void *top, - int bit, int count) + int bit, int count, + void (*setup_sample)(struct perf_event *, + struct pt_regs *, + void *, + struct perf_sample_data *, + struct pt_regs *)) { + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct perf_sample_data data; - struct pt_regs regs; + struct x86_perf_regs perf_regs; + struct pt_regs *regs = &perf_regs.regs; void *at = get_next_pebs_record_by_bit(base, top, bit); if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { @@ -1429,20 +1704,20 @@ static void __intel_pmu_pebs_event(struct perf_event *event, return; while (count > 1) { - setup_pebs_sample_data(event, iregs, at, &data, ®s); - perf_event_output(event, &data, ®s); - at += x86_pmu.pebs_record_size; + setup_sample(event, iregs, at, &data, regs); + perf_event_output(event, &data, regs); + at += cpuc->pebs_record_size; at = get_next_pebs_record_by_bit(at, top, bit); count--; } - setup_pebs_sample_data(event, iregs, at, &data, ®s); + setup_sample(event, iregs, at, &data, regs); /* * All but the last records are processed. * The last one is left to be able to call the overflow handler. */ - if (perf_event_overflow(event, &data, ®s)) { + if (perf_event_overflow(event, &data, regs)) { x86_pmu_stop(event, 0); return; } @@ -1483,7 +1758,27 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) return; } - __intel_pmu_pebs_event(event, iregs, at, top, 0, n); + __intel_pmu_pebs_event(event, iregs, at, top, 0, n, + setup_pebs_fixed_sample_data); +} + +static void intel_pmu_pebs_event_update_no_drain(struct cpu_hw_events *cpuc, int size) +{ + struct perf_event *event; + int bit; + + /* + * The drain_pebs() could be called twice in a short period + * for auto-reload event in pmu::read(). 
No + overflows have happened in between. + It needs to call intel_pmu_save_and_restart_reload() to + update the event->count for this case. + */ + for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, size) { + event = cpuc->events[bit]; + if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) + intel_pmu_save_and_restart_reload(event, 0); + } +} + static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) @@ -1513,19 +1808,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) } if (unlikely(base >= top)) { - /* - * The drain_pebs() could be called twice in a short period - * for auto-reload event in pmu::read(). There are no - * overflows have happened in between. - * It needs to call intel_pmu_save_and_restart_reload() to - * update the event->count for this case. - */ - for_each_set_bit(bit, (unsigned long *)&cpuc->pebs_enabled, - size) { - event = cpuc->events[bit]; - if (event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD) - intel_pmu_save_and_restart_reload(event, 0); - } + intel_pmu_pebs_event_update_no_drain(cpuc, size); return; } @@ -1538,8 +1821,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&pebs_status, - size) + for_each_set_bit(bit, (unsigned long *)&pebs_status, size) counts[bit]++; continue; @@ -1578,8 +1860,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) * If collision happened, the record will be dropped. */ if (p->status != (1ULL << bit)) { - for_each_set_bit(i, (unsigned long *)&pebs_status, - x86_pmu.max_pebs_events) + for_each_set_bit(i, (unsigned long *)&pebs_status, size) error[i]++; continue; } @@ -1587,7 +1868,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) counts[bit]++; } - for (bit = 0; bit < size; bit++) { + for_each_set_bit(bit, (unsigned long *)&mask, size) { if ((counts[bit] == 0) && (error[bit] == 0)) continue; @@ -1608,11 +1889,66 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) if (counts[bit]) { __intel_pmu_pebs_event(event, iregs, base, - top, bit, counts[bit]); + top, bit, counts[bit], + setup_pebs_fixed_sample_data); } } } +static void intel_pmu_drain_pebs_icl(struct pt_regs *iregs) +{ + short counts[INTEL_PMC_IDX_FIXED + MAX_FIXED_PEBS_EVENTS] = {}; + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event; + void *base, *at, *top; + int bit, size; + u64 mask; + + if (!x86_pmu.pebs_active) + return; + + base = (struct pebs_basic *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_basic *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + mask = ((1ULL << x86_pmu.max_pebs_events) - 1) | + (((1ULL << x86_pmu.num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED); + size = INTEL_PMC_IDX_FIXED + x86_pmu.num_counters_fixed; + + if (unlikely(base >= top)) { + intel_pmu_pebs_event_update_no_drain(cpuc, size); + return; + } + + for (at = base; at < top; at += cpuc->pebs_record_size) { + u64 pebs_status; + + pebs_status = get_pebs_status(at) & cpuc->pebs_enabled; + pebs_status &= mask; + + for_each_set_bit(bit, (unsigned long *)&pebs_status, size) + counts[bit]++; + } + + for_each_set_bit(bit, (unsigned long *)&mask, size) { + if (counts[bit] == 0) + continue; + + event = cpuc->events[bit]; + if (WARN_ON_ONCE(!event)) + continue; + + if (WARN_ON_ONCE(!event->attr.precise_ip)) + continue; + + __intel_pmu_pebs_event(event, iregs, base, + top, bit,
counts[bit], + setup_pebs_adaptive_sample_data); + } +} + /* * BTS, PEBS probe and setup */ @@ -1628,12 +1964,18 @@ void __init intel_ds_init(void) x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); x86_pmu.pebs_buffer_size = PEBS_BUFFER_SIZE; - if (x86_pmu.version <= 4) + if (x86_pmu.version <= 4) { x86_pmu.pebs_no_isolation = 1; + x86_pmu.pebs_no_xmm_regs = 1; + } if (x86_pmu.pebs) { char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; + char *pebs_qual = ""; int format = x86_pmu.intel_cap.pebs_format; + if (format < 4) + x86_pmu.intel_cap.pebs_baseline = 0; + switch (format) { case 0: pr_cont("PEBS fmt0%c, ", pebs_type); @@ -1669,6 +2011,29 @@ void __init intel_ds_init(void) x86_pmu.large_pebs_flags |= PERF_SAMPLE_TIME; break; + case 4: + x86_pmu.drain_pebs = intel_pmu_drain_pebs_icl; + x86_pmu.pebs_record_size = sizeof(struct pebs_basic); + if (x86_pmu.intel_cap.pebs_baseline) { + x86_pmu.large_pebs_flags |= + PERF_SAMPLE_BRANCH_STACK | + PERF_SAMPLE_TIME; + x86_pmu.flags |= PMU_FL_PEBS_ALL; + pebs_qual = "-baseline"; + } else { + /* Only basic record supported */ + x86_pmu.pebs_no_xmm_regs = 1; + x86_pmu.large_pebs_flags &= + ~(PERF_SAMPLE_ADDR | + PERF_SAMPLE_TIME | + PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_TRANSACTION | + PERF_SAMPLE_REGS_USER | + PERF_SAMPLE_REGS_INTR); + } + pr_cont("PEBS fmt4%c%s, ", pebs_type, pebs_qual); + break; + default: pr_cont("no PEBS fmt%d%c, ", format, pebs_type); x86_pmu.pebs = 0; diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 580c1b91c454..6f814a27416b 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -488,6 +488,8 @@ void intel_pmu_lbr_add(struct perf_event *event) * be 'new'. Conversely, a new event can get installed through the * context switch path for the first time. */ + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) + cpuc->lbr_pebs_users++; perf_sched_cb_inc(event->ctx->pmu); if (!cpuc->lbr_users++ && !event->total_time_running) intel_pmu_lbr_reset(); @@ -507,8 +509,11 @@ void intel_pmu_lbr_del(struct perf_event *event) task_ctx->lbr_callstack_users--; } + if (x86_pmu.intel_cap.pebs_baseline && event->attr.precise_ip > 0) + cpuc->lbr_pebs_users--; cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + WARN_ON_ONCE(cpuc->lbr_pebs_users < 0); perf_sched_cb_dec(event->ctx->pmu); } @@ -658,7 +663,13 @@ void intel_pmu_lbr_read(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); - if (!cpuc->lbr_users) + /* + * Don't read when all LBR users are using adaptive PEBS. + * + * This could be smarter and actually check the event, + * but this simple approach seems to work for now. 
+ */ + if (!cpuc->lbr_users || cpuc->lbr_users == cpuc->lbr_pebs_users) return; if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) @@ -1080,6 +1091,28 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc) } } +void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + int i; + + cpuc->lbr_stack.nr = x86_pmu.lbr_nr; + for (i = 0; i < x86_pmu.lbr_nr; i++) { + u64 info = lbr->lbr[i].info; + struct perf_branch_entry *e = &cpuc->lbr_entries[i]; + + e->from = lbr->lbr[i].from; + e->to = lbr->lbr[i].to; + e->mispred = !!(info & LBR_INFO_MISPRED); + e->predicted = !(info & LBR_INFO_MISPRED); + e->in_tx = !!(info & LBR_INFO_IN_TX); + e->abort = !!(info & LBR_INFO_ABORT); + e->cycles = info & LBR_INFO_CYCLES; + e->reserved = 0; + } + intel_pmu_lbr_filter(cpuc); +} + /* * Map interface branch filters onto LBR filters */ diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index fb3a2f13fc70..339d7628080c 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -1525,8 +1525,7 @@ static __init int pt_init(void) } if (!intel_pt_validate_hw_cap(PT_CAP_topa_multiple_entries)) - pt_pmu.pmu.capabilities = - PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF; + pt_pmu.pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG; pt_pmu.pmu.capabilities |= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE; pt_pmu.pmu.attr_groups = pt_attr_groups; diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 94dc564146ca..37ebf6fc5415 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -775,6 +775,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_X, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, hsw_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, skl_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 9fe64c01a2e5..fc40a1473058 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -1367,6 +1367,11 @@ static const struct intel_uncore_init_fun skx_uncore_init __initconst = { .pci_init = skx_uncore_pci_init, }; +static const struct intel_uncore_init_fun icl_uncore_init __initconst = { + .cpu_init = icl_uncore_cpu_init, + .pci_init = skl_uncore_pci_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1393,6 +1398,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_MOBILE, skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_KABYLAKE_DESKTOP, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_ICELAKE_MOBILE, icl_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 853a49a8ccf6..79eb2e21e4f0 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -512,6 +512,7 @@ int skl_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); void skl_uncore_cpu_init(void); +void icl_uncore_cpu_init(void); int snb_pci2phy_map_init(int devid); /* uncore_snbep.c */ diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 13493f43b247..f8431819b3e1 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ 
b/arch/x86/events/intel/uncore_snb.c @@ -34,6 +34,8 @@ #define PCI_DEVICE_ID_INTEL_CFL_4S_S_IMC 0x3e33 #define PCI_DEVICE_ID_INTEL_CFL_6S_S_IMC 0x3eca #define PCI_DEVICE_ID_INTEL_CFL_8S_S_IMC 0x3e32 +#define PCI_DEVICE_ID_INTEL_ICL_U_IMC 0x8a02 +#define PCI_DEVICE_ID_INTEL_ICL_U2_IMC 0x8a12 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -93,6 +95,12 @@ #define SKL_UNC_PERF_GLOBAL_CTL 0xe01 #define SKL_UNC_GLOBAL_CTL_CORE_ALL ((1 << 5) - 1) +/* ICL Cbo register */ +#define ICL_UNC_CBO_CONFIG 0x396 +#define ICL_UNC_NUM_CBO_MASK 0xf +#define ICL_UNC_CBO_0_PER_CTR0 0x702 +#define ICL_UNC_CBO_MSR_OFFSET 0x8 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); @@ -280,6 +288,70 @@ void skl_uncore_cpu_init(void) snb_uncore_arb.ops = &skl_uncore_msr_ops; } +static struct intel_uncore_type icl_uncore_cbox = { + .name = "cbox", + .num_counters = 4, + .perf_ctr_bits = 44, + .perf_ctr = ICL_UNC_CBO_0_PER_CTR0, + .event_ctl = SNB_UNC_CBO_0_PERFEVTSEL0, + .event_mask = SNB_UNC_RAW_EVENT_MASK, + .msr_offset = ICL_UNC_CBO_MSR_OFFSET, + .ops = &skl_uncore_msr_ops, + .format_group = &snb_uncore_format_group, +}; + +static struct uncore_event_desc icl_uncore_events[] = { + INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff"), + { /* end: all zeroes */ }, +}; + +static struct attribute *icl_uncore_clock_formats_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group icl_uncore_clock_format_group = { + .name = "format", + .attrs = icl_uncore_clock_formats_attr, +}; + +static struct intel_uncore_type icl_uncore_clockbox = { + .name = "clock", + .num_counters = 1, + .num_boxes = 1, + .fixed_ctr_bits = 48, + .fixed_ctr = SNB_UNC_FIXED_CTR, + .fixed_ctl = SNB_UNC_FIXED_CTR_CTRL, + .single_fixed = 1, + .event_mask = SNB_UNC_CTL_EV_SEL_MASK, + .format_group = &icl_uncore_clock_format_group, + .ops = &skl_uncore_msr_ops, + .event_descs = icl_uncore_events, +}; + +static struct intel_uncore_type *icl_msr_uncores[] = { + &icl_uncore_cbox, + &snb_uncore_arb, + &icl_uncore_clockbox, + NULL, +}; + +static int icl_get_cbox_num(void) +{ + u64 num_boxes; + + rdmsrl(ICL_UNC_CBO_CONFIG, num_boxes); + + return num_boxes & ICL_UNC_NUM_CBO_MASK; +} + +void icl_uncore_cpu_init(void) +{ + uncore_msr_uncores = icl_msr_uncores; + icl_uncore_cbox.num_boxes = icl_get_cbox_num(); + snb_uncore_arb.ops = &skl_uncore_msr_ops; +} + enum { SNB_PCI_UNCORE_IMC, }; @@ -668,6 +740,18 @@ static const struct pci_device_id skl_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id icl_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICL_U2_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -693,6 +777,11 @@ static struct pci_driver skl_uncore_pci_driver = { .id_table = skl_uncore_pci_ids, }; +static struct pci_driver icl_uncore_pci_driver = { + .name = "icl_uncore", + .id_table = icl_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -732,6 +821,8 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(CFL_4S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core 
S 4 Cores Server */ IMC_DEV(CFL_6S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 6 Cores Server */ IMC_DEV(CFL_8S_S_IMC, &skl_uncore_pci_driver), /* 8th Gen Core S 8 Cores Server */ + IMC_DEV(ICL_U_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ + IMC_DEV(ICL_U2_IMC, &icl_uncore_pci_driver), /* 10th Gen Core Mobile */ { /* end marker */ } }; diff --git a/arch/x86/events/msr.c b/arch/x86/events/msr.c index a878e6286e4a..f3f4c2263501 100644 --- a/arch/x86/events/msr.c +++ b/arch/x86/events/msr.c @@ -89,6 +89,7 @@ static bool test_intel(int idx) case INTEL_FAM6_SKYLAKE_X: case INTEL_FAM6_KABYLAKE_MOBILE: case INTEL_FAM6_KABYLAKE_DESKTOP: + case INTEL_FAM6_ICELAKE_MOBILE: if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF) return true; break; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a75955741c50..07fc84bb85c1 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -49,28 +49,33 @@ struct event_constraint { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; u64 idxmsk64; }; - u64 code; - u64 cmask; - int weight; - int overlap; - int flags; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; + unsigned int size; }; + +static inline bool constraint_match(struct event_constraint *c, u64 ecode) +{ + return ((ecode & c->cmask) - c->code) <= (u64)c->size; +} + /* * struct hw_perf_event.flags flags */ #define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ #define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ #define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ -#define PERF_X86_EVENT_COMMITTED 0x0008 /* event passed commit_txn */ -#define PERF_X86_EVENT_PEBS_LD_HSW 0x0010 /* haswell style datala, load */ -#define PERF_X86_EVENT_PEBS_NA_HSW 0x0020 /* haswell style datala, unknown */ -#define PERF_X86_EVENT_EXCL 0x0040 /* HT exclusivity on counter */ -#define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ -#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ -#define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ -#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ -#define PERF_X86_EVENT_LARGE_PEBS 0x0800 /* use large PEBS */ - +#define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ +#define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ +#define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ +#define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ +#define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ +#define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -96,25 +101,43 @@ struct amd_nb { PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ PERF_SAMPLE_PERIOD) -#define PEBS_REGS \ - (PERF_REG_X86_AX | \ - PERF_REG_X86_BX | \ - PERF_REG_X86_CX | \ - PERF_REG_X86_DX | \ - PERF_REG_X86_DI | \ - PERF_REG_X86_SI | \ - PERF_REG_X86_SP | \ - PERF_REG_X86_BP | \ - PERF_REG_X86_IP | \ - PERF_REG_X86_FLAGS | \ - PERF_REG_X86_R8 | \ - PERF_REG_X86_R9 | \ - PERF_REG_X86_R10 | \ - PERF_REG_X86_R11 | \ - PERF_REG_X86_R12 | \ - PERF_REG_X86_R13 | \ - PERF_REG_X86_R14 | \ - PERF_REG_X86_R15) +#define PEBS_GP_REGS \ + ((1ULL << PERF_REG_X86_AX) | \ + (1ULL << PERF_REG_X86_BX) | \ + (1ULL << PERF_REG_X86_CX) | \ + (1ULL << 
PERF_REG_X86_DX) | \ + (1ULL << PERF_REG_X86_DI) | \ + (1ULL << PERF_REG_X86_SI) | \ + (1ULL << PERF_REG_X86_SP) | \ + (1ULL << PERF_REG_X86_BP) | \ + (1ULL << PERF_REG_X86_IP) | \ + (1ULL << PERF_REG_X86_FLAGS) | \ + (1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + +#define PEBS_XMM_REGS \ + ((1ULL << PERF_REG_X86_XMM0) | \ + (1ULL << PERF_REG_X86_XMM1) | \ + (1ULL << PERF_REG_X86_XMM2) | \ + (1ULL << PERF_REG_X86_XMM3) | \ + (1ULL << PERF_REG_X86_XMM4) | \ + (1ULL << PERF_REG_X86_XMM5) | \ + (1ULL << PERF_REG_X86_XMM6) | \ + (1ULL << PERF_REG_X86_XMM7) | \ + (1ULL << PERF_REG_X86_XMM8) | \ + (1ULL << PERF_REG_X86_XMM9) | \ + (1ULL << PERF_REG_X86_XMM10) | \ + (1ULL << PERF_REG_X86_XMM11) | \ + (1ULL << PERF_REG_X86_XMM12) | \ + (1ULL << PERF_REG_X86_XMM13) | \ + (1ULL << PERF_REG_X86_XMM14) | \ + (1ULL << PERF_REG_X86_XMM15)) /* * Per register state. @@ -207,10 +230,16 @@ struct cpu_hw_events { int n_pebs; int n_large_pebs; + /* Current super set of events hardware configuration */ + u64 pebs_data_cfg; + u64 active_pebs_data_cfg; + int pebs_record_size; + /* * Intel LBR bits */ int lbr_users; + int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; @@ -257,18 +286,29 @@ struct cpu_hw_events { void *kfree_on_online[X86_PERF_KFREE_MAX]; }; -#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ +#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \ { .idxmsk64 = (n) }, \ .code = (c), \ + .size = (e) - (c), \ .cmask = (m), \ .weight = (w), \ .overlap = (o), \ .flags = f, \ } +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \ + __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f) + #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) +/* + * The constraint_match() function only works for 'simple' event codes + * and not for extended (AMD64_EVENTSEL_EVENT) events codes. + */ +#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \ + __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0) + #define INTEL_EXCLEVT_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ 0, PERF_X86_EVENT_EXCL) @@ -304,6 +344,12 @@ struct cpu_hw_events { EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) /* + * Constraint on a range of Event codes + */ +#define INTEL_EVENT_CONSTRAINT_RANGE(c, e, n) \ + EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT) + +/* * Constraint on the Event code + UMask + fixed-mask * * filter mask to validate fixed counter events. 
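The constraint_match() helper introduced above is worth a closer look: with .size = end - start stored in the constraint, a single unsigned subtract-and-compare tests whether the masked event code falls anywhere in [start, end], since codes below the range wrap around to huge unsigned values. A standalone sketch of the arithmetic (not kernel code; the event codes and the 0xff mask, mirroring ARCH_PERFMON_EVENTSEL_EVENT, are illustrative):

#include <stdint.h>
#include <stdio.h>

struct constraint { uint64_t code, cmask, size; };

/* Same test as the kernel's constraint_match() shown above */
static int constraint_match(const struct constraint *c, uint64_t ecode)
{
	return ((ecode & c->cmask) - c->code) <= c->size;
}

int main(void)
{
	/* Hypothetical range constraint covering event codes 0xc0..0xc4 */
	struct constraint c = { .code = 0xc0, .cmask = 0xff, .size = 0xc4 - 0xc0 };

	printf("%d %d %d %d\n",
	       constraint_match(&c, 0xc0),	/* 1: start of the range */
	       constraint_match(&c, 0x1c2),	/* 1: masked down to 0xc2 */
	       constraint_match(&c, 0xc5),	/* 0: one past the end */
	       constraint_match(&c, 0xb0));	/* 0: below; subtraction wraps */
	return 0;
}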
@@ -350,6 +396,9 @@ struct cpu_hw_events { #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) +#define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n) \ + EVENT_CONSTRAINT_RANGE(c, e, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) + /* Check only flags, but allow all event/umask */ #define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) @@ -366,6 +415,11 @@ struct cpu_hw_events { ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) +#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(code, end, n) \ + __EVENT_CONSTRAINT_RANGE(code, end, n, \ + ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) + #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ @@ -473,6 +527,7 @@ union perf_capabilities { * values > 32bit. */ u64 full_width_write:1; + u64 pebs_baseline:1; }; u64 capabilities; }; @@ -613,14 +668,16 @@ struct x86_pmu { pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, - pebs_no_isolation :1; + pebs_no_isolation :1, + pebs_no_xmm_regs :1; int pebs_record_size; int pebs_buffer_size; + int max_pebs_events; void (*drain_pebs)(struct pt_regs *regs); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); - int max_pebs_events; unsigned long large_pebs_flags; + u64 rtm_abort_event; /* * Intel LBR @@ -714,6 +771,7 @@ static struct perf_pmu_events_ht_attr event_attr_##v = { \ .event_str_ht = ht, \ } +struct pmu *x86_get_pmu(void); extern struct x86_pmu x86_pmu __read_mostly; static inline bool x86_pmu_has_lbr_callstack(void) @@ -941,6 +999,8 @@ extern struct event_constraint intel_bdw_pebs_event_constraints[]; extern struct event_constraint intel_skl_pebs_event_constraints[]; +extern struct event_constraint intel_icl_pebs_event_constraints[]; + struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); @@ -959,6 +1019,8 @@ void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); +void intel_pmu_store_pebs_lbrs(struct pebs_lbr *lbr); + void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); diff --git a/arch/x86/hyperv/hv_apic.c b/arch/x86/hyperv/hv_apic.c index 8eb6fbee8e13..5c056b8aebef 100644 --- a/arch/x86/hyperv/hv_apic.c +++ b/arch/x86/hyperv/hv_apic.c @@ -86,6 +86,11 @@ static void hv_apic_write(u32 reg, u32 val) static void hv_apic_eoi_write(u32 reg, u32 val) { + struct hv_vp_assist_page *hvp = hv_vp_assist_page[smp_processor_id()]; + + if (hvp && (xchg(&hvp->apic_assist, 0) & 0x1)) + return; + wrmsr(HV_X64_MSR_EOI, val, 0); } diff --git a/arch/x86/hyperv/hv_spinlock.c b/arch/x86/hyperv/hv_spinlock.c index a861b0456b1a..07f21a06392f 100644 --- a/arch/x86/hyperv/hv_spinlock.c +++ b/arch/x86/hyperv/hv_spinlock.c @@ -56,7 +56,7 @@ static void hv_qlock_wait(u8 *byte, u8 val) /* * Hyper-V does not support this so far. 
*/ -bool hv_vcpu_is_preempted(int vcpu) +__visible bool hv_vcpu_is_preempted(int vcpu) { return false; } diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 321fe5f5d0e9..629d1ee05599 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -61,9 +61,8 @@ } while (0) #define RELOAD_SEG(seg) { \ - unsigned int pre = GET_SEG(seg); \ + unsigned int pre = (seg) | 3; \ unsigned int cur = get_user_seg(seg); \ - pre |= 3; \ if (pre != cur) \ set_user_seg(seg, pre); \ } @@ -72,6 +71,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, struct sigcontext_32 __user *sc) { unsigned int tmpflags, err = 0; + u16 gs, fs, es, ds; void __user *buf; u32 tmp; @@ -79,16 +79,10 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, current->restart_block.fn = do_no_restart_syscall; get_user_try { - /* - * Reload fs and gs if they have changed in the signal - * handler. This does not handle long fs/gs base changes in - * the handler, but does not clobber them at least in the - * normal case. - */ - RELOAD_SEG(gs); - RELOAD_SEG(fs); - RELOAD_SEG(ds); - RELOAD_SEG(es); + gs = GET_SEG(gs); + fs = GET_SEG(fs); + ds = GET_SEG(ds); + es = GET_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(ax); @@ -106,6 +100,17 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, buf = compat_ptr(tmp); } get_user_catch(err); + /* + * Reload fs and gs if they have changed in the signal + * handler. This does not handle long fs/gs base changes in + * the handler, but does not clobber them at least in the + * normal case. + */ + RELOAD_SEG(gs); + RELOAD_SEG(fs); + RELOAD_SEG(ds); + RELOAD_SEG(es); + err |= fpu__restore_sig(buf, 1); force_iret(); @@ -216,8 +221,7 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size, void __user **fpstate) { - struct fpu *fpu = &current->thread.fpu; - unsigned long sp; + unsigned long sp, fx_aligned, math_size; /* Default to using normal stack */ sp = regs->sp; @@ -231,15 +235,11 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs, ksig->ka.sa.sa_restorer) sp = (unsigned long) ksig->ka.sa.sa_restorer; - if (fpu->initialized) { - unsigned long fx_aligned, math_size; - - sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); - *fpstate = (struct _fpstate_32 __user *) sp; - if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, - math_size) < 0) - return (void __user *) -1L; - } + sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size); + *fpstate = (struct _fpstate_32 __user *) sp; + if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned, + math_size) < 0) + return (void __user *) -1L; sp -= frame_size; /* Align the stack pointer according to the i386 ABI, diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index a0ab9ab61c75..eebd05942e6c 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += early_ioremap.h generic-y += export.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 31b627b43a8e..464034db299f 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -20,6 +20,17 @@ #endif /* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). 
+ */ +.macro ANNOTATE_IGNORE_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.ignore_alts + .long .Lannotate_\@ - . + .popsection +.endm + +/* * Issue one struct alt_instr descriptor entry (need to put it into * the section .altinstructions, see below). This entry contains * enough information for the alternatives patching code to patch an diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4c74073a19cc..094fbc9c0b1c 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -45,6 +45,16 @@ #define LOCK_PREFIX "" #endif +/* + * objtool annotation to ignore the alternatives and only consider the original + * instruction(s). + */ +#define ANNOTATE_IGNORE_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.ignore_alts\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + struct alt_instr { s32 instr_offset; /* original instruction */ s32 repl_offset; /* offset to replacement instruction */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 6467757bb39f..3ff577c0b102 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -148,30 +148,6 @@ _ASM_PTR (entry); \ .popsection -.macro ALIGN_DESTINATION - /* check for bad alignment of destination */ - movl %edi,%ecx - andl $7,%ecx - jz 102f /* already aligned */ - subl $8,%ecx - negl %ecx - subl %ecx,%edx -100: movb (%rsi),%al -101: movb %al,(%rdi) - incq %rsi - incq %rdi - decl %ecx - jnz 100b -102: - .section .fixup,"ax" -103: addl %ecx,%edx /* ecx is zerorest also */ - jmp copy_user_handle_tail - .previous - - _ASM_EXTABLE_UA(100b, 103b) - _ASM_EXTABLE_UA(101b, 103b) - .endm - #else # define _EXPAND_EXTABLE_HANDLE(x) #x # define _ASM_EXTABLE_HANDLE(from, to, handler) \ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index d153d570bb04..8e790ec219a5 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -36,16 +36,17 @@ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ -#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#define RLONG_ADDR(x) "m" (*(volatile long *) (x)) +#define WBYTE_ADDR(x) "+m" (*(volatile char *) (x)) -#define ADDR BITOP_ADDR(addr) +#define ADDR RLONG_ADDR(addr) /* * We do the locked ops that don't return the old value as * a mask operation on a byte. 
*/ #define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) -#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) +#define CONST_MASK_ADDR(nr, addr) WBYTE_ADDR((void *)(addr) + ((nr)>>3)) #define CONST_MASK(nr) (1 << ((nr) & 7)) /** @@ -73,7 +74,7 @@ set_bit(long nr, volatile unsigned long *addr) : "memory"); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(bts) " %1,%0" - : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -88,7 +89,7 @@ set_bit(long nr, volatile unsigned long *addr) */ static __always_inline void __set_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(bts) " %1,%0" : ADDR : "Ir" (nr) : "memory"); + asm volatile(__ASM_SIZE(bts) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } /** @@ -110,8 +111,7 @@ clear_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)~CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btr) " %1,%0" - : BITOP_ADDR(addr) - : "Ir" (nr)); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -131,7 +131,7 @@ static __always_inline void clear_bit_unlock(long nr, volatile unsigned long *ad static __always_inline void __clear_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(btr) " %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btr) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) @@ -139,7 +139,7 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile bool negative; asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) - : CC_OUT(s) (negative), ADDR + : CC_OUT(s) (negative), WBYTE_ADDR(addr) : "ir" ((char) ~(1 << nr)) : "memory"); return negative; } @@ -155,13 +155,9 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile * __clear_bit() is non-atomic and implies release semantics before the memory * operation. It can be used for an unlock if no other CPUs can concurrently * modify other bits in the word. - * - * No memory barrier is required here, because x86 cannot reorder stores past - * older loads. Same principle as spin_unlock. 
*/ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long *addr) { - barrier(); __clear_bit(nr, addr); } @@ -176,7 +172,7 @@ static __always_inline void __clear_bit_unlock(long nr, volatile unsigned long * */ static __always_inline void __change_bit(long nr, volatile unsigned long *addr) { - asm volatile(__ASM_SIZE(btc) " %1,%0" : ADDR : "Ir" (nr)); + asm volatile(__ASM_SIZE(btc) " %1,%0" : : ADDR, "Ir" (nr) : "memory"); } /** @@ -196,8 +192,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) : "iq" ((u8)CONST_MASK(nr))); } else { asm volatile(LOCK_PREFIX __ASM_SIZE(btc) " %1,%0" - : BITOP_ADDR(addr) - : "Ir" (nr)); + : : RLONG_ADDR(addr), "Ir" (nr) : "memory"); } } @@ -242,8 +237,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * asm(__ASM_SIZE(bts) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -282,8 +277,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long asm volatile(__ASM_SIZE(btr) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr)); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -294,8 +289,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon asm volatile(__ASM_SIZE(btc) " %2,%1" CC_SET(c) - : CC_OUT(c) (oldbit), ADDR - : "Ir" (nr) : "memory"); + : CC_OUT(c) (oldbit) + : ADDR, "Ir" (nr) : "memory"); return oldbit; } @@ -326,7 +321,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l asm volatile(__ASM_SIZE(bt) " %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) - : "m" (*(unsigned long *)addr), "Ir" (nr)); + : "m" (*(unsigned long *)addr), "Ir" (nr) : "memory"); return oldbit; } diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 29c706415443..cff3f3f3bfe0 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h @@ -7,6 +7,64 @@ #include <asm/processor.h> #include <asm/intel_ds.h> +#ifdef CONFIG_X86_64 + +/* Macro to enforce the same ordering and stack sizes */ +#define ESTACKS_MEMBERS(guardsize, db2_holesize)\ + char DF_stack_guard[guardsize]; \ + char DF_stack[EXCEPTION_STKSZ]; \ + char NMI_stack_guard[guardsize]; \ + char NMI_stack[EXCEPTION_STKSZ]; \ + char DB2_stack_guard[guardsize]; \ + char DB2_stack[db2_holesize]; \ + char DB1_stack_guard[guardsize]; \ + char DB1_stack[EXCEPTION_STKSZ]; \ + char DB_stack_guard[guardsize]; \ + char DB_stack[EXCEPTION_STKSZ]; \ + char MCE_stack_guard[guardsize]; \ + char MCE_stack[EXCEPTION_STKSZ]; \ + char IST_top_guard[guardsize]; \ + +/* The exception stacks' physical storage. No guard pages required */ +struct exception_stacks { + ESTACKS_MEMBERS(0, 0) +}; + +/* The effective cpu entry area mapping with guard pages. 
*/ +struct cea_exception_stacks { + ESTACKS_MEMBERS(PAGE_SIZE, EXCEPTION_STKSZ) +}; + +/* + * The exception stack ordering in [cea_]exception_stacks + */ +enum exception_stack_ordering { + ESTACK_DF, + ESTACK_NMI, + ESTACK_DB2, + ESTACK_DB1, + ESTACK_DB, + ESTACK_MCE, + N_EXCEPTION_STACKS +}; + +#define CEA_ESTACK_SIZE(st) \ + sizeof(((struct cea_exception_stacks *)0)->st## _stack) + +#define CEA_ESTACK_BOT(ceastp, st) \ + ((unsigned long)&(ceastp)->st## _stack) + +#define CEA_ESTACK_TOP(ceastp, st) \ + (CEA_ESTACK_BOT(ceastp, st) + CEA_ESTACK_SIZE(st)) + +#define CEA_ESTACK_OFFS(st) \ + offsetof(struct cea_exception_stacks, st## _stack) + +#define CEA_ESTACK_PAGES \ + (sizeof(struct cea_exception_stacks) / PAGE_SIZE) + +#endif + /* * cpu_entry_area is a percpu region that contains things needed by the CPU * and early entry/exit code. Real types aren't used for all fields here @@ -32,12 +90,9 @@ struct cpu_entry_area { #ifdef CONFIG_X86_64 /* - * Exception stacks used for IST entries. - * - * In the future, this should have a separate slot for each stack - * with guard pages between them. + * Exception stacks used for IST entries with guard pages. */ - char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; + struct cea_exception_stacks estacks; #endif #ifdef CONFIG_CPU_SUP_INTEL /* @@ -57,6 +112,7 @@ struct cpu_entry_area { #define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); +DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks); extern void setup_cpu_entry_areas(void); extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); @@ -76,4 +132,7 @@ static inline struct entry_stack *cpu_entry_stack(int cpu) return &get_cpu_entry_area(cpu)->entry_stack_page.stack; } +#define __this_cpu_ist_top_va(name) \ + CEA_ESTACK_TOP(__this_cpu_read(cea_exception_stacks), name) + #endif diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0e56ff7e4848..1d337c51f7e6 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -156,11 +156,14 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); #else /* - * Static testing of CPU features. Used the same as boot_cpu_has(). - * These will statically patch the target code for additional - * performance. + * Static testing of CPU features. Used the same as boot_cpu_has(). It + * statically patches the target code for additional performance. Use + * static_cpu_has() only in fast paths, where every cycle counts. Which + * means that the boot_cpu_has() variant is already fast enough for the + * majority of cases and you should stick to using it as it is generally + * only two instructions: a RIP-relative MOV and a TEST. 
*/ -static __always_inline __pure bool _static_cpu_has(u16 bit) +static __always_inline bool _static_cpu_has(u16 bit) { asm_volatile_goto("1: jmp 6f\n" "2:\n" diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 9e5ca30738e5..1a8609a15856 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -104,11 +104,9 @@ static inline void debug_stack_usage_dec(void) { __this_cpu_dec(debug_stack_usage); } -int is_debug_stack(unsigned long addr); void debug_stack_set_zero(void); void debug_stack_reset(void); #else /* !X86_64 */ -static inline int is_debug_stack(unsigned long addr) { return 0; } static inline void debug_stack_set_zero(void) { } static inline void debug_stack_reset(void) { } static inline void debug_stack_usage_inc(void) { } diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 50ba74a34a37..9da8cccdf3fb 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -103,8 +103,6 @@ enum fixed_addresses { #ifdef CONFIG_PARAVIRT FIX_PARAVIRT_BOOTMAP, #endif - FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ - FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b56d504af654..b774c52e5411 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -10,6 +10,7 @@ #ifndef _ASM_X86_FPU_API_H #define _ASM_X86_FPU_API_H +#include <linux/bottom_half.h> /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It @@ -21,6 +22,36 @@ extern void kernel_fpu_begin(void); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); +extern void fpregs_mark_activate(void); + +/* + * Use fpregs_lock() while editing CPU's FPU registers or fpu->state. + * A context switch will (and softirq might) save CPU's FPU registers to + * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in + * a random state. + */ +static inline void fpregs_lock(void) +{ + preempt_disable(); + local_bh_disable(); +} + +static inline void fpregs_unlock(void) +{ + local_bh_enable(); + preempt_enable(); +} + +#ifdef CONFIG_X86_DEBUG_FPU +extern void fpregs_assert_state_consistent(void); +#else +static inline void fpregs_assert_state_consistent(void) { } +#endif + +/* + * Load the task FPU state before returning to userspace. + */ +extern void switch_fpu_return(void); /* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. 
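The fpregs_lock()/fpregs_unlock() pair added above is what makes the deferred-load scheme safe: both preemption and softirqs must be off while fpu->state is edited, because either a context switch or a softirq could otherwise save the CPU's live registers into fpu->state mid-update. A minimal sketch of the discipline, assuming kernel context; example_clear_pkru_xfeature() is a made-up helper, not part of the patch:

/* Hypothetical example: edit current's in-memory xsave header safely. */
static void example_clear_pkru_xfeature(struct fpu *fpu)
{
	fpregs_lock();
	/*
	 * Safe window: neither a context switch nor a softirq can save the
	 * live FPU registers over fpu->state while we modify it here.
	 */
	fpu->state.xsave.header.xfeatures &= ~XFEATURE_MASK_PKRU;
	fpregs_unlock();
}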
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fb04a3ded7dd..9e27fa05a7ae 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -14,6 +14,7 @@ #include <linux/compat.h> #include <linux/sched.h> #include <linux/slab.h> +#include <linux/mm.h> #include <asm/user.h> #include <asm/fpu/api.h> @@ -24,14 +25,12 @@ /* * High level FPU state handling functions: */ -extern void fpu__initialize(struct fpu *fpu); extern void fpu__prepare_read(struct fpu *fpu); extern void fpu__prepare_write(struct fpu *fpu); extern void fpu__save(struct fpu *fpu); -extern void fpu__restore(struct fpu *fpu); extern int fpu__restore_sig(void __user *buf, int ia32_frame); extern void fpu__drop(struct fpu *fpu); -extern int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu); +extern int fpu__copy(struct task_struct *dst, struct task_struct *src); extern void fpu__clear(struct fpu *fpu); extern int fpu__exception_code(struct fpu *fpu, int trap_nr); extern int dump_fpu(struct pt_regs *ptregs, struct user_i387_struct *fpstate); @@ -122,6 +121,21 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); err; \ }) +#define kernel_insn_err(insn, output, input...) \ +({ \ + int err; \ + asm volatile("1:" #insn "\n\t" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: movl $-1,%[err]\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [err] "=r" (err), output \ + : "0"(0), input); \ + err; \ +}) + #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ @@ -150,6 +164,14 @@ static inline void copy_kernel_to_fxregs(struct fxregs_state *fx) kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } +static inline int copy_kernel_to_fxregs_err(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); + else + return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + static inline int copy_user_to_fxregs(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) @@ -163,6 +185,11 @@ static inline void copy_kernel_to_fregs(struct fregs_state *fx) kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } +static inline int copy_kernel_to_fregs_err(struct fregs_state *fx) +{ + return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} + static inline int copy_user_to_fregs(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); @@ -253,7 +280,7 @@ static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XSAVES, xstate, lmask, hmask, err); else XSTATE_OP(XSAVE, xstate, lmask, hmask, err); @@ -275,7 +302,7 @@ static inline void copy_kernel_to_xregs_booting(struct xregs_state *xstate) WARN_ON(system_state != SYSTEM_BOOTING); - if (static_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) XSTATE_OP(XRSTORS, xstate, lmask, hmask, err); else XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); @@ -363,6 +390,21 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask) } /* + * Restore xstate from kernel space xsave area, return an error code instead of + * an exception. 
+ */ +static inline int copy_kernel_to_xregs_err(struct xregs_state *xstate, u64 mask) +{ + u32 lmask = mask; + u32 hmask = mask >> 32; + int err; + + XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); + + return err; +} + +/* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact and we can * keep registers active. @@ -487,6 +529,25 @@ static inline void fpregs_activate(struct fpu *fpu) } /* + * Internal helper, do not use directly. Use switch_fpu_return() instead. + */ +static inline void __fpregs_load_activate(void) +{ + struct fpu *fpu = &current->thread.fpu; + int cpu = smp_processor_id(); + + if (WARN_ON_ONCE(current->mm == NULL)) + return; + + if (!fpregs_state_valid(fpu, cpu)) { + copy_kernel_to_fpregs(&fpu->state); + fpregs_activate(fpu); + fpu->last_cpu = cpu; + } + clear_thread_flag(TIF_NEED_FPU_LOAD); +} + +/* * FPU state switching for scheduling. * * This is a two-stage process: @@ -494,13 +555,23 @@ static inline void fpregs_activate(struct fpu *fpu) * - switch_fpu_prepare() saves the old state. * This is done within the context of the old process. * - * - switch_fpu_finish() restores the new state as - * necessary. + * - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state + * will get loaded on return to userspace, or when the kernel needs it. + * + * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers + * are saved in the current thread's FPU register state. + * + * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not + * hold current()'s FPU registers. It is required to load the + * registers before returning to userland or using the content + * otherwise. + * + * The FPU context is only stored/restored for a user task and + * ->mm is used to distinguish between kernel and user threads. */ -static inline void -switch_fpu_prepare(struct fpu *old_fpu, int cpu) +static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - if (static_cpu_has(X86_FEATURE_FPU) && old_fpu->initialized) { + if (static_cpu_has(X86_FEATURE_FPU) && current->mm) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; else @@ -508,8 +579,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! */ trace_x86_fpu_regs_deactivated(old_fpu); - } else - old_fpu->last_cpu = -1; + } } /* @@ -517,36 +587,32 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu) */ /* - * Set up the userspace FPU context for the new task, if the task - * has used the FPU. + * Load PKRU from the FPU context if available. Delay loading of the + * complete FPU state until the return to userland. */ -static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) +static inline void switch_fpu_finish(struct fpu *new_fpu) { - bool preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->initialized; + u32 pkru_val = init_pkru_value; + struct pkru_state *pk; - if (preload) { - if (!fpregs_state_valid(new_fpu, cpu)) - copy_kernel_to_fpregs(&new_fpu->state); - fpregs_activate(new_fpu); - } -} + if (!static_cpu_has(X86_FEATURE_FPU)) + return; -/* - * Needs to be preemption-safe. - * - * NOTE! user_fpu_begin() must be used only immediately before restoring - * the save state. It does not do any saving/restoring on its own. In - * lazy FPU mode, it is just an optimization to avoid a #NM exception, - * the task can lose the FPU right after preempt_enable(). 
- */ -static inline void user_fpu_begin(void) -{ - struct fpu *fpu = &current->thread.fpu; + set_thread_flag(TIF_NEED_FPU_LOAD); + + if (!cpu_feature_enabled(X86_FEATURE_OSPKE)) + return; - preempt_disable(); - fpregs_activate(fpu); - preempt_enable(); + /* + * PKRU state is switched eagerly because it needs to be valid before we + * return to userland e.g. for a copy_to_user() operation. + */ + if (current->mm) { + pk = get_xsave_addr(&new_fpu->state.xsave, XFEATURE_PKRU); + if (pk) + pkru_val = pk->pkru; + } + __write_pkru(pkru_val); } /* diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 44bbc39a57b3..7fb516b6893a 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -22,7 +22,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, extern void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk); -extern void convert_to_fxsr(struct task_struct *tsk, +extern void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env); unsigned long diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 2e32e178e064..f098f6cab94b 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -294,15 +294,6 @@ struct fpu { unsigned int last_cpu; /* - * @initialized: - * - * This flag indicates whether this context is initialized: if the task - * is not running then we can restore from this context, if the task - * is running then we should save into this context. - */ - unsigned char initialized; - - /* * @avx512_timestamp: * * Records the timestamp of AVX512 use during last context switch. diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 48581988d78c..7e42b285c856 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -2,9 +2,11 @@ #ifndef __ASM_X86_XSAVE_H #define __ASM_X86_XSAVE_H +#include <linux/uaccess.h> #include <linux/types.h> + #include <asm/processor.h> -#include <linux/uaccess.h> +#include <asm/user.h> /* Bit 63 of XCR0 is reserved for future expansion */ #define XFEATURE_MASK_EXTEND (~(XFEATURE_MASK_FPSSE | (1ULL << 63))) @@ -46,8 +48,8 @@ extern void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); -void *get_xsave_addr(struct xregs_state *xsave, int xstate); -const void *get_xsave_field_ptr(int xstate_field); +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr); +const void *get_xsave_field_ptr(int xfeature_nr); int using_compacted_format(void); int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned int offset, unsigned int size); diff --git a/arch/x86/include/asm/intel_ds.h b/arch/x86/include/asm/intel_ds.h index ae26df1c2789..8380c3ddd4b2 100644 --- a/arch/x86/include/asm/intel_ds.h +++ b/arch/x86/include/asm/intel_ds.h @@ -8,7 +8,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 -#define MAX_FIXED_PEBS_EVENTS 3 +#define MAX_FIXED_PEBS_EVENTS 4 /* * A debug store configuration. 
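The MAX_FIXED_PEBS_EVENTS bump from 3 to 4 feeds straight into the counter bitmap that intel_pmu_drain_pebs_icl() builds earlier in this series: general-purpose PEBS counters occupy the low bits and fixed counters start at bit 32 (INTEL_PMC_IDX_FIXED). A standalone check of the resulting layout (not kernel code; the counter counts are illustrative):

#include <stdint.h>
#include <stdio.h>

#define INTEL_PMC_IDX_FIXED	32

int main(void)
{
	int max_pebs_events = 8;	/* MAX_PEBS_EVENTS */
	int num_counters_fixed = 4;	/* MAX_FIXED_PEBS_EVENTS after this change */
	uint64_t mask;

	/* Same expression intel_pmu_drain_pebs_icl() uses to build its mask */
	mask = ((1ULL << max_pebs_events) - 1) |
	       (((1ULL << num_counters_fixed) - 1) << INTEL_PMC_IDX_FIXED);

	/* Prints 0xf000000ff: bits 0-7 for GP counters, bits 32-35 for fixed */
	printf("PEBS counter mask: %#llx\n", (unsigned long long)mask);
	return 0;
}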
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 686247db3106..a06a9f8294ea 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -90,8 +90,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) #define __raw_writew __writew #define __raw_writel __writel -#define mmiowb() barrier() - #ifdef CONFIG_X86_64 build_mmio_read(readq, "q", u64, "=r", :"memory") diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index fbb16e6b6c18..8f95686ec27e 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -16,11 +16,7 @@ static inline int irq_canonicalize(int irq) return ((irq == 2) ? 9 : irq); } -#ifdef CONFIG_X86_32 -extern void irq_ctx_init(int cpu); -#else -# define irq_ctx_init(cpu) do { } while (0) -#endif +extern int irq_init_percpu_irqstack(unsigned int cpu); #define __ARCH_HAS_DO_SOFTIRQ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 548d90bbf919..889f8b1b5b7f 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -18,8 +18,8 @@ * Vectors 0 ... 31 : system traps and exceptions - hardcoded events * Vectors 32 ... 127 : device interrupts * Vector 128 : legacy int80 syscall interface - * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device interrupts - * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts + * Vectors 129 ... LOCAL_TIMER_VECTOR-1 + * Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts * * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. * diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 93c4bf598fb0..feab24cac610 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -226,7 +226,9 @@ struct x86_emulate_ops { unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); - int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, u64 smbase); + int (*pre_leave_smm)(struct x86_emulate_ctxt *ctxt, + const char *smstate); + void (*post_leave_smm)(struct x86_emulate_ctxt *ctxt); }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 159b5988292f..c79abe7ca093 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -126,7 +126,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) } #define KVM_PERMILLE_MMU_PAGES 20 -#define KVM_MIN_ALLOC_MMU_PAGES 64 +#define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 @@ -295,6 +295,7 @@ union kvm_mmu_extended_role { unsigned int valid:1; unsigned int execonly:1; unsigned int cr0_pg:1; + unsigned int cr4_pae:1; unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; @@ -844,9 +845,9 @@ enum kvm_irqchip_mode { }; struct kvm_arch { - unsigned int n_used_mmu_pages; - unsigned int n_requested_mmu_pages; - unsigned int n_max_mmu_pages; + unsigned long n_used_mmu_pages; + unsigned long n_requested_mmu_pages; + unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; /* @@ -1182,7 +1183,7 @@ struct kvm_x86_ops { int (*smi_allowed)(struct kvm_vcpu *vcpu); int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate); - int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase); + int (*pre_leave_smm)(struct kvm_vcpu *vcpu, const char *smstate); int 
(*enable_smi_window)(struct kvm_vcpu *vcpu); int (*mem_enc_op)(struct kvm *kvm, void __user *argp); @@ -1256,8 +1257,8 @@ void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, gfn_t gfn_offset, unsigned long mask); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); -unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); +unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); bool pdptrs_changed(struct kvm_vcpu *vcpu); @@ -1592,4 +1593,7 @@ static inline int kvm_cpu_get_apicid(int mps_cpu) #define put_smstate(type, buf, offset, val) \ *(type *)((buf) + (offset) - 0x7e00) = val +#define GET_SMSTATE(type, buf, offset) \ + (*(type *)((buf) + (offset) - 0x7e00)) + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 22d05e3835f0..dc2d4b206ab7 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -210,16 +210,6 @@ static inline void cmci_rediscover(void) {} static inline void cmci_recheck(void) {} #endif -#ifdef CONFIG_X86_MCE_AMD -void mce_amd_feature_init(struct cpuinfo_x86 *c); -int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); -#else -static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } -static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; -#endif - -static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } - int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); @@ -345,12 +335,19 @@ extern bool amd_mce_is_memory_error(struct mce *m); extern int mce_threshold_create_device(unsigned int cpu); extern int mce_threshold_remove_device(unsigned int cpu); -#else +void mce_amd_feature_init(struct cpuinfo_x86 *c); +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); -static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; -static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; -static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; +#else +static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; +static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; +static inline bool amd_mce_is_memory_error(struct mce *m) { return false; }; +static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } +static inline int +umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif +static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_amd_feature_init(c); } + #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 19d18fae6ec6..93dff1963337 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -13,6 +13,7 @@ #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/mpx.h> +#include <asm/debugreg.h> extern atomic64_t last_mm_ctx_id; @@ -356,4 +357,59 @@ static inline unsigned long __get_current_cr3_fast(void) return cr3; } +typedef struct { + struct mm_struct *mm; +} temp_mm_state_t; + +/* + * Using a temporary mm 
allows setting temporary mappings that are not accessible + by other CPUs. Such mappings are needed to perform sensitive memory writes + that override the kernel memory protections (e.g., W^X), without exposing the + temporary page-table mappings that are required for these write operations to + other CPUs. Using a temporary mm also avoids TLB shootdowns when the + mapping is torn down. + * + * Context: The temporary mm needs to be used exclusively by a single core. To + * harden security, IRQs must be disabled while the temporary mm is + * loaded, thereby preventing interrupt handler bugs from overriding + * the kernel memory protection. + */ +static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) +{ + temp_mm_state_t temp_state; + + lockdep_assert_irqs_disabled(); + temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); + switch_mm_irqs_off(NULL, mm, current); + + /* + * If breakpoints are enabled, disable them while the temporary mm is + * used. Userspace might set up watchpoints on addresses that are used + * in the temporary mm, which would lead to wrong signals being sent or + * crashes. + * + * Note that breakpoints are not disabled selectively, which also causes + * kernel breakpoints (e.g., perf's) to be disabled. This might be + * undesirable, but still seems reasonable as the code that runs in the + * temporary mm should be short. + */ + if (hw_breakpoint_active()) + hw_breakpoint_disable(); + + return temp_state; +} + +static inline void unuse_temporary_mm(temp_mm_state_t prev_state) +{ + lockdep_assert_irqs_disabled(); + switch_mm_irqs_off(NULL, prev_state.mm, current); + + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. + */ + if (hw_breakpoint_active()) + hw_breakpoint_restore(); +} + #endif /* _ASM_X86_MMU_CONTEXT_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ca5bc0eacb95..1378518cf63f 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -116,6 +116,7 @@ #define LBR_INFO_CYCLES 0xffff #define MSR_IA32_PEBS_ENABLE 0x000003f1 +#define MSR_PEBS_DATA_CFG 0x000003f2 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 #define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dad12b767ba0..daf25b60c9e3 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,6 +11,15 @@ #include <asm/msr-index.h> /* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + ANNOTATE_IGNORE_ALTERNATIVE + +/* * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an @@ -57,19 +66,6 @@ #ifdef __ASSEMBLY__ /* - * This should be used immediately before a retpoline alternative. It tells - * objtool where the retpolines are so that it can make sense of the control - * flow by just reading the original instruction(s) and ignoring the - * alternatives. - */ -.macro ANNOTATE_NOSPEC_ALTERNATIVE - .Lannotate_\@: - .pushsection .discard.nospec - .long .Lannotate_\@ - . - .popsection -.endm - -/* - * This should be used immediately before an indirect jump/call. 
It tells * objtool the subsequent indirect jump/call is vouched safe for retpoline * builds. @@ -152,12 +148,6 @@ #else /* __ASSEMBLY__ */ -#define ANNOTATE_NOSPEC_ALTERNATIVE \ - "999:\n\t" \ - ".pushsection .discard.nospec\n\t" \ - ".long 999b - .\n\t" \ - ".popsection\n\t" - #define ANNOTATE_RETPOLINE_SAFE \ "999:\n\t" \ ".pushsection .discard.retpoline_safe\n\t" \ diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0d5c739eebd7..565ad755c785 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -22,11 +22,9 @@ #define THREAD_SIZE_ORDER 1 #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 0 -#define DEBUG_STACK 0 -#define MCE_STACK 0 -#define N_EXCEPTION_STACKS 1 +#define IRQ_STACK_SIZE THREAD_SIZE + +#define N_EXCEPTION_STACKS 1 #ifdef CONFIG_X86_PAE /* diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 8f657286d599..793c14c372cb 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -14,22 +14,20 @@ #define THREAD_SIZE_ORDER (2 + KASAN_STACK_ORDER) #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) -#define CURRENT_MASK (~(THREAD_SIZE - 1)) #define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER) #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER) -#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1) -#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER) - #define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER) #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER) -#define DOUBLEFAULT_STACK 1 -#define NMI_STACK 2 -#define DEBUG_STACK 3 -#define MCE_STACK 4 -#define N_EXCEPTION_STACKS 4 /* hw limit: 7 */ +/* + * The index for the tss.ist[] array. The hardware limit is 7 entries. 
+ */ +#define IST_INDEX_DF 0 +#define IST_INDEX_NMI 1 +#define IST_INDEX_DB 2 +#define IST_INDEX_MCE 3 /* * Set __PAGE_OFFSET to the most negative possible address + diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 8bdf74902293..1392d5e6e8d6 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -7,7 +7,7 @@ */ #define INTEL_PMC_MAX_GENERIC 32 -#define INTEL_PMC_MAX_FIXED 3 +#define INTEL_PMC_MAX_FIXED 4 #define INTEL_PMC_IDX_FIXED 32 #define X86_PMC_IDX_MAX 64 @@ -32,6 +32,8 @@ #define HSW_IN_TX (1ULL << 32) #define HSW_IN_TX_CHECKPOINTED (1ULL << 33) +#define ICL_EVENTSEL_ADAPTIVE (1ULL << 34) +#define ICL_FIXED_0_ADAPTIVE (1ULL << 32) #define AMD64_EVENTSEL_INT_CORE_ENABLE (1ULL << 36) #define AMD64_EVENTSEL_GUESTONLY (1ULL << 40) @@ -87,6 +89,12 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 #define ARCH_PERFMON_EVENTS_COUNT 7 +#define PEBS_DATACFG_MEMINFO BIT_ULL(0) +#define PEBS_DATACFG_GP BIT_ULL(1) +#define PEBS_DATACFG_XMMS BIT_ULL(2) +#define PEBS_DATACFG_LBRS BIT_ULL(3) +#define PEBS_DATACFG_LBR_SHIFT 24 + /* * Intel "Architectural Performance Monitoring" CPUID * detection/enumeration details: @@ -177,6 +185,41 @@ struct x86_pmu_capability { #define GLOBAL_STATUS_TRACE_TOPAPMI BIT_ULL(55) /* + * Adaptive PEBS v4 + */ + +struct pebs_basic { + u64 format_size; + u64 ip; + u64 applicable_counters; + u64 tsc; +}; + +struct pebs_meminfo { + u64 address; + u64 aux; + u64 latency; + u64 tsx_tuning; +}; + +struct pebs_gprs { + u64 flags, ip, ax, cx, dx, bx, sp, bp, si, di; + u64 r8, r9, r10, r11, r12, r13, r14, r15; +}; + +struct pebs_xmm { + u64 xmm[16*2]; /* two entries for each register */ +}; + +struct pebs_lbr_entry { + u64 from, to, info; +}; + +struct pebs_lbr { + struct pebs_lbr_entry lbr[0]; /* Variable length */ +}; + +/* * IBS cpuid feature detection */ @@ -248,6 +291,11 @@ extern void perf_events_lapic_init(void); #define PERF_EFLAGS_VM (1UL << 5) struct pt_regs; +struct x86_perf_regs { + struct pt_regs regs; + u64 *xmm_regs; +}; + extern unsigned long perf_instruction_pointer(struct pt_regs *regs); extern unsigned long perf_misc_flags(struct pt_regs *regs); #define perf_misc_flags(regs) perf_misc_flags(regs) @@ -260,14 +308,9 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs); */ #define perf_arch_fetch_caller_regs(regs, __ip) { \ (regs)->ip = (__ip); \ - (regs)->bp = caller_frame_pointer(); \ + (regs)->sp = (unsigned long)__builtin_frame_address(0); \ (regs)->cs = __KERNEL_CS; \ regs->flags = 0; \ - asm volatile( \ - _ASM_MOV "%%"_ASM_SP ", %0\n" \ - : "=m" ((regs)->sp) \ - :: "memory" \ - ); \ } struct perf_guest_switch_msr { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 2779ace16d23..5e0509b41986 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -23,6 +23,8 @@ #ifndef __ASSEMBLY__ #include <asm/x86_init.h> +#include <asm/fpu/xstate.h> +#include <asm/fpu/api.h> extern pgd_t early_top_pgt[PTRS_PER_PGD]; int __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -46,7 +48,7 @@ void ptdump_walk_user_pgd_level_checkwx(void); */ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __visible; -#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +#define ZERO_PAGE(vaddr) ((void)(vaddr),virt_to_page(empty_zero_page)) extern spinlock_t pgd_lock; extern struct list_head pgd_list; @@ -127,14 +129,29 @@ static inline int pte_dirty(pte_t pte) static inline u32 read_pkru(void) { if 
(boot_cpu_has(X86_FEATURE_OSPKE)) - return __read_pkru(); + return rdpkru(); return 0; } static inline void write_pkru(u32 pkru) { - if (boot_cpu_has(X86_FEATURE_OSPKE)) - __write_pkru(pkru); + struct pkru_state *pk; + + if (!boot_cpu_has(X86_FEATURE_OSPKE)) + return; + + pk = get_xsave_addr(&current->thread.fpu.state.xsave, XFEATURE_PKRU); + + /* + * The PKRU value in xstate needs to be in sync with the value that is + * written to the CPU. The FPU restore on return to userland would + * otherwise load the previous value again. + */ + fpregs_lock(); + if (pk) + pk->pkru = pkru; + __write_pkru(pkru); + fpregs_unlock(); } static inline int pte_young(pte_t pte) @@ -1021,6 +1038,9 @@ static inline void __meminit init_trampoline_default(void) /* Default trampoline pgd value */ trampoline_pgd_entry = init_top_pgt[pgd_index(__PAGE_OFFSET)]; } + +void __init poking_init(void); + # ifdef CONFIG_RANDOMIZE_MEMORY void __meminit init_trampoline(void); # else @@ -1355,6 +1375,12 @@ static inline pmd_t pmd_swp_clear_soft_dirty(pmd_t pmd) #define PKRU_WD_BIT 0x2 #define PKRU_BITS_PER_PKEY 2 +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS +extern u32 init_pkru_value; +#else +#define init_pkru_value 0 +#endif + static inline bool __pkru_allows_read(u32 pkru, u16 pkey) { int pkru_pkey_bits = pkey * PKRU_BITS_PER_PKEY; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2bb3a648fc12..7e99ef67bff0 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -367,6 +367,13 @@ DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); #define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET + IO_BITMAP_BYTES + sizeof(unsigned long) - 1) +/* Per CPU interrupt stacks */ +struct irq_stack { + char stack[IRQ_STACK_SIZE]; +} __aligned(IRQ_STACK_SIZE); + +DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); + #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else @@ -374,38 +381,25 @@ DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif -/* - * Save the original ist values for checking stack pointers during debugging - */ -struct orig_ist { - unsigned long ist[7]; -}; - #ifdef CONFIG_X86_64 -DECLARE_PER_CPU(struct orig_ist, orig_ist); - -union irq_stack_union { - char irq_stack[IRQ_STACK_SIZE]; +struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary.
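
For reference when reading write_pkru() above: each of the 16 protection keys owns a two-bit field in PKRU, an access-disable bit and a write-disable bit. A hedged helper, not part of this patch, showing how a key's slot is addressed; PKRU_AD_BIT is assumed to be 0x1, alongside the PKRU_WD_BIT and PKRU_BITS_PER_PKEY definitions visible in this hunk:

	/* Illustrative only: the PKRU bits that deny all access for @pkey. */
	static inline u32 pkru_deny_bits(u16 pkey)
	{
		return (PKRU_AD_BIT | PKRU_WD_BIT) << (pkey * PKRU_BITS_PER_PKEY);
	}
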
*/ - struct { - char gs_base[40]; - unsigned long stack_canary; - }; + char gs_base[40]; + unsigned long stack_canary; }; -DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -DECLARE_INIT_PER_CPU(irq_stack_union); +DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; +DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { - return (unsigned long)per_cpu(irq_stack_union.gs_base, cpu); + return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } -DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); @@ -427,15 +421,8 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -/* - * per-CPU IRQ handling stacks - */ -struct irq_stack { - u32 stack[THREAD_SIZE/sizeof(u32)]; -} __aligned(THREAD_SIZE); - -DECLARE_PER_CPU(struct irq_stack *, hardirq_stack); -DECLARE_PER_CPU(struct irq_stack *, softirq_stack); +/* Per CPU softirq stack pointer */ +DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ extern unsigned int fpu_kernel_xstate_size; diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h deleted file mode 100644 index 4c25cf6caefa..000000000000 --- a/arch/x86/include/asm/rwsem.h +++ /dev/null @@ -1,237 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+ - * - * Written by David Howells (dhowells@redhat.com). - * - * Derived from asm-x86/semaphore.h - * - * - * The MSW of the count is the negated number of active writers and waiting - * lockers, and the LSW is the total number of active locks - * - * The lock count is initialized to 0 (no active and no waiting lockers). - * - * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an - * uncontended lock. This can be determined because XADD returns the old value. - * Readers increment by 1 and see a positive value when uncontended, negative - * if there are writers (and maybe) readers waiting (in which case it goes to - * sleep). - * - * The value of WAITING_BIAS supports up to 32766 waiting processes. This can - * be extended to 65534 by manually checking the whole MSW rather than relying - * on the S flag. - * - * The value of ACTIVE_BIAS supports up to 65535 active processes. - * - * This should be totally fair - if anything is waiting, a process that wants a - * lock will go to the back of the queue. When the currently active lock is - * released, if there's a writer at the front of the queue, then that and only - * that will be woken up; if there's a bunch of consecutive readers at the - * front, then they'll all be woken up, but no other readers will be. - */ - -#ifndef _ASM_X86_RWSEM_H -#define _ASM_X86_RWSEM_H - -#ifndef _LINUX_RWSEM_H -#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead" -#endif - -#ifdef __KERNEL__ -#include <asm/asm.h> - -/* - * The bias values and the counter type limits the number of - * potential readers/writers to 32767 for 32 bits and 2147483647 - * for 64 bits. 
- */ - -#ifdef CONFIG_X86_64 -# define RWSEM_ACTIVE_MASK 0xffffffffL -#else -# define RWSEM_ACTIVE_MASK 0x0000ffffL -#endif - -#define RWSEM_UNLOCKED_VALUE 0x00000000L -#define RWSEM_ACTIVE_BIAS 0x00000001L -#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) -#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS -#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) - -/* - * lock for reading - */ -#define ____down_read(sem, slow_path) \ -({ \ - struct rw_semaphore* ret; \ - asm volatile("# beginning down_read\n\t" \ - LOCK_PREFIX _ASM_INC "(%[sem])\n\t" \ - /* adds 0x00000001 */ \ - " jns 1f\n" \ - " call " slow_path "\n" \ - "1:\n\t" \ - "# ending down_read\n\t" \ - : "+m" (sem->count), "=a" (ret), \ - ASM_CALL_CONSTRAINT \ - : [sem] "a" (sem) \ - : "memory", "cc"); \ - ret; \ -}) - -static inline void __down_read(struct rw_semaphore *sem) -{ - ____down_read(sem, "call_rwsem_down_read_failed"); -} - -static inline int __down_read_killable(struct rw_semaphore *sem) -{ - if (IS_ERR(____down_read(sem, "call_rwsem_down_read_failed_killable"))) - return -EINTR; - return 0; -} - -/* - * trylock for reading -- returns 1 if successful, 0 if contention - */ -static inline bool __down_read_trylock(struct rw_semaphore *sem) -{ - long result, tmp; - asm volatile("# beginning __down_read_trylock\n\t" - " mov %[count],%[result]\n\t" - "1:\n\t" - " mov %[result],%[tmp]\n\t" - " add %[inc],%[tmp]\n\t" - " jle 2f\n\t" - LOCK_PREFIX " cmpxchg %[tmp],%[count]\n\t" - " jnz 1b\n\t" - "2:\n\t" - "# ending __down_read_trylock\n\t" - : [count] "+m" (sem->count), [result] "=&a" (result), - [tmp] "=&r" (tmp) - : [inc] "i" (RWSEM_ACTIVE_READ_BIAS) - : "memory", "cc"); - return result >= 0; -} - -/* - * lock for writing - */ -#define ____down_write(sem, slow_path) \ -({ \ - long tmp; \ - struct rw_semaphore* ret; \ - \ - asm volatile("# beginning down_write\n\t" \ - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" \ - /* adds 0xffff0001, returns the old value */ \ - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" \ - /* was the active mask 0 before? */\ - " jz 1f\n" \ - " call " slow_path "\n" \ - "1:\n" \ - "# ending down_write" \ - : "+m" (sem->count), [tmp] "=d" (tmp), \ - "=a" (ret), ASM_CALL_CONSTRAINT \ - : [sem] "a" (sem), "[tmp]" (RWSEM_ACTIVE_WRITE_BIAS) \ - : "memory", "cc"); \ - ret; \ -}) - -static inline void __down_write(struct rw_semaphore *sem) -{ - ____down_write(sem, "call_rwsem_down_write_failed"); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) -{ - if (IS_ERR(____down_write(sem, "call_rwsem_down_write_failed_killable"))) - return -EINTR; - - return 0; -} - -/* - * trylock for writing -- returns 1 if successful, 0 if contention - */ -static inline bool __down_write_trylock(struct rw_semaphore *sem) -{ - bool result; - long tmp0, tmp1; - asm volatile("# beginning __down_write_trylock\n\t" - " mov %[count],%[tmp0]\n\t" - "1:\n\t" - " test " __ASM_SEL(%w1,%k1) "," __ASM_SEL(%w1,%k1) "\n\t" - /* was the active mask 0 before? 
*/ - " jnz 2f\n\t" - " mov %[tmp0],%[tmp1]\n\t" - " add %[inc],%[tmp1]\n\t" - LOCK_PREFIX " cmpxchg %[tmp1],%[count]\n\t" - " jnz 1b\n\t" - "2:\n\t" - CC_SET(e) - "# ending __down_write_trylock\n\t" - : [count] "+m" (sem->count), [tmp0] "=&a" (tmp0), - [tmp1] "=&r" (tmp1), CC_OUT(e) (result) - : [inc] "er" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory"); - return result; -} - -/* - * unlock after reading - */ -static inline void __up_read(struct rw_semaphore *sem) -{ - long tmp; - asm volatile("# beginning __up_read\n\t" - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" - /* subtracts 1, returns the old value */ - " jns 1f\n\t" - " call call_rwsem_wake\n" /* expects old value in %edx */ - "1:\n" - "# ending __up_read\n" - : "+m" (sem->count), [tmp] "=d" (tmp) - : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_READ_BIAS) - : "memory", "cc"); -} - -/* - * unlock after writing - */ -static inline void __up_write(struct rw_semaphore *sem) -{ - long tmp; - asm volatile("# beginning __up_write\n\t" - LOCK_PREFIX " xadd %[tmp],(%[sem])\n\t" - /* subtracts 0xffff0001, returns the old value */ - " jns 1f\n\t" - " call call_rwsem_wake\n" /* expects old value in %edx */ - "1:\n\t" - "# ending __up_write\n" - : "+m" (sem->count), [tmp] "=d" (tmp) - : [sem] "a" (sem), "[tmp]" (-RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); -} - -/* - * downgrade write lock to read lock - */ -static inline void __downgrade_write(struct rw_semaphore *sem) -{ - asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX _ASM_ADD "%[inc],(%[sem])\n\t" - /* - * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) - * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) - */ - " jns 1f\n\t" - " call call_rwsem_downgrade_wake\n" - "1:\n\t" - "# ending __downgrade_write\n" - : "+m" (sem->count) - : [sem] "a" (sem), [inc] "er" (-RWSEM_WAITING_BIAS) - : "memory", "cc"); -} - -#endif /* __KERNEL__ */ -#endif /* _ASM_X86_RWSEM_H */ diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 07a25753e85c..ae7b909dc242 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -85,6 +85,9 @@ int set_pages_nx(struct page *page, int numpages); int set_pages_ro(struct page *page, int numpages); int set_pages_rw(struct page *page, int numpages); +int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_default_noflush(struct page *page); + extern int kernel_set_to_readonly; void set_kernel_text_rw(void); void set_kernel_text_ro(void); diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index db333300bd4b..f94a7d0ddd49 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -13,13 +13,12 @@ #ifndef _ASM_X86_SMAP_H #define _ASM_X86_SMAP_H -#include <linux/stringify.h> #include <asm/nops.h> #include <asm/cpufeatures.h> /* "Raw" instruction opcodes */ -#define __ASM_CLAC .byte 0x0f,0x01,0xca -#define __ASM_STAC .byte 0x0f,0x01,0xcb +#define __ASM_CLAC ".byte 0x0f,0x01,0xca" +#define __ASM_STAC ".byte 0x0f,0x01,0xcb" #ifdef __ASSEMBLY__ @@ -28,10 +27,10 @@ #ifdef CONFIG_X86_SMAP #define ASM_CLAC \ - ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_CLAC, X86_FEATURE_SMAP #define ASM_STAC \ - ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP + ALTERNATIVE "", __ASM_STAC, X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -49,26 +48,46 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __ASM_CLAC, 
X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __ASM_STAC, X86_FEATURE_SMAP); +} + +static __always_inline unsigned long smap_save(void) +{ + unsigned long flags; + + asm volatile (ALTERNATIVE("", "pushf; pop %0; " __ASM_CLAC, + X86_FEATURE_SMAP) + : "=rm" (flags) : : "memory", "cc"); + + return flags; +} + +static __always_inline void smap_restore(unsigned long flags) +{ + asm volatile (ALTERNATIVE("", "push %0; popf", X86_FEATURE_SMAP) + : : "g" (flags) : "memory", "cc"); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_CLAC, X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __ASM_STAC, X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ static inline void clac(void) { } static inline void stac(void) { } +static inline unsigned long smap_save(void) { return 0; } +static inline void smap_restore(unsigned long flags) { } + #define ASM_CLAC #define ASM_STAC diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 2e95b6c1bca3..da545df207b2 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -131,7 +131,7 @@ void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); -void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); +int common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); int common_cpu_die(unsigned int cpu); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 43c029cdc3fe..0a3c4cab39db 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -92,7 +92,7 @@ static inline void native_write_cr8(unsigned long val) #endif #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -static inline u32 __read_pkru(void) +static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; @@ -107,7 +107,7 @@ static inline u32 __read_pkru(void) return pkru; } -static inline void __write_pkru(u32 pkru) +static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; @@ -118,8 +118,21 @@ static inline void __write_pkru(u32 pkru) asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (pkru), "c"(ecx), "d"(edx)); } + +static inline void __write_pkru(u32 pkru) +{ + /* + * WRPKRU is relatively expensive compared to RDPKRU. + * Avoid WRPKRU when it would not change the value. + */ + if (pkru == rdpkru()) + return; + + wrpkru(pkru); +} + #else -static inline u32 __read_pkru(void) +static inline u32 rdpkru(void) { return 0; } diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 8ec97a62c245..91e29b6a86a5 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -13,7 +13,7 @@ * On x86_64, %gs is shared by percpu area and stack canary. All * percpu symbols are zero based and %gs points to the base of percpu * area. The first occupant of the percpu area is always - * irq_stack_union which contains stack_canary at offset 40. Userland + * fixed_percpu_data which contains stack_canary at offset 40. 
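
The smap_save()/smap_restore() helpers introduced above preserve the caller's EFLAGS.AC state instead of unconditionally closing the user-access window; a hedged sketch of the intended pairing:

	unsigned long ac_flags;

	ac_flags = smap_save();		/* save EFLAGS, then CLAC closes the window */
	/* ... code that must not run with user access enabled ... */
	smap_restore(ac_flags);		/* restore EFLAGS.AC to the caller's state */
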
Userland * %gs is always saved and restored on kernel entry and exit using * swapgs, so stack protector doesn't add any complexity there. * @@ -64,7 +64,7 @@ static __always_inline void boot_init_stack_canary(void) u64 tsc; #ifdef CONFIG_X86_64 - BUILD_BUG_ON(offsetof(union irq_stack_union, stack_canary) != 40); + BUILD_BUG_ON(offsetof(struct fixed_percpu_data, stack_canary) != 40); #endif /* * We both use the random pool and the current TSC as a source @@ -79,7 +79,7 @@ static __always_inline void boot_init_stack_canary(void) current->stack_canary = canary; #ifdef CONFIG_X86_64 - this_cpu_write(irq_stack_union.stack_canary, canary); + this_cpu_write(fixed_percpu_data.stack_canary, canary); #else this_cpu_write(stack_canary.canary, canary); #endif diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f335aad404a4..a8d0cdf48616 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -9,6 +9,8 @@ #include <linux/uaccess.h> #include <linux/ptrace.h> + +#include <asm/cpu_entry_area.h> #include <asm/switch_to.h> enum stack_type { @@ -98,19 +100,6 @@ struct stack_frame_ia32 { u32 return_address; }; -static inline unsigned long caller_frame_pointer(void) -{ - struct stack_frame *frame; - - frame = __builtin_frame_address(0); - -#ifdef CONFIG_FRAME_POINTER - frame = frame->next_frame; -#endif - - return (unsigned long)frame; -} - void show_opcodes(struct pt_regs *regs, const char *loglvl); void show_ip(struct pt_regs *regs, const char *loglvl); #endif /* _ASM_X86_STACKTRACE_H */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 7cf1a270d891..18a4b6890fa8 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -46,6 +46,7 @@ struct inactive_task_frame { unsigned long r13; unsigned long r12; #else + unsigned long flags; unsigned long si; unsigned long di; #endif diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index 2fe745356fb1..6d8d6bc183b7 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -14,6 +14,8 @@ * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). */ +#include <asm/rmwcc.h> + #define ADDR (*(volatile long *)addr) /** @@ -29,7 +31,7 @@ */ static inline void sync_set_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; bts %1,%0" + asm volatile("lock; " __ASM_SIZE(bts) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -47,7 +49,7 @@ static inline void sync_set_bit(long nr, volatile unsigned long *addr) */ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btr %1,%0" + asm volatile("lock; " __ASM_SIZE(btr) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -64,7 +66,7 @@ static inline void sync_clear_bit(long nr, volatile unsigned long *addr) */ static inline void sync_change_bit(long nr, volatile unsigned long *addr) { - asm volatile("lock; btc %1,%0" + asm volatile("lock; " __ASM_SIZE(btc) " %1,%0" : "+m" (ADDR) : "Ir" (nr) : "memory"); @@ -78,14 +80,9 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) +static inline bool sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; bts %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(bts), *addr, c, "Ir", nr); } /** @@ -98,12 +95,7 @@ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; btr %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btr), *addr, c, "Ir", nr); } /** @@ -116,12 +108,7 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; - - asm volatile("lock; btc %2,%1\n\tsetc %0" - : "=qm" (oldbit), "+m" (ADDR) - : "Ir" (nr) : "memory"); - return oldbit; + return GEN_BINARY_RMWcc("lock; " __ASM_SIZE(btc), *addr, c, "Ir", nr); } #define sync_test_bit(nr, addr) test_bit(nr, addr) diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index e85ff65c43c3..c90678fd391a 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -18,7 +18,7 @@ static inline void apply_paravirt(struct paravirt_patch_site *start, #define __parainstructions_end NULL #endif -extern void *text_poke_early(void *addr, const void *opcode, size_t len); +extern void text_poke_early(void *addr, const void *opcode, size_t len); /* * Clear and restore the kernel write-protection flag on the local CPU. @@ -35,8 +35,11 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * inconsistent instruction while you patch. 
*/ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); -extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); extern int after_bootmem; +extern __ro_after_init struct mm_struct *poking_mm; +extern __ro_after_init unsigned long poking_addr; #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e0eccbcb8447..f9453536f9bb 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -88,6 +88,7 @@ struct thread_info { #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_PATCH_PENDING 13 /* pending live patching update */ +#define TIF_NEED_FPU_LOAD 14 /* load FPU on return to userspace */ #define TIF_NOCPUID 15 /* CPUID is not accessible in userland */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ @@ -117,6 +118,7 @@ struct thread_info { #define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) +#define _TIF_NEED_FPU_LOAD (1 << TIF_NEED_FPU_LOAD) #define _TIF_NOCPUID (1 << TIF_NOCPUID) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h index 404b8b1d44f5..f23e7aaff4cd 100644 --- a/arch/x86/include/asm/tlb.h +++ b/arch/x86/include/asm/tlb.h @@ -6,6 +6,7 @@ #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) +#define tlb_flush tlb_flush static inline void tlb_flush(struct mmu_gather *tlb); #include <asm-generic/tlb.h> diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index f4204bf377fc..dee375831962 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -167,7 +167,7 @@ struct tlb_state { */ struct mm_struct *loaded_mm; -#define LOADED_MM_SWITCHING ((struct mm_struct *)1) +#define LOADED_MM_SWITCHING ((struct mm_struct *)1UL) /* Last user mm for optimizing IBPB */ union { @@ -274,6 +274,8 @@ static inline bool nmi_uaccess_okay(void) return true; } +#define nmi_uaccess_okay nmi_uaccess_okay + /* Initialize cr4 shadow for this CPU. 
*/ static inline void cr4_init_shadow(void) { diff --git a/arch/x86/include/asm/trace/exceptions.h b/arch/x86/include/asm/trace/exceptions.h index e0e6d7f21399..6b1e87194809 100644 --- a/arch/x86/include/asm/trace/exceptions.h +++ b/arch/x86/include/asm/trace/exceptions.h @@ -30,7 +30,7 @@ DECLARE_EVENT_CLASS(x86_exceptions, __entry->error_code = error_code; ), - TP_printk("address=%pf ip=%pf error_code=0x%lx", + TP_printk("address=%ps ip=%ps error_code=0x%lx", (void *)__entry->address, (void *)__entry->ip, __entry->error_code) ); diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 069c04be1507..879b77792f94 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -13,22 +13,22 @@ DECLARE_EVENT_CLASS(x86_fpu, TP_STRUCT__entry( __field(struct fpu *, fpu) - __field(bool, initialized) + __field(bool, load_fpu) __field(u64, xfeatures) __field(u64, xcomp_bv) ), TP_fast_assign( __entry->fpu = fpu; - __entry->initialized = fpu->initialized; + __entry->load_fpu = test_thread_flag(TIF_NEED_FPU_LOAD); if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p initialized: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p load: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, - __entry->initialized, + __entry->load_fpu, __entry->xfeatures, __entry->xcomp_bv ) @@ -64,11 +64,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_regs_deactivated, TP_ARGS(fpu) ); -DEFINE_EVENT(x86_fpu, x86_fpu_activate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 1954dd5552a2..c82abd6e4ca3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -427,10 +427,11 @@ do { \ ({ \ __label__ __pu_label; \ int __pu_err = -EFAULT; \ - __typeof__(*(ptr)) __pu_val; \ - __pu_val = x; \ + __typeof__(*(ptr)) __pu_val = (x); \ + __typeof__(ptr) __pu_ptr = (ptr); \ + __typeof__(size) __pu_size = (size); \ __uaccess_begin(); \ - __put_user_size(__pu_val, (ptr), (size), __pu_label); \ + __put_user_size(__pu_val, __pu_ptr, __pu_size, __pu_label); \ __pu_err = 0; \ __pu_label: \ __uaccess_end(); \ @@ -585,7 +586,6 @@ extern void __cmpxchg_wrong_size(void) #define __user_atomic_cmpxchg_inatomic(uval, ptr, old, new, size) \ ({ \ int __ret = 0; \ - __typeof__(ptr) __uval = (uval); \ __typeof__(*(ptr)) __old = (old); \ __typeof__(*(ptr)) __new = (new); \ __uaccess_begin_nospec(); \ @@ -661,7 +661,7 @@ extern void __cmpxchg_wrong_size(void) __cmpxchg_wrong_size(); \ } \ __uaccess_end(); \ - *__uval = __old; \ + *(uval) = __old; \ __ret; \ }) @@ -705,7 +705,7 @@ extern struct movsl_mask { * checking before using them, but you have to surround them with the * user_access_begin/end() pair. 
*/ -static __must_check inline bool user_access_begin(const void __user *ptr, size_t len) +static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; @@ -715,6 +715,9 @@ static __must_check inline bool user_access_begin(const void __user *ptr, size_t #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() +#define user_access_save() smap_save() +#define user_access_restore(x) smap_restore(x) + #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index a9d637bc301d..5cd1caa8bc65 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -208,9 +208,6 @@ __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) } unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len); - -unsigned long mcsafe_handle_tail(char *to, char *from, unsigned len); #endif /* _ASM_X86_UACCESS_64_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 2863c2026655..d50c7b747d8b 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -217,6 +217,22 @@ xen_single_call(unsigned int call, return (long)__res; } +static __always_inline void __xen_stac(void) +{ + /* + * Suppress objtool seeing the STAC/CLAC and getting confused about it + * calling random code with AC=1. + */ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_STAC ::: "memory", "flags"); +} + +static __always_inline void __xen_clac(void) +{ + asm volatile(ANNOTATE_IGNORE_ALTERNATIVE + ASM_CLAC ::: "memory", "flags"); +} + static inline long privcmd_call(unsigned int call, unsigned long a1, unsigned long a2, @@ -225,9 +241,9 @@ privcmd_call(unsigned int call, { long res; - stac(); + __xen_stac(); res = xen_single_call(call, a1, a2, a3, a4, a5); - clac(); + __xen_clac(); return res; } @@ -424,9 +440,9 @@ HYPERVISOR_dm_op( domid_t dom, unsigned int nr_bufs, struct xen_dm_op_buf *bufs) { int ret; - stac(); + __xen_stac(); ret = _hypercall3(int, dm_op, dom, nr_bufs, bufs); - clac(); + __xen_clac(); return ret; } diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index dabfcf7c3941..7a0e64ccd6ff 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -381,6 +381,7 @@ struct kvm_sync_regs { #define KVM_X86_QUIRK_LINT0_REENABLED (1 << 0) #define KVM_X86_QUIRK_CD_NW_CLEARED (1 << 1) #define KVM_X86_QUIRK_LAPIC_MMIO_HOLE (1 << 2) +#define KVM_X86_QUIRK_OUT_7E_INC_RIP (1 << 3) #define KVM_STATE_NESTED_GUEST_MODE 0x00000001 #define KVM_STATE_NESTED_RUN_PENDING 0x00000002 diff --git a/arch/x86/include/uapi/asm/perf_regs.h b/arch/x86/include/uapi/asm/perf_regs.h index f3329cabce5c..ac67bbea10ca 100644 --- a/arch/x86/include/uapi/asm/perf_regs.h +++ b/arch/x86/include/uapi/asm/perf_regs.h @@ -27,8 +27,29 @@ enum perf_event_x86_regs { PERF_REG_X86_R13, PERF_REG_X86_R14, PERF_REG_X86_R15, - + /* These are the limits for the GPRs. 
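
The user_access_save()/user_access_restore() wrappers above map onto those SMAP helpers, and user_access_begin()/user_access_end() bracket the unsafe_*_user() accessors. A minimal, hedged usage sketch; the efault label and variables are illustrative:

	if (!user_access_begin(uptr, sizeof(*uptr)))
		return -EFAULT;
	unsafe_put_user(val, uptr, efault);	/* branches to efault on a fault */
	user_access_end();
	return 0;

efault:
	user_access_end();
	return -EFAULT;
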
*/ PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1, PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1, + + /* These all need two bits set because they are 128-bit */ + PERF_REG_X86_XMM0 = 32, + PERF_REG_X86_XMM1 = 34, + PERF_REG_X86_XMM2 = 36, + PERF_REG_X86_XMM3 = 38, + PERF_REG_X86_XMM4 = 40, + PERF_REG_X86_XMM5 = 42, + PERF_REG_X86_XMM6 = 44, + PERF_REG_X86_XMM7 = 46, + PERF_REG_X86_XMM8 = 48, + PERF_REG_X86_XMM9 = 50, + PERF_REG_X86_XMM10 = 52, + PERF_REG_X86_XMM11 = 54, + PERF_REG_X86_XMM12 = 56, + PERF_REG_X86_XMM13 = 58, + PERF_REG_X86_XMM14 = 60, + PERF_REG_X86_XMM15 = 62, + + /* These include both GPRs and XMM registers */ + PERF_REG_X86_XMM_MAX = PERF_REG_X86_XMM15 + 2, }; #endif /* _ASM_X86_PERF_REGS_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index f0b0c90dd398..d213ec5c3766 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -146,6 +146,7 @@ #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 #define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 +#define VMX_ABORT_VMCS_CORRUPTED 3 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 158ad1483c43..cb6e076a6d39 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -51,6 +51,18 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, if (c->x86_vendor == X86_VENDOR_INTEL && (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f))) flags->bm_control = 0; + /* + * For all recent Centaur CPUs, the ucode will make sure that the + * cores keep cache coherence with each other while entering C3 + * type state. So, set bm_check to 1 to indicate that the kernel + * doesn't need to execute a cache flush operation (WBINVD) when + * entering C3 type state. + */ + if (c->x86_vendor == X86_VENDOR_CENTAUR) { + if (c->x86 > 6 || (c->x86 == 6 && c->x86_model == 0x0f && + c->x86_stepping >= 0x0e)) + flags->bm_check = 1; + } } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 9a79c7808f9c..7b9b49dfc05a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/kdebug.h> #include <linux/kprobes.h> +#include <linux/mmu_context.h> #include <asm/text-patching.h> #include <asm/alternative.h> #include <asm/sections.h> @@ -264,7 +265,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; -void *text_poke_early(void *addr, const void *opcode, size_t len); +void text_poke_early(void *addr, const void *opcode, size_t len); /* * Are we looking at a near JMP with a 1 or 4-byte displacement. @@ -666,16 +667,136 @@ void __init alternative_instructions(void) * instructions. And on the local CPU you need to be protected against NMI or MCE * handlers seeing an inconsistent instruction while you patch. */ -void *__init_or_module text_poke_early(void *addr, const void *opcode, - size_t len) +void __init_or_module text_poke_early(void *addr, const void *opcode, + size_t len) { unsigned long flags; + + if (boot_cpu_has(X86_FEATURE_NX) && + is_module_text_address((unsigned long)addr)) { + /* + * Module text is marked initially as non-executable, so the + * code cannot be running and speculative code-fetches are + * prevented. Just change the code.
+ */ + memcpy(addr, opcode, len); + } else { + local_irq_save(flags); + memcpy(addr, opcode, len); + local_irq_restore(flags); + sync_core(); + + /* + * Could also do a CLFLUSH here to speed up CPU recovery; but + * that causes hangs on some VIA CPUs. + */ + } +} + +__ro_after_init struct mm_struct *poking_mm; +__ro_after_init unsigned long poking_addr; + +static void *__text_poke(void *addr, const void *opcode, size_t len) +{ + bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; + struct page *pages[2] = {NULL}; + temp_mm_state_t prev; + unsigned long flags; + pte_t pte, *ptep; + spinlock_t *ptl; + pgprot_t pgprot; + + /* + * While boot memory allocator is running we cannot use struct pages as + * they are not yet initialized. There is no way to recover. + */ + BUG_ON(!after_bootmem); + + if (!core_kernel_text((unsigned long)addr)) { + pages[0] = vmalloc_to_page(addr); + if (cross_page_boundary) + pages[1] = vmalloc_to_page(addr + PAGE_SIZE); + } else { + pages[0] = virt_to_page(addr); + WARN_ON(!PageReserved(pages[0])); + if (cross_page_boundary) + pages[1] = virt_to_page(addr + PAGE_SIZE); + } + /* + * If something went wrong, crash and burn since recovery paths are not + * implemented. + */ + BUG_ON(!pages[0] || (cross_page_boundary && !pages[1])); + local_irq_save(flags); + + /* + * Map the page without the global bit, as TLB flushing is done with + * flush_tlb_mm_range(), which is intended for non-global PTEs. + */ + pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL); + + /* + * The lock is not really needed, but it avoids open-coding. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + + /* + * This must not fail; preallocated in poking_init(). + */ + VM_BUG_ON(!ptep); + + pte = mk_pte(pages[0], pgprot); + set_pte_at(poking_mm, poking_addr, ptep, pte); + + if (cross_page_boundary) { + pte = mk_pte(pages[1], pgprot); + set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + } + + /* + * Loading the temporary mm behaves as a compiler barrier, which + * guarantees that the PTE will be set at the time memcpy() is done. + */ + prev = use_temporary_mm(poking_mm); + + kasan_disable_current(); + memcpy((u8 *)poking_addr + offset_in_page(addr), opcode, len); + kasan_enable_current(); + + /* + * Ensure that the PTE is only cleared after the instructions of memcpy + * were issued by using a compiler barrier. + */ + barrier(); + + pte_clear(poking_mm, poking_addr, ptep); + if (cross_page_boundary) + pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + + /* + * Loading the previous page-table hierarchy requires a serializing + * instruction that already allows the core to see the updated version. + * Xen-PV is assumed to serialize execution in a similar manner. + */ + unuse_temporary_mm(prev); + + /* + * Flushing the TLB might involve IPIs, which would require enabled + * IRQs, but not if the mm is not used, as it is at this point. + */ + flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, + PAGE_SHIFT, false); + + /* + * If the text does not match what we just wrote then something is + * fundamentally screwy; there's nothing we can really do about that. + */ + BUG_ON(memcmp(addr, opcode, len)); + + pte_unmap_unlock(ptep, ptl); local_irq_restore(flags); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs.
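
Since text_poke() asserts that text_mutex is held, while text_poke_kgdb() relies on kgdb having stopped every other CPU, a typical non-kgdb caller is expected to look roughly like this sketch:

	mutex_lock(&text_mutex);
	text_poke(addr, new_insn, len);
	mutex_unlock(&text_mutex);
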
*/ return addr; } @@ -689,48 +810,36 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, * It means the size must be writable atomically and the address must be aligned * in a way that permits an atomic write. It also makes sure we fit on a single * page. + * + * Note that the caller must ensure that if the modified code is part of a + * module, the module would not be removed during poking. This can be achieved + * by registering a module notifier, and ordering module removal and patching + * through a mutex. */ void *text_poke(void *addr, const void *opcode, size_t len) { - unsigned long flags; - char *vaddr; - struct page *pages[2]; - int i; - - /* - * While boot memory allocator is runnig we cannot use struct - * pages as they are not yet initialized. - */ - BUG_ON(!after_bootmem); - lockdep_assert_held(&text_mutex); - if (!core_kernel_text((unsigned long)addr)) { - pages[0] = vmalloc_to_page(addr); - pages[1] = vmalloc_to_page(addr + PAGE_SIZE); - } else { - pages[0] = virt_to_page(addr); - WARN_ON(!PageReserved(pages[0])); - pages[1] = virt_to_page(addr + PAGE_SIZE); - } - BUG_ON(!pages[0]); - local_irq_save(flags); - set_fixmap(FIX_TEXT_POKE0, page_to_phys(pages[0])); - if (pages[1]) - set_fixmap(FIX_TEXT_POKE1, page_to_phys(pages[1])); - vaddr = (char *)fix_to_virt(FIX_TEXT_POKE0); - memcpy(&vaddr[(unsigned long)addr & ~PAGE_MASK], opcode, len); - clear_fixmap(FIX_TEXT_POKE0); - if (pages[1]) - clear_fixmap(FIX_TEXT_POKE1); - local_flush_tlb(); - sync_core(); - /* Could also do a CLFLUSH here to speed up CPU recovery; but - that causes hangs on some VIA CPUs. */ - for (i = 0; i < len; i++) - BUG_ON(((char *)addr)[i] != ((char *)opcode)[i]); - local_irq_restore(flags); - return addr; + return __text_poke(addr, opcode, len); +} + +/** + * text_poke_kgdb - Update instructions on a live kernel by kgdb + * @addr: address to modify + * @opcode: source of the copy + * @len: length to copy + * + * Only atomic text poke/set should be allowed when not doing early patching. + * It means the size must be writable atomically and the address must be aligned + * in a way that permits an atomic write. It also makes sure we fit on a single + * page. + * + * Context: should only be used by kgdb, which ensures no other core is running, + * despite the fact it does not hold the text_mutex. + */ +void *text_poke_kgdb(void *addr, const void *opcode, size_t len) +{ + return __text_poke(addr, opcode, len); } static void do_sync_core(void *info) @@ -788,7 +897,7 @@ NOKPROBE_SYMBOL(poke_int3_handler); * replacing opcode * - sync cores */ -void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +void text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) { unsigned char int3 = 0xcc; @@ -830,7 +939,5 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) * the writing of the new instruction.
*/ bp_patching_in_progress = false; - - return addr; } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b7bcdd781651..ab6af775f06c 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -802,6 +802,24 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc) return 0; } +static int __init lapic_init_clockevent(void) +{ + if (!lapic_timer_frequency) + return -1; + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, + TICK_NSEC, lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); + lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + lapic_clockevent.min_delta_ticks = 0xF; + + return 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = this_cpu_ptr(&lapic_events); @@ -810,25 +828,21 @@ static int __init calibrate_APIC_clock(void) long delta, deltatsc; int pm_referenced = 0; - /** - * check if lapic timer has already been calibrated by platform - * specific routine, such as tsc calibration code. if so, we just fill + if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) + return 0; + + /* + * Check if lapic timer has already been calibrated by platform + * specific routine, such as tsc calibration code. If so just fill * in the clockevent structure and return. */ - - if (boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER)) { - return 0; - } else if (lapic_timer_frequency) { + if (!lapic_init_clockevent()) { apic_printk(APIC_VERBOSE, "lapic timer already calibrated %d\n", - lapic_timer_frequency); - lapic_clockevent.mult = div_sc(lapic_timer_frequency/APIC_DIVISOR, - TICK_NSEC, lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; + lapic_timer_frequency); + /* + * Direct calibration methods must have an always running + * local APIC timer, no need for broadcast timer. + */ lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; return 0; } @@ -869,17 +883,8 @@ static int __init calibrate_APIC_clock(void) pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, &delta, &deltatsc); - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent); - lapic_clockevent.max_delta_ticks = 0x7FFFFFFF; - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - lapic_clockevent.min_delta_ticks = 0xF; - lapic_timer_frequency = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + lapic_init_clockevent(); apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); apic_printk(APIC_VERBOSE, "..... 
mult: %u\n", lapic_clockevent.mult); diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 78778b54f904..a5464b8b6c46 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -175,7 +175,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) this_cpu_write(cpu_llc_id, node); /* Account for nodes per socket in multi-core-module processors */ - if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { + if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { rdmsrl(MSR_FAM10H_NODE_ID, val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index ddced33184b5..d3d075226c0a 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -68,10 +68,12 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) - + offsetof(struct cea_exception_stacks, DB1_stack)); BLANK(); #ifdef CONFIG_STACKPROTECTOR - DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + DEFINE(stack_canary_offset, offsetof(struct fixed_percpu_data, stack_canary)); BLANK(); #endif diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index cfd24f9f7614..1796d2bdcaaa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -28,7 +28,7 @@ obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o intel_pconfig.o intel_epb.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 01004bfb1a1b..fb6a64bd765f 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -82,11 +82,14 @@ static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) * performance at the same time.. 
*/ +#ifdef CONFIG_X86_32 extern __visible void vide(void); -__asm__(".globl vide\n" +__asm__(".text\n" + ".globl vide\n" ".type vide, @function\n" ".align 4\n" "vide: ret\n"); +#endif static void init_amd_k5(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 804c49493938..64d5aec24203 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -83,7 +83,7 @@ unsigned int aperfmperf_get_khz(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; aperfmperf_snapshot_cpu(cpu, ktime_get(), true); @@ -99,7 +99,7 @@ void arch_freq_prepare_all(void) if (!cpu_khz) return; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return; for_each_online_cpu(cpu) @@ -115,7 +115,7 @@ unsigned int arch_freq_get_on_cpu(int cpu) if (!cpu_khz) return 0; - if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + if (!boot_cpu_has(X86_FEATURE_APERFMPERF)) return 0; if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 2da82eff0eb4..29630393f300 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -275,7 +275,7 @@ static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; -} v2_user_options[] __initdata = { +} v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, @@ -419,7 +419,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] __initdata = { +} mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -440,7 +440,8 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || + cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); @@ -658,7 +659,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initdata = { +} ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ @@ -672,7 +673,8 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) char arg[20]; int ret, i; - if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable")) { + if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || + cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", @@ -1008,6 +1010,11 @@ static void __init l1tf_select_mitigation(void) if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; + if (cpu_mitigations_off()) + l1tf_mitigation = L1TF_MITIGATION_OFF; + else if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { diff --git a/arch/x86/kernel/cpu/common.c 
b/arch/x86/kernel/cpu/common.c index cb28e98a0659..8739bdfe9bdf 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -372,6 +372,8 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { + struct pkru_state *pk; + /* check the boot processor, plus compile options for PKU: */ if (!cpu_feature_enabled(X86_FEATURE_PKU)) return; @@ -382,6 +384,9 @@ static __always_inline void setup_pku(struct cpuinfo_x86 *c) return; cr4_set_bits(X86_CR4_PKE); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (pk) + pk->pkru = init_pkru_value; /* * Setting X86_CR4_PKE will cause the X86_FEATURE_OSPKE * cpuid bit to be set. We need to ensure that we @@ -507,19 +512,6 @@ void load_percpu_segment(int cpu) DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif -#ifdef CONFIG_X86_64 -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; -#endif - /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) { @@ -1511,9 +1503,9 @@ static __init int setup_clearcpuid(char *arg) __setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 -DEFINE_PER_CPU_FIRST(union irq_stack_union, - irq_stack_union) __aligned(PAGE_SIZE) __visible; -EXPORT_PER_CPU_SYMBOL_GPL(irq_stack_union); +DEFINE_PER_CPU_FIRST(struct fixed_percpu_data, + fixed_percpu_data) __aligned(PAGE_SIZE) __visible; +EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data); /* * The following percpu variables are hot. Align current_task to @@ -1523,9 +1515,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); -DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; - +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; @@ -1562,23 +1552,7 @@ void syscall_init(void) X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT); } -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -static DEFINE_PER_CPU(unsigned long, debug_stack_addr); DEFINE_PER_CPU(int, debug_stack_usage); - -int is_debug_stack(unsigned long addr) -{ - return __this_cpu_read(debug_stack_usage) || - (addr <= __this_cpu_read(debug_stack_addr) && - addr > (__this_cpu_read(debug_stack_addr) - DEBUG_STKSZ)); -} -NOKPROBE_SYMBOL(is_debug_stack); - DEFINE_PER_CPU(u32, debug_idt_ctr); void debug_stack_set_zero(void) @@ -1668,7 +1642,7 @@ static void setup_getcpu(int cpu) unsigned long cpudata = vdso_encode_cpunode(cpu, early_cpu_to_node(cpu)); struct desc_struct d = { }; - if (static_cpu_has(X86_FEATURE_RDTSCP)) + if (boot_cpu_has(X86_FEATURE_RDTSCP)) write_rdtscp_aux(cpudata); /* Store CPU and node number in limit. */ @@ -1690,17 +1664,14 @@ static void setup_getcpu(int cpu) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across.
- * A lot of state is already set up in PDA init for 64 bit */ #ifdef CONFIG_X86_64 void cpu_init(void) { - struct orig_ist *oist; + int cpu = raw_smp_processor_id(); struct task_struct *me; struct tss_struct *t; - unsigned long v; - int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); @@ -1715,7 +1686,6 @@ void cpu_init(void) load_ucode_ap(); t = &per_cpu(cpu_tss_rw, cpu); - oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA if (this_cpu_read(numa_node) == 0 && @@ -1753,16 +1723,11 @@ void cpu_init(void) /* * set up and load the per-CPU TSS */ - if (!oist->ist[0]) { - char *estacks = get_cpu_entry_area(cpu)->exception_stacks; - - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - estacks += exception_stack_sizes[v]; - oist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - if (v == DEBUG_STACK-1) - per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; - } + if (!t->x86_tss.ist[0]) { + t->x86_tss.ist[IST_INDEX_DF] = __this_cpu_ist_top_va(DF); + t->x86_tss.ist[IST_INDEX_NMI] = __this_cpu_ist_top_va(NMI); + t->x86_tss.ist[IST_INDEX_DB] = __this_cpu_ist_top_va(DB); + t->x86_tss.ist[IST_INDEX_MCE] = __this_cpu_ist_top_va(MCE); } t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; @@ -1864,23 +1829,6 @@ void cpu_init(void) } #endif -static void bsp_resume(void) -{ - if (this_cpu->c_bsp_resume) - this_cpu->c_bsp_resume(&boot_cpu_data); -} - -static struct syscore_ops cpu_syscore_ops = { - .resume = bsp_resume, -}; - -static int __init init_cpu_syscore(void) -{ - register_syscore_ops(&cpu_syscore_ops); - return 0; -} -core_initcall(init_cpu_syscore); - /* * The microcode loader calls this upon late microcode load to recheck features, * only when microcode has been updated. Caller holds microcode_mutex and CPU diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 5eb946b9a9f3..c0e2407abdd6 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -14,7 +14,6 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); void (*c_detect_tlb)(struct cpuinfo_x86 *); - void (*c_bsp_resume)(struct cpuinfo_x86 *); int c_x86_vendor; #ifdef CONFIG_X86_32 /* Optional vendor specific routine to obtain the cache size. */ diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index cf25405444ab..415621ddb8a2 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -19,6 +19,8 @@ #include "cpu.h" +#define APICID_SOCKET_ID_BIT 6 + /* * nodes_per_socket: Stores the number of nodes per socket. * Refer to CPUID Fn8000_001E_ECX Node Identifiers[10:8] @@ -87,6 +89,9 @@ static void hygon_get_topology(struct cpuinfo_x86 *c) if (!err) c->x86_coreid_bits = get_count_order(c->x86_max_cores); + /* Socket ID is ApicId[6] for these processors. */ + c->phys_proc_id = c->apicid >> APICID_SOCKET_ID_BIT; + cacheinfo_hygon_init_llc_id(c, cpu, node_id); } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fc3c07fe7df5..f17c1a714779 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -596,36 +596,6 @@ detect_keyid_bits: c->x86_phys_bits -= keyid_bits; } -static void init_intel_energy_perf(struct cpuinfo_x86 *c) -{ - u64 epb; - - /* - * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized. - * (x86_energy_perf_policy(8) is available to change it at run-time.) 
- */ - if (!cpu_has(c, X86_FEATURE_EPB)) - return; - - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); - if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE) - return; - - pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); - pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n"); - epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL; - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); -} - -static void intel_bsp_resume(struct cpuinfo_x86 *c) -{ - /* - * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume, - * so reinitialize it properly like during bootup: - */ - init_intel_energy_perf(c); -} - static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; @@ -763,8 +733,6 @@ static void init_intel(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_TME)) detect_tme(c); - init_intel_energy_perf(c); - init_intel_misc_features(c); } @@ -1023,9 +991,7 @@ static const struct cpu_dev intel_cpu_dev = { .c_detect_tlb = intel_detect_tlb, .c_early_init = early_init_intel, .c_init = init_intel, - .c_bsp_resume = intel_bsp_resume, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); - diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c new file mode 100644 index 000000000000..f4dd73396f28 --- /dev/null +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Intel Performance and Energy Bias Hint support. + * + * Copyright (C) 2019 Intel Corporation + * + * Author: + * Rafael J. Wysocki <rafael.j.wysocki@intel.com> + */ + +#include <linux/cpuhotplug.h> +#include <linux/cpu.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/string.h> +#include <linux/syscore_ops.h> +#include <linux/pm.h> + +#include <asm/cpufeature.h> +#include <asm/msr.h> + +/** + * DOC: overview + * + * The Performance and Energy Bias Hint (EPB) allows software to specify its + * preference with respect to the power-performance tradeoffs present in the + * processor. Generally, the EPB is expected to be set by user space (directly + * via sysfs or with the help of the x86_energy_perf_policy tool), but there are + * two reasons for the kernel to update it. + * + * First, there are systems where the platform firmware resets the EPB during + * system-wide transitions from sleep states back into the working state + * effectively causing the previous EPB updates by user space to be lost. + * Thus the kernel needs to save the current EPB values for all CPUs during + * system-wide transitions to sleep states and restore them on the way back to + * the working state. That can be achieved by saving EPB for secondary CPUs + * when they are taken offline during transitions into system sleep states and + * for the boot CPU in a syscore suspend operation, so that it can be restored + * for the boot CPU in a syscore resume operation and for the other CPUs when + * they are brought back online. However, CPUs that are already offline when + * a system-wide PM transition is started are not taken offline again, but their + * EPB values may still be reset by the platform firmware during the transition, + * so in fact it is necessary to save the EPB of any CPU taken offline and to + * restore it when the given CPU goes back online at all times. + * + * Second, on many systems the initial EPB value coming from the platform + * firmware is 0 ('performance') and at least on some of them that is because + * the platform firmware does not initialize EPB at all with the assumption that + * the OS will do that anyway. 
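
As a concrete illustration (not part of this patch), the sysfs knob created later in this file can be driven from user space as sketched below; the path follows from merging the "power"-named attribute group into the CPU device, and energy_perf_bias_store() accepts either one of the policy strings or a raw value 0..15:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/sys/devices/system/cpu/cpu0/power/energy_perf_bias";
        char buf[16];
        ssize_t n;
        int fd = open(path, O_RDWR);

        if (fd < 0)
            return 1;
        /* either a policy string ("balance-power") or a number 0..15 */
        if (write(fd, "balance-power", strlen("balance-power")) < 0)
            perror("write");
        n = pread(fd, buf, sizeof(buf) - 1, 0);
        if (n > 0) {
            buf[n] = '\0';
            printf("EPB is now %s", buf);  /* show() prints the raw MSR value */
        }
        close(fd);
        return 0;
    }
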
That sometimes is problematic, as it may cause + * the system battery to drain too fast, for example, so it is better to adjust + * it on CPU bring-up and if the initial EPB value for a given CPU is 0, the + * kernel changes it to 6 ('normal'). + */ + +static DEFINE_PER_CPU(u8, saved_epb); + +#define EPB_MASK 0x0fULL +#define EPB_SAVED 0x10ULL +#define MAX_EPB EPB_MASK + +static int intel_epb_save(void) +{ + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + /* + * Ensure that saved_epb will always be nonzero after this write even if + * the EPB value read from the MSR is 0. + */ + this_cpu_write(saved_epb, (epb & EPB_MASK) | EPB_SAVED); + + return 0; +} + +static void intel_epb_restore(void) +{ + u64 val = this_cpu_read(saved_epb); + u64 epb; + + rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + if (val) { + val &= EPB_MASK; + } else { + /* + * Because intel_epb_save() has not run for the current CPU yet, + * it is going online for the first time, so if its EPB value is + * 0 ('performance') at this point, assume that it has not been + * initialized by the platform firmware and set it to 6 + * ('normal'). + */ + val = epb & EPB_MASK; + if (val == ENERGY_PERF_BIAS_PERFORMANCE) { + val = ENERGY_PERF_BIAS_NORMAL; + pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); + } + } + wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); +} + +static struct syscore_ops intel_epb_syscore_ops = { + .suspend = intel_epb_save, + .resume = intel_epb_restore, +}; + +static const char * const energy_perf_strings[] = { + "performance", + "balance-performance", + "normal", + "balance-power", + "power" +}; +static const u8 energ_perf_values[] = { + ENERGY_PERF_BIAS_PERFORMANCE, + ENERGY_PERF_BIAS_BALANCE_PERFORMANCE, + ENERGY_PERF_BIAS_NORMAL, + ENERGY_PERF_BIAS_BALANCE_POWERSAVE, + ENERGY_PERF_BIAS_POWERSAVE +}; + +static ssize_t energy_perf_bias_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + unsigned int cpu = dev->id; + u64 epb; + int ret; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + return sprintf(buf, "%llu\n", epb); +} + +static ssize_t energy_perf_bias_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpu = dev->id; + u64 epb, val; + int ret; + + ret = __sysfs_match_string(energy_perf_strings, + ARRAY_SIZE(energy_perf_strings), buf); + if (ret >= 0) + val = energ_perf_values[ret]; + else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) + return -EINVAL; + + ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + if (ret < 0) + return ret; + + ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + (epb & ~EPB_MASK) | val); + if (ret < 0) + return ret; + + return count; +} + +static DEVICE_ATTR_RW(energy_perf_bias); + +static struct attribute *intel_epb_attrs[] = { + &dev_attr_energy_perf_bias.attr, + NULL +}; + +static const struct attribute_group intel_epb_attr_group = { + .name = power_group_name, + .attrs = intel_epb_attrs +}; + +static int intel_epb_online(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + intel_epb_restore(); + if (!cpuhp_tasks_frozen) + sysfs_merge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + return 0; +} + +static int intel_epb_offline(unsigned int cpu) +{ + struct device *cpu_dev = get_cpu_device(cpu); + + if (!cpuhp_tasks_frozen) + sysfs_unmerge_group(&cpu_dev->kobj, &intel_epb_attr_group); + + intel_epb_save(); + return 0; +} + +static __init int intel_epb_init(void) +{ + int ret; + + if 
(!boot_cpu_has(X86_FEATURE_EPB)) + return -ENODEV; + + ret = cpuhp_setup_state(CPUHP_AP_X86_INTEL_EPB_ONLINE, + "x86/intel/epb:online", intel_epb_online, + intel_epb_offline); + if (ret < 0) + goto err_out_online; + + register_syscore_ops(&intel_epb_syscore_ops); + return 0; + +err_out_online: + cpuhp_remove_state(CPUHP_AP_X86_INTEL_EPB_ONLINE); + return ret; +} +subsys_initcall(intel_epb_init); diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index e64de5149e50..d904aafe6409 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -563,33 +563,59 @@ out: return offset; } +bool amd_filter_mce(struct mce *m) +{ + enum smca_bank_types bank_type = smca_get_bank_type(m->bank); + struct cpuinfo_x86 *c = &boot_cpu_data; + u8 xec = (m->status >> 16) & 0x3F; + + /* See Family 17h Models 10h-2Fh Erratum #1114. */ + if (c->x86 == 0x17 && + c->x86_model >= 0x10 && c->x86_model <= 0x2F && + bank_type == SMCA_IF && xec == 10) + return true; + + return false; +} + /* - * Turn off MC4_MISC thresholding banks on all family 0x15 models since - * they're not supported there. + * Turn off thresholding banks for the following conditions: + * - MC4_MISC thresholding is not supported on Family 0x15. + * - Prevent possible spurious interrupts from the IF bank on Family 0x17 + * Models 0x10-0x2F due to Erratum #1114. */ -void disable_err_thresholding(struct cpuinfo_x86 *c) +void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) { - int i; + int i, num_msrs; u64 hwcr; bool need_toggle; - u32 msrs[] = { - 0x00000413, /* MC4_MISC0 */ - 0xc0000408, /* MC4_MISC1 */ - }; + u32 msrs[NR_BLOCKS]; + + if (c->x86 == 0x15 && bank == 4) { + msrs[0] = 0x00000413; /* MC4_MISC0 */ + msrs[1] = 0xc0000408; /* MC4_MISC1 */ + num_msrs = 2; + } else if (c->x86 == 0x17 && + (c->x86_model >= 0x10 && c->x86_model <= 0x2F)) { - if (c->x86 != 0x15) + if (smca_get_bank_type(bank) != SMCA_IF) + return; + + msrs[0] = MSR_AMD64_SMCA_MCx_MISC(bank); + num_msrs = 1; + } else { return; + } rdmsrl(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); - if (need_toggle) wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ - for (i = 0; i < ARRAY_SIZE(msrs); i++) + for (i = 0; i < num_msrs; i++) msr_clear_bit(msrs[i], 62); /* restore old settings */ @@ -604,12 +630,12 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; - disable_err_thresholding(c); - for (bank = 0; bank < mca_cfg.banks; ++bank) { if (mce_flags.smca) smca_configure(bank, cpu); + disable_err_thresholding(c, bank); + for (block = 0; block < NR_BLOCKS; ++block) { address = get_block_address(address, low, high, bank, block); if (!address) diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index b7fb541a4873..5112a50e6486 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -460,23 +460,6 @@ static void mce_irq_work_cb(struct irq_work *entry) mce_schedule_work(); } -static void mce_report_event(struct pt_regs *regs) -{ - if (regs->flags & (X86_VM_MASK|X86_EFLAGS_IF)) { - mce_notify_irq(); - /* - * Triggering the work queue here is just an insurance - * policy in case the syscall exit notify handler - * doesn't run soon enough or ends up running on the - * wrong CPU (can happen when audit sleeps) - */ - mce_schedule_work(); - return; - } - - irq_work_queue(&mce_irq_work); -} - /* * Check if the address reported by the CPU is in a format we can parse. 
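
A self-contained model (illustrative only; SMCA_IF here is a stand-in for the kernel's enum smca_bank_types value) of the Erratum #1114 test that amd_filter_mce() applies above. A record for which this returns true is rejected before it reaches the MCE event pool:

    #include <stdbool.h>
    #include <stdint.h>

    #define SMCA_IF 1  /* stand-in for the kernel's enum value */

    static bool erratum_1114(uint8_t family, uint8_t model,
                             int bank_type, uint64_t mca_status)
    {
        uint8_t xec = (mca_status >> 16) & 0x3f;  /* extended error code */

        /* Family 17h Models 10h-2Fh: spurious IF-bank errors with XEC 10 */
        return family == 0x17 && model >= 0x10 && model <= 0x2f &&
               bank_type == SMCA_IF && xec == 10;
    }
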
* It would be possible to add code for most other cases, but all would @@ -712,19 +695,49 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); + + /* If this entry is not valid, ignore it */ if (!(m.status & MCI_STATUS_VAL)) continue; /* - * Uncorrected or signalled events are handled by the exception - * handler when it is enabled, so don't process those here. - * - * TBD do the same check for MCI_STATUS_EN here? + * If we are logging everything (at CPU online) or this + * is a corrected error, then we must log it. */ - if (!(flags & MCP_UC) && - (m.status & (mca_cfg.ser ? MCI_STATUS_S : MCI_STATUS_UC))) - continue; + if ((flags & MCP_UC) || !(m.status & MCI_STATUS_UC)) + goto log_it; + + /* + * Newer Intel systems that support software error + * recovery need to make additional checks. Other + * CPUs should skip over uncorrected errors, but log + * everything else. + */ + if (!mca_cfg.ser) { + if (m.status & MCI_STATUS_UC) + continue; + goto log_it; + } + /* Log "not enabled" (speculative) errors */ + if (!(m.status & MCI_STATUS_EN)) + goto log_it; + + /* + * Log UCNA (SDM: 15.6.3 "UCR Error Classification") + * UC == 1 && PCC == 0 && S == 0 + */ + if (!(m.status & MCI_STATUS_PCC) && !(m.status & MCI_STATUS_S)) + goto log_it; + + /* + * Skip anything else. Presumption is that our read of this + * bank is racing with a machine check. Leave the log alone + * for do_machine_check() to deal with it. + */ + continue; + +log_it: error_seen = true; mce_read_aux(&m, i); @@ -1301,7 +1314,8 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_panic("Fatal machine check on current CPU", &m, msg); if (worst > 0) - mce_report_event(regs); + irq_work_queue(&mce_irq_work); + mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); sync_core(); @@ -1451,13 +1465,12 @@ EXPORT_SYMBOL_GPL(mce_notify_irq); static int __mcheck_cpu_mce_banks_init(void) { int i; - u8 num_banks = mca_cfg.banks; - mce_banks = kcalloc(num_banks, sizeof(struct mce_bank), GFP_KERNEL); + mce_banks = kcalloc(MAX_NR_BANKS, sizeof(struct mce_bank), GFP_KERNEL); if (!mce_banks) return -ENOMEM; - for (i = 0; i < num_banks; i++) { + for (i = 0; i < MAX_NR_BANKS; i++) { struct mce_bank *b = &mce_banks[i]; b->ctl = -1ULL; @@ -1471,28 +1484,19 @@ static int __mcheck_cpu_mce_banks_init(void) */ static int __mcheck_cpu_cap_init(void) { - unsigned b; u64 cap; + u8 b; rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - if (!mca_cfg.banks) - pr_info("CPU supports %d MCE banks\n", b); - - if (b > MAX_NR_BANKS) { - pr_warn("Using only %u machine check banks out of %u\n", - MAX_NR_BANKS, b); + if (WARN_ON_ONCE(b > MAX_NR_BANKS)) b = MAX_NR_BANKS; - } - /* Don't support asymmetric configurations today */ - WARN_ON(mca_cfg.banks != 0 && b != mca_cfg.banks); - mca_cfg.banks = b; + mca_cfg.banks = max(mca_cfg.banks, b); if (!mce_banks) { int err = __mcheck_cpu_mce_banks_init(); - if (err) return err; } @@ -1771,6 +1775,14 @@ static void __mcheck_cpu_init_timer(void) mce_start_timer(t); } +bool filter_mce(struct mce *m) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return amd_filter_mce(m); + + return false; +} + /* Handle unconfigured int18 (should never happen) */ static void unexpected_machine_check(struct pt_regs *regs, long error_code) { @@ -2425,8 +2437,8 @@ static int fake_panic_set(void *data, u64 val) return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, - fake_panic_set, "%llu\n"); +DEFINE_DEBUGFS_ATTRIBUTE(fake_panic_fops, fake_panic_get, 
fake_panic_set, + "%llu\n"); static int __init mcheck_debugfs_init(void) { @@ -2435,8 +2447,8 @@ static int __init mcheck_debugfs_init(void) dmce = mce_get_debugfs_dir(); if (!dmce) return -ENOMEM; - ffake_panic = debugfs_create_file("fake_panic", 0444, dmce, NULL, - &fake_panic_fops); + ffake_panic = debugfs_create_file_unsafe("fake_panic", 0444, dmce, + NULL, &fake_panic_fops); if (!ffake_panic) return -ENOMEM; @@ -2451,6 +2463,8 @@ EXPORT_SYMBOL_GPL(mcsafe_key); static int __init mcheck_late_init(void) { + pr_info("Using %d MCE banks\n", mca_cfg.banks); + if (mca_cfg.recovery) static_branch_inc(&mcsafe_key); diff --git a/arch/x86/kernel/cpu/mce/genpool.c b/arch/x86/kernel/cpu/mce/genpool.c index 3395549c51d3..64d1d5a00f39 100644 --- a/arch/x86/kernel/cpu/mce/genpool.c +++ b/arch/x86/kernel/cpu/mce/genpool.c @@ -99,6 +99,9 @@ int mce_gen_pool_add(struct mce *mce) { struct mce_evt_llist *node; + if (filter_mce(mce)) + return -EINVAL; + if (!mce_evt_pool) return -EINVAL; diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 8492ef7d9015..a6026170af92 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -46,8 +46,6 @@ static struct mce i_mce; static struct dentry *dfs_inj; -static u8 n_banks; - #define MAX_FLAG_OPT_SIZE 4 #define NBCFG 0x44 @@ -528,7 +526,7 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && + if (boot_cpu_has(X86_FEATURE_AMD_DCM) && b == 4 && boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); @@ -570,9 +568,15 @@ err: static int inj_bank_set(void *data, u64 val) { struct mce *m = (struct mce *)data; + u8 n_banks; + u64 cap; + + /* Get bank count on target CPU so we can handle non-uniform values. */ + rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { - pr_err("Non-existent MCE bank: %llu\n", val); + pr_err("MCA bank %llu non-existent on CPU%d\n", val, m->extcpu); return -EINVAL; } @@ -665,10 +669,6 @@ static struct dfs_node { static int __init debugfs_init(void) { unsigned int i; - u64 cap; - - rdmsrl(MSR_IA32_MCG_CAP, cap); - n_banks = cap & MCG_BANKCNT_MASK; dfs_inj = debugfs_create_dir("mce-inject", NULL); if (!dfs_inj) diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index af5eab1e65e2..a34b55baa7aa 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -173,4 +173,13 @@ struct mca_msr_regs { extern struct mca_msr_regs msr_ops; +/* Decide whether to add MCE record to MCE event pool or filter it out. 
*/ +extern bool filter_mce(struct mce *m); + +#ifdef CONFIG_X86_MCE_AMD +extern bool amd_filter_mce(struct mce *m); +#else +static inline bool amd_filter_mce(struct mce *m) { return false; }; +#endif + #endif /* __X86_MCE_INTERNAL_H__ */ diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5260185cbf7b..c321f4f513f9 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -418,8 +418,9 @@ static int do_microcode_update(const void __user *buf, size_t size) if (ustate == UCODE_ERROR) { error = -1; break; - } else if (ustate == UCODE_OK) + } else if (ustate == UCODE_NEW) { apply_microcode_on_target(cpu); + } } return error; @@ -427,7 +428,7 @@ static int do_microcode_update(const void __user *buf, size_t size) static int microcode_open(struct inode *inode, struct file *file) { - return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; + return capable(CAP_SYS_RAWIO) ? stream_open(inode, file) : -EPERM; } static ssize_t microcode_write(struct file *file, const char __user *buf, diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 16936a24795c..a44bdbe7c55e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -31,6 +31,7 @@ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/uio.h> #include <linux/mm.h> #include <asm/microcode_intel.h> @@ -861,32 +862,33 @@ out: return ret; } -static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, struct iov_iter *iter) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; - int new_rev = uci->cpu_sig.rev; - unsigned int leftover = size; unsigned int curr_mc_size = 0, new_mc_size = 0; - unsigned int csig, cpf; enum ucode_state ret = UCODE_OK; + int new_rev = uci->cpu_sig.rev; + u8 *new_mc = NULL, *mc = NULL; + unsigned int csig, cpf; - while (leftover) { + while (iov_iter_count(iter)) { struct microcode_header_intel mc_header; - unsigned int mc_size; + unsigned int mc_size, data_size; + u8 *data; - if (leftover < sizeof(mc_header)) { - pr_err("error! Truncated header in microcode data file\n"); + if (!copy_from_iter_full(&mc_header, sizeof(mc_header), iter)) { + pr_err("error! Truncated or inaccessible header in microcode data file\n"); break; } - if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) - break; - mc_size = get_totalsize(&mc_header); - if (!mc_size || mc_size > leftover) { - pr_err("error! Bad data in microcode data file\n"); + if (mc_size < sizeof(mc_header)) { + pr_err("error! Bad data in microcode data file (totalsize too small)\n"); + break; + } + data_size = mc_size - sizeof(mc_header); + if (data_size > iov_iter_count(iter)) { + pr_err("error! 
Bad data in microcode data file (truncated file?)\n"); break; } @@ -899,7 +901,9 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, curr_mc_size = mc_size; } - if (get_ucode_data(mc, ucode_ptr, mc_size) || + memcpy(mc, &mc_header, sizeof(mc_header)); + data = mc + sizeof(mc_header); + if (!copy_from_iter_full(data, data_size, iter) || microcode_sanity_check(mc, 1) < 0) { break; } @@ -914,14 +918,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, mc = NULL; /* trigger new vmalloc */ ret = UCODE_NEW; } - - ucode_ptr += mc_size; - leftover -= mc_size; } vfree(mc); - if (leftover) { + if (iov_iter_count(iter)) { vfree(new_mc); return UCODE_ERROR; } @@ -945,12 +946,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, return ret; } -static int get_ucode_fw(void *to, const void *from, size_t n) -{ - memcpy(to, from, n); - return 0; -} - static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -977,10 +972,12 @@ static bool is_blacklisted(unsigned int cpu) static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { - char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; + struct iov_iter iter; enum ucode_state ret; + struct kvec kvec; + char name[30]; if (is_blacklisted(cpu)) return UCODE_NFOUND; @@ -993,26 +990,30 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device, return UCODE_NFOUND; } - ret = generic_load_microcode(cpu, (void *)firmware->data, - firmware->size, &get_ucode_fw); + kvec.iov_base = (void *)firmware->data; + kvec.iov_len = firmware->size; + iov_iter_kvec(&iter, WRITE, &kvec, 1, firmware->size); + ret = generic_load_microcode(cpu, &iter); release_firmware(firmware); return ret; } -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + struct iov_iter iter; + struct iovec iov; + if (is_blacklisted(cpu)) return UCODE_NFOUND; - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); + iov.iov_base = (void __user *)buf; + iov.iov_len = size; + iov_iter_init(&iter, WRITE, &iov, 1, size); + + return generic_load_microcode(cpu, &iter); } static struct microcode_ops microcode_intel_ops = { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 2c8522a39ed5..cb2e49810d68 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -35,11 +35,11 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) "fpu_exception\t: %s\n" "cpuid level\t: %d\n" "wp\t\t: yes\n", - static_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", - static_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", - static_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_FDIV) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_F00F) ? "yes" : "no", + boot_cpu_has_bug(X86_BUG_COMA) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? "yes" : "no", + boot_cpu_has(X86_FEATURE_FPU) ? 
"yes" : "no", c->cpuid_level); } #else diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 2dbd990a2eb7..89320c0396b1 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -342,10 +342,10 @@ int update_domains(struct rdt_resource *r, int closid) if (cpumask_empty(cpu_mask) || mba_sc) goto done; cpu = get_cpu(); - /* Update CBM on this cpu if it's in cpu_mask. */ + /* Update resource control msr on this CPU if it's in cpu_mask. */ if (cpumask_test_cpu(cpu, cpu_mask)) rdt_ctrl_update(&msr_param); - /* Update CBM on other cpus. */ + /* Update resource control msr on other CPUs. */ smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); put_cpu(); diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index 399601eda8e4..333c177a2471 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -2039,14 +2039,14 @@ out: enum rdt_param { Opt_cdp, Opt_cdpl2, - Opt_mba_mpbs, + Opt_mba_mbps, nr__rdt_params }; static const struct fs_parameter_spec rdt_param_specs[] = { fsparam_flag("cdp", Opt_cdp), fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_mpbs", Opt_mba_mpbs), + fsparam_flag("mba_MBps", Opt_mba_mbps), {} }; @@ -2072,7 +2072,7 @@ static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_cdpl2: ctx->enable_cdpl2 = true; return 0; - case Opt_mba_mpbs: + case Opt_mba_mbps: if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) return -EINVAL; ctx->enable_mba_mbps = true; @@ -2516,103 +2516,131 @@ static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r) bitmap_clear(val, zero_bit, cbm_len - zero_bit); } -/** - * rdtgroup_init_alloc - Initialize the new RDT group's allocations - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. That is, all shareable and unused bits. +/* + * Initialize cache resources per RDT domain * - * All-zero CBM is invalid. If there are no more shareable bits available - * on any domain then the entire allocation will fail. + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. */ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +static int __init_one_rdt_domain(struct rdt_domain *d, struct rdt_resource *r, + u32 closid) { struct rdt_resource *r_cdp = NULL; struct rdt_domain *d_cdp = NULL; u32 used_b = 0, unused_b = 0; - u32 closid = rdtgrp->closid; - struct rdt_resource *r; unsigned long tmp_cbm; enum rdtgrp_mode mode; - struct rdt_domain *d; u32 peer_ctl, *ctrl; - int i, ret; + int i; - for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; - list_for_each_entry(d, &r->domains, list) { - rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); - d->have_new_ctrl = false; - d->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - ctrl = d->ctrl_val; - for (i = 0; i < closids_supported(); i++, ctrl++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - break; - /* - * If CDP is active include peer - * domain's usage to ensure there - * is no overlap with an exclusive - * group. 
- */ - if (d_cdp) - peer_ctl = d_cdp->ctrl_val[i]; - else - peer_ctl = 0; - used_b |= *ctrl | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - d->new_ctrl |= *ctrl | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - d->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cbm_ensure_valid(&d->new_ctrl, r); + rdt_cdp_peer_get(r, d, &r_cdp, &d_cdp); + d->have_new_ctrl = false; + d->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + ctrl = d->ctrl_val; + for (i = 0; i < closids_supported(); i++, ctrl++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + break; /* - * Assign the u32 CBM to an unsigned long to ensure - * that bitmap_weight() does not access out-of-bound - * memory. + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. */ - tmp_cbm = d->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < - r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", - r->name, d->id); - return -ENOSPC; - } - d->have_new_ctrl = true; + if (d_cdp) + peer_ctl = d_cdp->ctrl_val[i]; + else + peer_ctl = 0; + used_b |= *ctrl | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + d->new_ctrl |= *ctrl | peer_ctl; } } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + d->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cbm_ensure_valid(&d->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = d->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", r->name, d->id); + return -ENOSPC; + } + d->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct rdt_resource *r, u32 closid) +{ + struct rdt_domain *d; + int ret; + + list_for_each_entry(d, &r->domains, list) { + ret = __init_one_rdt_domain(d, r, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + d->new_ctrl = is_mba_sc(r) ? MBA_MAX_MBPS : r->default_ctrl; + d->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. 
*/ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct rdt_resource *r; + int ret; for_each_alloc_enabled_rdt_resource(r) { - /* - * Only initialize default allocations for CBM cache - * resources - */ - if (r->rid == RDT_RESOURCE_MBA) - continue; + if (r->rid == RDT_RESOURCE_MBA) { + rdtgroup_init_mba(r); + } else { + ret = rdtgroup_init_cat(r, rdtgrp->closid); + if (ret < 0) + return ret; + } + ret = update_domains(r, rdtgrp->closid); if (ret < 0) { rdt_last_cmd_puts("Failed to initialize allocations\n"); return ret; } - rdtgrp->mode = RDT_MODE_SHAREABLE; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + return 0; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 17ffc869cab8..a96ca8584803 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -204,8 +204,7 @@ static struct crash_mem *fill_up_crash_elf_data(void) * another range split. So add extra two slots here. */ nr_ranges += 2; - cmem = vzalloc(sizeof(struct crash_mem) + - sizeof(struct crash_mem_range) * nr_ranges); + cmem = vzalloc(struct_size(cmem, ranges, nr_ranges)); if (!cmem) return NULL; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index cd53f3030e40..64a59d726639 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -34,14 +34,14 @@ const char *stack_type_name(enum stack_type type) static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -59,14 +59,14 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack_ptr); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. */ - if (stack <= begin || stack > end) + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5cdb9e84da57..753b8cfe8b8a 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,23 +16,21 @@ #include <linux/bug.h> #include <linux/nmi.h> +#include <asm/cpu_entry_area.h> #include <asm/stacktrace.h> -static char *exception_stack_names[N_EXCEPTION_STACKS] = { - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ NMI_STACK-1 ] = "NMI", - [ DEBUG_STACK-1 ] = "#DB", - [ MCE_STACK-1 ] = "#MC", -}; - -static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ +static const char * const exception_stack_names[] = { + [ ESTACK_DF ] = "#DF", + [ ESTACK_NMI ] = "NMI", + [ ESTACK_DB2 ] = "#DB2", + [ ESTACK_DB1 ] = "#DB1", + [ ESTACK_DB ] = "#DB", + [ ESTACK_MCE ] = "#MC", }; const char *stack_type_name(enum stack_type type) { - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); if (type == STACK_TYPE_IRQ) return "IRQ"; @@ -52,43 +50,84 @@ const char *stack_type_name(enum stack_type type) return NULL; } +/** + * struct estack_pages - Page descriptor for exception stacks + * @offs: Offset from the start of the exception stack area + * @size: Size of the exception stack + * @type: Type to store in the stack_info struct + */ +struct estack_pages { + u32 offs; + u16 size; + u16 type; +}; + +#define EPAGERANGE(st) \ + [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ + PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ + .offs = CEA_ESTACK_OFFS(st), \ + .size = CEA_ESTACK_SIZE(st), \ + .type = STACK_TYPE_EXCEPTION + ESTACK_ ##st, } + +/* + * Array of exception stack page descriptors. If the stack is larger than + * PAGE_SIZE, all pages covering a particular stack will have the same + * info. The guard pages including the not mapped DB2 stack are zeroed + * out. + */ +static const +struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = { + EPAGERANGE(DF), + EPAGERANGE(NMI), + EPAGERANGE(DB1), + EPAGERANGE(DB), + EPAGERANGE(MCE), +}; + static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *begin, *end; + unsigned long begin, end, stk = (unsigned long)stack; + const struct estack_pages *ep; struct pt_regs *regs; - unsigned k; + unsigned int k; - BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - (exception_stack_sizes[k] / sizeof(long)); - regs = (struct pt_regs *)end - 1; - - if (stack <= begin || stack >= end) - continue; + begin = (unsigned long)__this_cpu_read(cea_exception_stacks); + end = begin + sizeof(struct cea_exception_stacks); + /* Bail if @stack is outside the exception stack area. */ + if (stk < begin || stk >= end) + return false; - info->type = STACK_TYPE_EXCEPTION + k; - info->begin = begin; - info->end = end; - info->next_sp = (unsigned long *)regs->sp; + /* Calc page offset from start of exception stacks */ + k = (stk - begin) >> PAGE_SHIFT; + /* Lookup the page descriptor */ + ep = &estack_pages[k]; + /* Guard page? */ + if (!ep->size) + return false; - return true; - } + begin += (unsigned long)ep->offs; + end = begin + (unsigned long)ep->size; + regs = (struct pt_regs *)end - 1; - return false; + info->type = ep->type; + info->begin = (unsigned long *)begin; + info->end = (unsigned long *)end; + info->next_sp = (unsigned long *)regs->sp; + return true; } static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); /* * This is a software stack, so 'end' can be a valid stack pointer. * It just means the stack is empty. 
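
A standalone sketch of the page-granular lookup that in_exception_stack() now performs above; the offsets, sizes, and type values here are made up, standing in for CEA_ESTACK_OFFS()/CEA_ESTACK_SIZE() and the ESTACK_* constants:

    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12

    struct estack_page { uint32_t offs; uint16_t size; uint16_t type; };

    /* Hypothetical layout: #DF page, guard page, NMI page. */
    static const struct estack_page pages[] = {
        { 0x0000, 0x1000, 1 },  /* #DF */
        { 0x0000, 0x0000, 0 },  /* guard: size 0 */
        { 0x2000, 0x1000, 2 },  /* NMI */
    };

    /* @begin is the base of the stack area; caller has range-checked @stk. */
    static const struct estack_page *estack_lookup(unsigned long begin,
                                                   unsigned long stk)
    {
        size_t k = (stk - begin) >> PAGE_SHIFT;  /* O(1), no loop over stacks */
        const struct estack_page *ep = &pages[k];

        return ep->size ? ep : NULL;  /* NULL means a guard page was hit */
    }
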
*/ - if (stack <= begin || stack > end) + if (stack < begin || stack >= end) return false; info->type = STACK_TYPE_IRQ; diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 2e5003fef51a..ce243f76bdb7 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -101,24 +101,21 @@ static void __kernel_fpu_begin(void) kernel_fpu_disable(); - if (fpu->initialized) { - /* - * Ignore return value -- we don't care if reg state - * is clobbered. - */ - copy_fpregs_to_fpstate(fpu); - } else { - __cpu_invalidate_fpregs_state(); + if (current->mm) { + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { + set_thread_flag(TIF_NEED_FPU_LOAD); + /* + * Ignore return value -- we don't care if reg state + * is clobbered. + */ + copy_fpregs_to_fpstate(fpu); + } } + __cpu_invalidate_fpregs_state(); } static void __kernel_fpu_end(void) { - struct fpu *fpu = ¤t->thread.fpu; - - if (fpu->initialized) - copy_kernel_to_fpregs(&fpu->state); - kernel_fpu_enable(); } @@ -145,15 +142,17 @@ void fpu__save(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - preempt_disable(); + fpregs_lock(); trace_x86_fpu_before_save(fpu); - if (fpu->initialized) { + + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } } + trace_x86_fpu_after_save(fpu); - preempt_enable(); + fpregs_unlock(); } EXPORT_SYMBOL_GPL(fpu__save); @@ -186,11 +185,14 @@ void fpstate_init(union fpregs_state *state) } EXPORT_SYMBOL_GPL(fpstate_init); -int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) +int fpu__copy(struct task_struct *dst, struct task_struct *src) { + struct fpu *dst_fpu = &dst->thread.fpu; + struct fpu *src_fpu = &src->thread.fpu; + dst_fpu->last_cpu = -1; - if (!src_fpu->initialized || !static_cpu_has(X86_FEATURE_FPU)) + if (!static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != ¤t->thread.fpu); @@ -202,16 +204,23 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* - * Save current FPU registers directly into the child - * FPU context, without any memory-to-memory copying. + * If the FPU registers are not current just memcpy() the state. + * Otherwise save current FPU registers directly into the child's FPU + * context, without any memory-to-memory copying. * * ( The function 'fails' in the FNSAVE case, which destroys - * register contents so we have to copy them back. ) + * register contents so we have to load them back. 
) */ - if (!copy_fpregs_to_fpstate(dst_fpu)) { - memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - copy_kernel_to_fpregs(&src_fpu->state); - } + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); + + else if (!copy_fpregs_to_fpstate(dst_fpu)) + copy_kernel_to_fpregs(&dst_fpu->state); + + fpregs_unlock(); + + set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); @@ -223,20 +232,14 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Activate the current task's in-memory FPU context, * if it has not been used before: */ -void fpu__initialize(struct fpu *fpu) +static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != ¤t->thread.fpu); - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for the current task: */ - fpu->initialized = 1; - } + set_thread_flag(TIF_NEED_FPU_LOAD); + fpstate_init(&fpu->state); + trace_x86_fpu_init_state(fpu); } -EXPORT_SYMBOL_GPL(fpu__initialize); /* * This function must be called before we read a task's fpstate. @@ -248,32 +251,20 @@ EXPORT_SYMBOL_GPL(fpu__initialize); * * - or it's called for stopped tasks (ptrace), in which case the * registers were already saved by the context-switch code when - * the task scheduled out - we only have to initialize the registers - * if they've never been initialized. + * the task scheduled out. * * If the task has used the FPU before then save it. */ void fpu__prepare_read(struct fpu *fpu) { - if (fpu == ¤t->thread.fpu) { + if (fpu == ¤t->thread.fpu) fpu__save(fpu); - } else { - if (!fpu->initialized) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for current and for stopped child tasks: */ - fpu->initialized = 1; - } - } } /* * This function must be called before we write a task's fpstate. * - * If the task has used the FPU before then invalidate any cached FPU registers. - * If the task has not used the FPU before then initialize its fpstate. + * Invalidate any cached FPU registers. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will @@ -290,44 +281,11 @@ void fpu__prepare_write(struct fpu *fpu) */ WARN_ON_FPU(fpu == ¤t->thread.fpu); - if (fpu->initialized) { - /* Invalidate any cached state: */ - __fpu_invalidate_fpregs_state(fpu); - } else { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - - trace_x86_fpu_activate_state(fpu); - /* Safe to do for stopped child tasks: */ - fpu->initialized = 1; - } + /* Invalidate any cached state: */ + __fpu_invalidate_fpregs_state(fpu); } /* - * 'fpu__restore()' is called to copy FPU registers from - * the FPU fpstate to the live hw registers and to activate - * access to the hardware registers, so that FPU instructions - * can be used afterwards. - * - * Must be called with kernel preemption disabled (for example - * with local interrupts disabled, as it is in the case of - * do_device_not_available()). 
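
The hunks above and below replace the eager model (save on every kernel_fpu_begin(), restore in kernel_fpu_end() and fpu__restore()) with a lazy one: user state is saved at most once per kernel-FPU window and reloaded only on the way back to user space. A self-contained toy model of that protocol (all names and the 512-byte buffers are stand-ins, not kernel APIs):

    #include <stdbool.h>
    #include <string.h>

    struct toy_task {
        bool need_fpu_load;  /* models TIF_NEED_FPU_LOAD */
        char fpstate[512];   /* models fpu->state */
    };

    static char hw_fpregs[512];  /* models the live FPU register file */

    /* Models __kernel_fpu_begin(): save user state once per kernel window. */
    static void toy_kernel_fpu_begin(struct toy_task *cur)
    {
        if (!cur->need_fpu_load) {
            cur->need_fpu_load = true;
            memcpy(cur->fpstate, hw_fpregs, sizeof(hw_fpregs));
        }
        /* hw_fpregs are now kernel scratch; note that _end() restores nothing */
    }

    /* Models switch_fpu_return(): reload lazily before returning to user. */
    static void toy_switch_fpu_return(struct toy_task *cur)
    {
        if (cur->need_fpu_load) {
            memcpy(hw_fpregs, cur->fpstate, sizeof(cur->fpstate));
            cur->need_fpu_load = false;
        }
    }
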
- */ -void fpu__restore(struct fpu *fpu) -{ - fpu__initialize(fpu); - - /* Avoid __kernel_fpu_begin() right after fpregs_activate() */ - kernel_fpu_disable(); - trace_x86_fpu_before_restore(fpu); - fpregs_activate(fpu); - copy_kernel_to_fpregs(&fpu->state); - trace_x86_fpu_after_restore(fpu); - kernel_fpu_enable(); -} -EXPORT_SYMBOL_GPL(fpu__restore); - -/* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. @@ -341,17 +299,13 @@ void fpu__drop(struct fpu *fpu) preempt_disable(); if (fpu == ¤t->thread.fpu) { - if (fpu->initialized) { - /* Ignore delayed exceptions from user space */ - asm volatile("1: fwait\n" - "2:\n" - _ASM_EXTABLE(1b, 2b)); - fpregs_deactivate(fpu); - } + /* Ignore delayed exceptions from user space */ + asm volatile("1: fwait\n" + "2:\n" + _ASM_EXTABLE(1b, 2b)); + fpregs_deactivate(fpu); } - fpu->initialized = 0; - trace_x86_fpu_dropped(fpu); preempt_enable(); @@ -363,6 +317,8 @@ void fpu__drop(struct fpu *fpu) */ static inline void copy_init_fpstate_to_fpregs(void) { + fpregs_lock(); + if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, -1); else if (static_cpu_has(X86_FEATURE_FXSR)) @@ -372,6 +328,9 @@ static inline void copy_init_fpstate_to_fpregs(void) if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); + + fpregs_mark_activate(); + fpregs_unlock(); } /* @@ -389,16 +348,52 @@ void fpu__clear(struct fpu *fpu) /* * Make sure fpstate is cleared and initialized. */ - if (static_cpu_has(X86_FEATURE_FPU)) { - preempt_disable(); - fpu__initialize(fpu); - user_fpu_begin(); + fpu__initialize(fpu); + if (static_cpu_has(X86_FEATURE_FPU)) copy_init_fpstate_to_fpregs(); - preempt_enable(); - } } /* + * Load FPU context before returning to userspace. + */ +void switch_fpu_return(void) +{ + if (!static_cpu_has(X86_FEATURE_FPU)) + return; + + __fpregs_load_activate(); +} +EXPORT_SYMBOL_GPL(switch_fpu_return); + +#ifdef CONFIG_X86_DEBUG_FPU +/* + * If current FPU state according to its tracking (loaded FPU context on this + * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is + * loaded on return to userland. + */ +void fpregs_assert_state_consistent(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + return; + + WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); +} +EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); +#endif + +void fpregs_mark_activate(void) +{ + struct fpu *fpu = ¤t->thread.fpu; + + fpregs_activate(fpu); + fpu->last_cpu = smp_processor_id(); + clear_thread_flag(TIF_NEED_FPU_LOAD); +} +EXPORT_SYMBOL_GPL(fpregs_mark_activate); + +/* * x87 math exception handling: */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 6abd83572b01..20d8fa7124c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -239,8 +239,6 @@ static void __init fpu__init_system_ctx_switch(void) WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; - - WARN_ON_FPU(current->thread.fpu.initialized); } /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index bc02f5144b95..d652b939ccfb 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -15,16 +15,12 @@ */ int regset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - return target_fpu->initialized ? 
regset->n : 0; + return regset->n; } int regset_xregset_fpregs_active(struct task_struct *target, const struct user_regset *regset) { - struct fpu *target_fpu = &target->thread.fpu; - - if (boot_cpu_has(X86_FEATURE_FXSR) && target_fpu->initialized) + if (boot_cpu_has(X86_FEATURE_FXSR)) return regset->n; else return 0; @@ -269,11 +265,10 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) memcpy(&to[i], &from[i], sizeof(to[0])); } -void convert_to_fxsr(struct task_struct *tsk, +void convert_to_fxsr(struct fxregs_state *fxsave, const struct user_i387_ia32_struct *env) { - struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -350,7 +345,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &env, 0, -1); if (!ret) - convert_to_fxsr(target, &env); + convert_to_fxsr(&target->thread.fpu.state.fxsave, &env); /* * update the header bit in the xsave header, indicating the @@ -371,16 +366,9 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, int dump_fpu(struct pt_regs *regs, struct user_i387_struct *ufpu) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; - int fpvalid; - - fpvalid = fpu->initialized; - if (fpvalid) - fpvalid = !fpregs_get(tsk, NULL, - 0, sizeof(struct user_i387_ia32_struct), - ufpu, NULL); - return fpvalid; + return !fpregs_get(tsk, NULL, 0, sizeof(struct user_i387_ia32_struct), + ufpu, NULL); } EXPORT_SYMBOL(dump_fpu); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index f6a1d299627c..5a8d118bc423 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -92,13 +92,13 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) return err; err |= __put_user(FP_XSTATE_MAGIC2, - (__u32 *)(buf + fpu_user_xstate_size)); + (__u32 __user *)(buf + fpu_user_xstate_size)); /* * Read the xfeatures which we copied (directly from the cpu or * from the state in task struct) to the user buffers. */ - err |= __get_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); /* * For legacy compatible, we always set FP/SSE bits in the bit @@ -113,7 +113,7 @@ static inline int save_xstate_epilog(void __user *buf, int ia32_frame) */ xfeatures |= XFEATURE_MASK_FPSSE; - err |= __put_user(xfeatures, (__u32 *)&x->header.xfeatures); + err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); return err; } @@ -144,9 +144,10 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) * buf == buf_fx for 64-bit frames and 32-bit fsave frame. * buf != buf_fx for 32-bit frames with fxstate. * - * If the fpu, extended register state is live, save the state directly - * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, - * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * Try to save it directly to the user frame with disabled page fault handler. + * If this fails then do the slow path where the FPU state is first saved to + * task's fpu->state and then copy it to the user frame pointed to by the + * aligned pointer 'buf_fx'. * * If this is a 32-bit frame with fxstate, put a fsave header before * the aligned state at 'buf_fx'. 
@@ -156,10 +157,9 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf) */ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) { - struct fpu *fpu = ¤t->thread.fpu; - struct xregs_state *xsave = &fpu->state.xsave; struct task_struct *tsk = current; int ia32_fxstate = (buf != buf_fx); + int ret; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -172,28 +172,34 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) sizeof(struct user_i387_ia32_struct), NULL, (struct _fpstate_32 __user *) buf) ? -1 : 1; - if (fpu->initialized || using_compacted_format()) { - /* Save the live register state to the user directly. */ - if (copy_fpregs_to_sigframe(buf_fx)) - return -1; - /* Update the thread's fxstate to save the fsave header. */ - if (ia32_fxstate) - copy_fxregs_to_kernel(fpu); - } else { - /* - * It is a *bug* if kernel uses compacted-format for xsave - * area and we copy it out directly to a signal frame. It - * should have been handled above by saving the registers - * directly. - */ - if (boot_cpu_has(X86_FEATURE_XSAVES)) { - WARN_ONCE(1, "x86/fpu: saving compacted-format xsave area to a signal frame!\n"); - return -1; - } - - fpstate_sanitize_xstate(fpu); - if (__copy_to_user(buf_fx, xsave, fpu_user_xstate_size)) - return -1; +retry: + /* + * Load the FPU registers if they are not valid for the current task. + * With a valid FPU state we can attempt to save the state directly to + * userland's stack frame which will likely succeed. If it does not, + * resolve the fault in the user memory and try again. + */ + fpregs_lock(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + __fpregs_load_activate(); + + pagefault_disable(); + ret = copy_fpregs_to_sigframe(buf_fx); + pagefault_enable(); + fpregs_unlock(); + + if (ret) { + int aligned_size; + int nr_pages; + + aligned_size = offset_in_page(buf_fx) + fpu_user_xstate_size; + nr_pages = DIV_ROUND_UP(aligned_size, PAGE_SIZE); + + ret = get_user_pages_unlocked((unsigned long)buf_fx, nr_pages, + NULL, FOLL_WRITE); + if (ret == nr_pages) + goto retry; + return -EFAULT; } /* Save the fsave header for the 32-bit frames. */ @@ -207,11 +213,11 @@ int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size) } static inline void -sanitize_restored_xstate(struct task_struct *tsk, +sanitize_restored_xstate(union fpregs_state *state, struct user_i387_ia32_struct *ia32_env, u64 xfeatures, int fx_only) { - struct xregs_state *xsave = &tsk->thread.fpu.state.xsave; + struct xregs_state *xsave = &state->xsave; struct xstate_header *header = &xsave->header; if (use_xsave()) { @@ -238,17 +244,18 @@ sanitize_restored_xstate(struct task_struct *tsk, */ xsave->i387.mxcsr &= mxcsr_feature_mask; - convert_to_fxsr(tsk, ia32_env); + if (ia32_env) + convert_to_fxsr(&state->fxsave, ia32_env); } } /* * Restore the extended state if present. Otherwise, restore the FP/SSE state. 
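
In the restore path below, "zeroing" means any component enabled in XCR0 but absent from the frame's xfeatures is reinitialized from init_fpstate rather than left stale. The mask arithmetic, worked through with example values:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t xfeatures_mask = 0x1f;  /* enabled in XCR0 (example value) */
        uint64_t xbv = 0x03;             /* FP|SSE recorded in the signal frame */
        uint64_t init_bv = xfeatures_mask & ~xbv;

        /* 0x1c: YMM/BNDREGS/BNDCSR (in this example) come from init state */
        printf("init_bv = %#llx\n", (unsigned long long)init_bv);
        return 0;
    }
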
*/ -static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) +static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only) { if (use_xsave()) { - if ((unsigned long)buf % 64 || fx_only) { + if (fx_only) { u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); return copy_user_to_fxregs(buf); @@ -266,12 +273,15 @@ static inline int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) { + struct user_i387_ia32_struct *envp = NULL; + int state_size = fpu_kernel_xstate_size; int ia32_fxstate = (buf != buf_fx); struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; - int state_size = fpu_kernel_xstate_size; + struct user_i387_ia32_struct env; u64 xfeatures = 0; int fx_only = 0; + int ret = 0; ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -284,8 +294,6 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) if (!access_ok(buf, size)) return -EACCES; - fpu__initialize(fpu); - if (!static_cpu_has(X86_FEATURE_FPU)) return fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), @@ -308,61 +316,101 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } } + /* + * The current state of the FPU registers does not matter. By setting + * TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate + * is not modified on context switch and that the xstate is considered + * to be loaded again on return to userland (overriding last_cpu avoids + * the optimisation). + */ + set_thread_flag(TIF_NEED_FPU_LOAD); + __fpu_invalidate_fpregs_state(fpu); + + if ((unsigned long)buf_fx % 64) + fx_only = 1; + /* + * For 32-bit frames with fxstate, copy the fxstate so it can be + * reconstructed later. + */ if (ia32_fxstate) { + ret = __copy_from_user(&env, buf, sizeof(env)); + if (ret) + goto err_out; + envp = &env; + } else { /* - * For 32-bit frames with fxstate, copy the user state to the - * thread's fpu state, reconstruct fxstate from the fsave - * header. Validate and sanitize the copied state. + * Attempt to restore the FPU registers directly from user + * memory. For that to succeed, the user access cannot cause + * page faults. If it does, fall back to the slow path below, + * going through the kernel buffer with the enabled pagefault + * handler. */ - struct user_i387_ia32_struct env; - int err = 0; + fpregs_lock(); + pagefault_disable(); + ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only); + pagefault_enable(); + if (!ret) { + fpregs_mark_activate(); + fpregs_unlock(); + return 0; + } + fpregs_unlock(); + } - /* - * Drop the current fpu which clears fpu->initialized. This ensures - * that any context-switch during the copy of the new state, - * avoids the intermediate state from getting restored/saved. - * Thus avoiding the new restored state from getting corrupted. - * We will be ready to restore/save the state only after - * fpu->initialized is again set. 
- */ - fpu__drop(fpu); + + if (use_xsave() && !fx_only) { + u64 init_bv = xfeatures_mask & ~xfeatures; if (using_compacted_format()) { - err = copy_user_to_xstate(&fpu->state.xsave, buf_fx); + ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx); } else { - err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); - if (!err && state_size > offsetof(struct xregs_state, header)) - err = validate_xstate_header(&fpu->state.xsave.header); + if (!ret && state_size > offsetof(struct xregs_state, header)) + ret = validate_xstate_header(&fpu->state.xsave.header); } + if (ret) + goto err_out; - if (err || __copy_from_user(&env, buf, sizeof(env))) { - fpstate_init(&fpu->state); - trace_x86_fpu_init_state(fpu); - err = -1; - } else { - sanitize_restored_xstate(tsk, &env, xfeatures, fx_only); + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); + + fpregs_lock(); + if (unlikely(init_bv)) + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); + ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures); + + } else if (use_fxsr()) { + ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size); + if (ret) { + ret = -EFAULT; + goto err_out; } - local_bh_disable(); - fpu->initialized = 1; - fpu__restore(fpu); - local_bh_enable(); + sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only); - return err; - } else { - /* - * For 64-bit frames and 32-bit fsave frames, restore the user - * state to the registers directly (with exceptions handled). - */ - user_fpu_begin(); - if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { - fpu__clear(fpu); - return -1; + fpregs_lock(); + if (use_xsave()) { + u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE; + copy_kernel_to_xregs(&init_fpstate.xsave, init_bv); } + + ret = copy_kernel_to_fxregs_err(&fpu->state.fxsave); + } else { + ret = __copy_from_user(&fpu->state.fsave, buf_fx, state_size); + if (ret) + goto err_out; + + fpregs_lock(); + ret = copy_kernel_to_fregs_err(&fpu->state.fsave); } + if (!ret) + fpregs_mark_activate(); + fpregs_unlock(); - return 0; +err_out: + if (ret) + fpu__clear(fpu); + return ret; } static inline int xstate_sigframe_size(void) diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index d7432c2b1051..9c459fd1d38e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -805,20 +805,18 @@ void fpu__resume_cpu(void) } /* - * Given an xstate feature mask, calculate where in the xsave + * Given an xstate feature nr, calculate where in the xsave * buffer the state is. Callers should ensure that the buffer * is valid. */ -static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask) +static void *__raw_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { - int feature_nr = fls64(xstate_feature_mask) - 1; - - if (!xfeature_enabled(feature_nr)) { + if (!xfeature_enabled(xfeature_nr)) { WARN_ON_FPU(1); return NULL; } - return (void *)xsave + xstate_comp_offsets[feature_nr]; + return (void *)xsave + xstate_comp_offsets[xfeature_nr]; } /* * Given the xsave area and a state inside, this function returns the @@ -832,13 +830,13 @@ static void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask * * Inputs: * xstate: the thread's storage area for all FPU data - * xstate_feature: state which is defined in xsave.h (e.g. - * XFEATURE_MASK_FP, XFEATURE_MASK_SSE, etc...) + * xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) 
* Output: * address of the state in the xsave area, or NULL if the * field is not present in the xsave buffer. */ -void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) +void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr) { /* * Do we even *have* xsave state? @@ -851,11 +849,11 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) * have not enabled. Remember that pcntxt_mask is * what we write to the XCR0 register. */ - WARN_ONCE(!(xfeatures_mask & xstate_feature), + WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)), "get of unsupported state"); /* * This assumes the last 'xsave*' instruction to - * have requested that 'xstate_feature' be saved. + * have requested that 'xfeature_nr' be saved. * If it did not, we might be seeing an old value * of the field in the buffer. * @@ -864,10 +862,10 @@ * or because the "init optimization" caused it * to not be saved. */ - if (!(xsave->header.xfeatures & xstate_feature)) + if (!(xsave->header.xfeatures & BIT_ULL(xfeature_nr))) return NULL; - return __raw_xsave_addr(xsave, xstate_feature); + return __raw_xsave_addr(xsave, xfeature_nr); } EXPORT_SYMBOL_GPL(get_xsave_addr); @@ -882,25 +880,23 @@ EXPORT_SYMBOL_GPL(get_xsave_addr); * Note that this only works on the current task. * * Inputs: - * @xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP, - * XFEATURE_MASK_SSE, etc...) + * @xfeature_nr: state which is defined in xsave.h (e.g. XFEATURE_FP, + * XFEATURE_SSE, etc...) * Output: * address of the state in the xsave area or NULL if the state * is not present or is in its 'init state'. */ -const void *get_xsave_field_ptr(int xsave_state) +const void *get_xsave_field_ptr(int xfeature_nr) { struct fpu *fpu = &current->thread.fpu; - if (!fpu->initialized) - return NULL; /* * fpu__save() takes the CPU's xstate registers * and saves them off to the 'fpu memory buffer'.
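/*
 * Editor's sketch, not part of this patch: with the API change above,
 * callers pass a feature number instead of a mask. The traps.c hunk later
 * in this patch makes exactly this conversion for BNDCSR; the wrapper
 * below is illustrative only.
 */
static const void *bndcsr_addr(struct xregs_state *xsave)
{
	/* old: get_xsave_addr(xsave, XFEATURE_MASK_BNDCSR); */
	return get_xsave_addr(xsave, XFEATURE_BNDCSR);
}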
*/ fpu__save(fpu); - return get_xsave_addr(&fpu->state.xsave, xsave_state); + return get_xsave_addr(&fpu->state.xsave, xfeature_nr); } #ifdef CONFIG_ARCH_HAS_PKEYS @@ -1016,7 +1012,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1102,7 +1098,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i * Copy only in-use xstates: */ if ((header.xfeatures >> i) & 1) { - void *src = __raw_xsave_addr(xsave, 1 << i); + void *src = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1159,7 +1155,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; @@ -1213,7 +1209,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf) u64 mask = ((u64)1 << i); if (hdr.xfeatures & mask) { - void *dst = __raw_xsave_addr(xsave, 1 << i); + void *dst = __raw_xsave_addr(xsave, i); offset = xstate_offsets[i]; size = xstate_sizes[i]; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ef49517f6bb2..0caf8122d680 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -678,12 +678,8 @@ static inline void *alloc_tramp(unsigned long size) { return module_alloc(size); } -static inline void tramp_free(void *tramp, int size) +static inline void tramp_free(void *tramp) { - int npages = PAGE_ALIGN(size) >> PAGE_SHIFT; - - set_memory_nx((unsigned long)tramp, npages); - set_memory_rw((unsigned long)tramp, npages); module_memfree(tramp); } #else @@ -692,7 +688,7 @@ static inline void *alloc_tramp(unsigned long size) { return NULL; } -static inline void tramp_free(void *tramp, int size) { } +static inline void tramp_free(void *tramp) { } #endif /* Defined as markers to the end of the ftrace default trampolines */ @@ -730,6 +726,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned long end_offset; unsigned long op_offset; unsigned long offset; + unsigned long npages; unsigned long size; unsigned long retq; unsigned long *ptr; @@ -762,6 +759,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) return 0; *tramp_size = size + RET_SIZE + sizeof(void *); + npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ ret = probe_kernel_read(trampoline, (void *)start_offset, size); @@ -806,9 +804,17 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; + set_vm_flush_reset_perms(trampoline); + + /* + * Module allocation needs to be completed by making the page + * executable. The page is still writable, which is a security hazard, + * but anyhow ftrace breaks W^X completely. 
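/*
 * Editor's sketch, not part of this patch: the allocate, write, then
 * make-executable sequence the trampoline code above follows. The size
 * handling mirrors create_trampoline(); fill_code() is a hypothetical
 * stand-in for copying the ftrace_caller text into place.
 */
static void fill_code(void *dst, int size);

static void *alloc_exec_trampoline(int size)
{
	void *tramp = module_alloc(size);

	if (!tramp)
		return NULL;
	fill_code(tramp, size);			/* write while still writable */
	set_vm_flush_reset_perms(tramp);	/* perms reset again on vfree() */
	set_memory_x((unsigned long)tramp, DIV_ROUND_UP(size, PAGE_SIZE));
	return tramp;
}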
+ */ + set_memory_x((unsigned long)trampoline, npages); return (unsigned long)trampoline; fail: - tramp_free(trampoline, *tramp_size); + tramp_free(trampoline); return 0; } @@ -939,7 +945,7 @@ void arch_ftrace_trampoline_free(struct ftrace_ops *ops) if (!ops || !(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP)) return; - tramp_free((void *)ops->trampoline, ops->trampoline_size); + tramp_free((void *)ops->trampoline); ops->trampoline = 0; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index d1dbe8e4eb82..bcd206c8ac90 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -265,7 +265,7 @@ ENDPROC(start_cpu0) GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) - .quad INIT_PER_CPU_VAR(irq_stack_union) + .quad INIT_PER_CPU_VAR(fixed_percpu_data) GLOBAL(initial_stack) /* * The SIZEOF_PTREGS gap is a convention which helps the in-kernel diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c index 01adea278a71..6d8917875f44 100644 --- a/arch/x86/kernel/idt.c +++ b/arch/x86/kernel/idt.c @@ -41,13 +41,12 @@ struct idt_data { #define SYSG(_vector, _addr) \ G(_vector, _addr, DEFAULT_STACK, GATE_INTERRUPT, DPL3, __KERNEL_CS) -/* Interrupt gate with interrupt stack */ +/* + * Interrupt gate with interrupt stack. The _ist index is the index in + * the tss.ist[] array, but for the descriptor it needs to start at 1. + */ #define ISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL0, __KERNEL_CS) - -/* System interrupt gate with interrupt stack */ -#define SISTG(_vector, _addr, _ist) \ - G(_vector, _addr, _ist, GATE_INTERRUPT, DPL3, __KERNEL_CS) + G(_vector, _addr, _ist + 1, GATE_INTERRUPT, DPL0, __KERNEL_CS) /* Task gate */ #define TSKG(_vector, _gdt) \ @@ -184,11 +183,11 @@ gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss; * cpu_init() when the TSS has been initialized. */ static const __initconst struct idt_data ist_idts[] = { - ISTG(X86_TRAP_DB, debug, DEBUG_STACK), - ISTG(X86_TRAP_NMI, nmi, NMI_STACK), - ISTG(X86_TRAP_DF, double_fault, DOUBLEFAULT_STACK), + ISTG(X86_TRAP_DB, debug, IST_INDEX_DB), + ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI), + ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF), #ifdef CONFIG_X86_MCE - ISTG(X86_TRAP_MC, &machine_check, MCE_STACK), + ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE), #endif }; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 95600a99ae93..fc34816c6f04 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -51,8 +51,8 @@ static inline int check_stack_overflow(void) { return 0; } static inline void print_stack_overflow(void) { } #endif -DEFINE_PER_CPU(struct irq_stack *, hardirq_stack); -DEFINE_PER_CPU(struct irq_stack *, softirq_stack); +DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); +DEFINE_PER_CPU(struct irq_stack *, softirq_stack_ptr); static void call_on_stack(void *func, void *stack) { @@ -76,7 +76,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) u32 *isp, *prev_esp, arg1; curstk = (struct irq_stack *) current_stack(); - irqstk = __this_cpu_read(hardirq_stack); + irqstk = __this_cpu_read(hardirq_stack_ptr); /* * this is where we switch to the IRQ stack. 
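/*
 * Editor's note, not part of this patch: tss.ist[] is 0-based while the
 * IST field of a gate descriptor is 1-based (0 means "no stack switch").
 * That is the +1 the reworked ISTG() macro above now applies, sketched
 * here as a helper for clarity:
 */
static inline unsigned int ist_descriptor_field(unsigned int tss_ist_index)
{
	return tss_ist_index + 1;	/* descriptor value 0 disables IST */
}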
However, if we are @@ -107,27 +107,28 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) } /* - * allocate per-cpu stacks for hardirq and for softirq processing + * Allocate per-cpu stacks for hardirq and softirq processing */ -void irq_ctx_init(int cpu) +int irq_init_percpu_irqstack(unsigned int cpu) { - struct irq_stack *irqstk; - - if (per_cpu(hardirq_stack, cpu)) - return; + int node = cpu_to_node(cpu); + struct page *ph, *ps; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(hardirq_stack, cpu) = irqstk; + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; - irqstk = page_address(alloc_pages_node(cpu_to_node(cpu), - THREADINFO_GFP, - THREAD_SIZE_ORDER)); - per_cpu(softirq_stack, cpu) = irqstk; + ph = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ph) + return -ENOMEM; + ps = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (!ps) { + __free_pages(ph, THREAD_SIZE_ORDER); + return -ENOMEM; + } - printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n", - cpu, per_cpu(hardirq_stack, cpu), per_cpu(softirq_stack, cpu)); + per_cpu(hardirq_stack_ptr, cpu) = page_address(ph); + per_cpu(softirq_stack_ptr, cpu) = page_address(ps); + return 0; } void do_softirq_own_stack(void) @@ -135,7 +136,7 @@ void do_softirq_own_stack(void) struct irq_stack *irqstk; u32 *isp, *prev_esp; - irqstk = __this_cpu_read(softirq_stack); + irqstk = __this_cpu_read(softirq_stack_ptr); /* build the stack frame on the softirq stack */ isp = (u32 *) ((char *)irqstk + sizeof(*irqstk)); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 0469cd078db1..6bf6517a05bb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,63 +18,64 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <linux/sched/task_stack.h> + +#include <asm/cpu_entry_area.h> #include <asm/io_apic.h> #include <asm/apic.h> -int sysctl_panic_on_stackoverflow; +DEFINE_PER_CPU_PAGE_ALIGNED(struct irq_stack, irq_stack_backing_store) __visible; +DECLARE_INIT_PER_CPU(irq_stack_backing_store); -/* - * Probabilistic stack overflow check: - * - * Only check the stack in process context, because everything else - * runs on the big interrupt stacks. Checking reliably is too expensive, - * so we just check from interrupts. 
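/*
 * Editor's sketch, not part of this patch: the unwind-on-partial-failure
 * pattern irq_init_percpu_irqstack() uses above for its two per-CPU
 * stacks; the first allocation is freed when the second one fails.
 */
static int alloc_stack_pair(int node, struct page **hard, struct page **soft)
{
	*hard = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
	if (!*hard)
		return -ENOMEM;
	*soft = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER);
	if (!*soft) {
		__free_pages(*hard, THREAD_SIZE_ORDER);	/* undo first alloc */
		return -ENOMEM;
	}
	return 0;
}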
- */ -static inline void stack_overflow_check(struct pt_regs *regs) +bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) { -#ifdef CONFIG_DEBUG_STACKOVERFLOW -#define STACK_TOP_MARGIN 128 - struct orig_ist *oist; - u64 irq_stack_top, irq_stack_bottom; - u64 estack_top, estack_bottom; - u64 curbase = (u64)task_stack_page(current); + if (IS_ERR_OR_NULL(desc)) + return false; - if (user_mode(regs)) - return; + generic_handle_irq_desc(desc); + return true; +} - if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && - regs->sp <= curbase + THREAD_SIZE) - return; +#ifdef CONFIG_VMAP_STACK +/* + * VMAP the backing store with guard pages + */ +static int map_irq_stack(unsigned int cpu) +{ + char *stack = (char *)per_cpu_ptr(&irq_stack_backing_store, cpu); + struct page *pages[IRQ_STACK_SIZE / PAGE_SIZE]; + void *va; + int i; - irq_stack_top = (u64)this_cpu_ptr(irq_stack_union.irq_stack) + - STACK_TOP_MARGIN; - irq_stack_bottom = (u64)__this_cpu_read(irq_stack_ptr); - if (regs->sp >= irq_stack_top && regs->sp <= irq_stack_bottom) - return; + for (i = 0; i < IRQ_STACK_SIZE / PAGE_SIZE; i++) { + phys_addr_t pa = per_cpu_ptr_to_phys(stack + (i << PAGE_SHIFT)); - oist = this_cpu_ptr(&orig_ist); - estack_top = (u64)oist->ist[0] - EXCEPTION_STKSZ + STACK_TOP_MARGIN; - estack_bottom = (u64)oist->ist[N_EXCEPTION_STACKS - 1]; - if (regs->sp >= estack_top && regs->sp <= estack_bottom) - return; + pages[i] = pfn_to_page(pa >> PAGE_SHIFT); + } - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", - current->comm, curbase, regs->sp, - irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom, (void *)regs->ip); + va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL); + if (!va) + return -ENOMEM; - if (sysctl_panic_on_stackoverflow) - panic("low stack detected by irq handler - check messages\n"); -#endif + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; } - -bool handle_irq(struct irq_desc *desc, struct pt_regs *regs) +#else +/* + * If VMAP stacks are disabled due to KASAN, just use the per cpu + * backing store without guard pages. 
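/*
 * Editor's sketch, not part of this patch: remapping already-allocated
 * pages a second time with vmap() so the new mapping gets unmapped guard
 * areas around it. VM_MAP is the conventional flag for such mappings
 * (the hunk above happens to pass GFP_KERNEL in the flags slot).
 */
static void *remap_with_guard_pages(struct page **pages, unsigned int nr)
{
	return vmap(pages, nr, VM_MAP, PAGE_KERNEL);	/* NULL on failure */
}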
+ */ +static int map_irq_stack(unsigned int cpu) { - stack_overflow_check(regs); + void *va = per_cpu_ptr(&irq_stack_backing_store, cpu); - if (IS_ERR_OR_NULL(desc)) - return false; + per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE; + return 0; +} +#endif - generic_handle_irq_desc(desc); - return true; +int irq_init_percpu_irqstack(unsigned int cpu) +{ + if (per_cpu(hardirq_stack_ptr, cpu)) + return 0; + return map_irq_stack(cpu); } diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index a0693b71cfc1..16919a9671fa 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -91,6 +91,8 @@ void __init init_IRQ(void) for (i = 0; i < nr_legacy_irqs(); i++) per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i); + BUG_ON(irq_init_percpu_irqstack(smp_processor_id())); + x86_init.irqs.intr_init(); } @@ -104,6 +106,4 @@ void __init native_init_IRQ(void) if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) setup_irq(2, &irq2); - - irq_ctx_init(smp_processor_id()); } diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f99bd26bd3f1..e631c358f7f4 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,7 +37,6 @@ static void bug_at(unsigned char *ip, int line) static void __ref __jump_label_transform(struct jump_entry *entry, enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), int init) { union jump_code_union jmp; @@ -50,9 +49,6 @@ static void __ref __jump_label_transform(struct jump_entry *entry, jmp.offset = jump_entry_target(entry) - (jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE); - if (early_boot_irqs_disabled) - poker = text_poke_early; - if (type == JUMP_LABEL_JMP) { if (init) { expect = default_nop; line = __LINE__; @@ -75,16 +71,19 @@ static void __ref __jump_label_transform(struct jump_entry *entry, bug_at((void *)jump_entry_code(entry), line); /* - * Make text_poke_bp() a default fallback poker. + * As long as only a single processor is running and the code is still + * not marked as RO, text_poke_early() can be used; Checking that + * system_state is SYSTEM_BOOTING guarantees it. It will be set to + * SYSTEM_SCHEDULING before other cores are awaken and before the + * code is write-protected. 
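/*
 * Editor's sketch, not part of this patch: the poker selection the
 * comment above describes, as a standalone helper. In the real jump
 * label code text_poke_bp()'s last argument is the address execution
 * resumes at; using the end of the patched instruction is one option.
 */
static void poke_text(void *addr, const void *code, size_t len, int init)
{
	if (init || system_state == SYSTEM_BOOTING)
		text_poke_early(addr, code, len);	/* single CPU, text still RW */
	else
		text_poke_bp(addr, code, len, addr + len);	/* live system */
}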
* * At the time the change is being done, just ignore whether we * are doing nop -> jump or jump -> nop transition, and assume * always nop being the 'currently valid' instruction - * */ - if (poker) { - (*poker)((void *)jump_entry_code(entry), code, - JUMP_LABEL_NOP_SIZE); + if (init || system_state == SYSTEM_BOOTING) { + text_poke_early((void *)jump_entry_code(entry), code, + JUMP_LABEL_NOP_SIZE); return; } @@ -96,7 +95,7 @@ void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type) { mutex_lock(&text_mutex); - __jump_label_transform(entry, type, NULL, 0); + __jump_label_transform(entry, type, 0); mutex_unlock(&text_mutex); } @@ -126,5 +125,5 @@ __init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, jlstate = JL_STATE_NO_UPDATE; } if (jlstate == JL_STATE_UPDATE) - __jump_label_transform(entry, type, text_poke_early, 1); + __jump_label_transform(entry, type, 1); } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 4ff6b4cdb941..13b13311b792 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -747,7 +747,6 @@ void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) { int err; - char opc[BREAK_INSTR_SIZE]; bpt->type = BP_BREAKPOINT; err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, @@ -759,18 +758,13 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) if (!err) return err; /* - * It is safe to call text_poke() because normal kernel execution + * It is safe to call text_poke_kgdb() because normal kernel execution * is stopped on all cores, so long as the text_mutex is not locked. */ if (mutex_is_locked(&text_mutex)) return -EBUSY; - text_poke((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, - BREAK_INSTR_SIZE); - err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); - if (err) - return err; - if (memcmp(opc, arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE)) - return -EINVAL; + text_poke_kgdb((void *)bpt->bpt_addr, arch_kgdb_ops.gdb_bpt_instr, + BREAK_INSTR_SIZE); bpt->type = BP_POKE_BREAKPOINT; return err; @@ -778,22 +772,17 @@ int kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) int kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) { - int err; - char opc[BREAK_INSTR_SIZE]; - if (bpt->type != BP_POKE_BREAKPOINT) goto knl_write; /* - * It is safe to call text_poke() because normal kernel execution + * It is safe to call text_poke_kgdb() because normal kernel execution * is stopped on all cores, so long as the text_mutex is not locked. */ if (mutex_is_locked(&text_mutex)) goto knl_write; - text_poke((void *)bpt->bpt_addr, bpt->saved_instr, BREAK_INSTR_SIZE); - err = probe_kernel_read(opc, (char *)bpt->bpt_addr, BREAK_INSTR_SIZE); - if (err || memcmp(opc, bpt->saved_instr, BREAK_INSTR_SIZE)) - goto knl_write; - return err; + text_poke_kgdb((void *)bpt->bpt_addr, bpt->saved_instr, + BREAK_INSTR_SIZE); + return 0; knl_write: return probe_kernel_write((char *)bpt->bpt_addr, diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index a034cb808e7e..cf52ee0d8711 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -431,8 +431,21 @@ void *alloc_insn_page(void) void *page; page = module_alloc(PAGE_SIZE); - if (page) - set_memory_ro((unsigned long)page & PAGE_MASK, 1); + if (!page) + return NULL; + + set_vm_flush_reset_perms(page); + /* + * First make the page read-only, and only then make it executable to + * prevent it from being W+X in between. 
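/*
 * Editor's sketch, not part of this patch: the ordering that avoids a
 * transient W+X mapping, as used by alloc_insn_page() below. Write
 * permission is revoked before execute permission is granted, so the
 * page is never writable and executable at the same time.
 */
static void seal_insn_page(void *page)
{
	set_memory_ro((unsigned long)page, 1);	/* W=0 first... */
	set_memory_x((unsigned long)page, 1);	/* ...then X=1, never W+X */
}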
+ */ + set_memory_ro((unsigned long)page, 1); + + /* + * TODO: Once additional kernel code protection mechanisms are set, ensure + * that the page was not maliciously altered and it is still zeroed. + */ + set_memory_x((unsigned long)page, 1); return page; } @@ -440,8 +453,6 @@ void *alloc_insn_page(void) /* Recover page to RW mode before releasing it */ void free_insn_page(void *page) { - set_memory_nx((unsigned long)page & PAGE_MASK, 1); - set_memory_rw((unsigned long)page & PAGE_MASK, 1); module_memfree(page); } @@ -569,6 +580,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) unsigned long *sara = stack_addr(regs); ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; @@ -715,6 +727,7 @@ NOKPROBE_SYMBOL(kprobe_int3_handler); * calls trampoline_handler() runs, which calls the kretprobe's handler. */ asm( + ".text\n" ".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" @@ -748,26 +761,48 @@ asm( NOKPROBE_SYMBOL(kretprobe_trampoline); STACK_FRAME_NON_STANDARD(kretprobe_trampoline); +static struct kprobe kretprobe_kprobe = { + .addr = (void *)kretprobe_trampoline, +}; + /* * Called from kretprobe_trampoline */ static __used void *trampoline_handler(struct pt_regs *regs) { + struct kprobe_ctlblk *kcb; struct kretprobe_instance *ri = NULL; struct hlist_head *head, empty_rp; struct hlist_node *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; + void *frame_pointer; + bool skipped = false; + + preempt_disable(); + + /* + * Set a dummy kprobe for avoiding kretprobe recursion. + * Since a kretprobe never runs in a kprobe handler, a kprobe must not + * be running at this point. + */ + kcb = get_kprobe_ctlblk(); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); + kcb->kprobe_status = KPROBE_HIT_ACTIVE; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; + /* On x86-64, we use pt_regs->sp for return address holder. */ + frame_pointer = &regs->sp; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); regs->gs = 0; + /* On x86-32, we use pt_regs->flags for return address holder. */ + frame_pointer = &regs->flags; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -789,8 +824,25 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + /* + * Return probes must be pushed on this hash list in correct + * order (same as return order) so that they can be popped + * correctly. However, if we find one pushed in incorrect + * order, that means we found a function which should not be + * probed, because the wrong-order entry was pushed on the + * path of processing another kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. 
Trying to fixup.\n"); + skipped = true; + continue; + } orig_ret_address = (unsigned long)ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); if (orig_ret_address != trampoline_address) /* @@ -808,14 +860,15 @@ static __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + if (ri->fp != frame_pointer) + continue; orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __this_cpu_write(current_kprobe, &ri->rp->kp); - get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); - __this_cpu_write(current_kprobe, NULL); + __this_cpu_write(current_kprobe, &kretprobe_kprobe); } recycle_rp_inst(ri, &empty_rp); @@ -831,6 +884,9 @@ static __used void *trampoline_handler(struct pt_regs *regs) kretprobe_hash_unlock(current, &flags); + __this_cpu_write(current_kprobe, NULL); + preempt_enable(); + hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); kfree(ri); diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5c93a65ee1e5..3f0cc828cc36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -67,7 +67,7 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); -static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); +DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible; static int has_steal_clock = 0; /* diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6135ae8ce036..b2463fcb20a8 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -113,7 +113,7 @@ static void do_sanity_check(struct mm_struct *mm, * tables. */ WARN_ON(!had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(!had_user_mapping); } else { /* @@ -121,7 +121,7 @@ static void do_sanity_check(struct mm_struct *mm, * Sync the pgd to the usermode tables. 
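/*
 * Editor's note, not part of this patch: boot_cpu_has() is a plain bitmap
 * test against boot_cpu_data, while static_cpu_has() patches the code via
 * alternatives and only pays off in hot paths. Slow paths like the LDT
 * code above were therefore switched to the simple form; the helper below
 * is illustrative only and mirrors the condition used in map_ldt_struct_to_user().
 */
static inline bool ldt_needs_usermode_copy(struct mm_struct *mm)
{
	return boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt;
}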
*/ WARN_ON(had_kernel_mapping); - if (static_cpu_has(X86_FEATURE_PTI)) + if (boot_cpu_has(X86_FEATURE_PTI)) WARN_ON(had_user_mapping); } } @@ -156,7 +156,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) k_pmd = pgd_to_pmd_walk(k_pgd, LDT_BASE_ADDR); u_pmd = pgd_to_pmd_walk(u_pgd, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pmd(u_pmd, *k_pmd); } @@ -181,7 +181,7 @@ static void map_ldt_struct_to_user(struct mm_struct *mm) { pgd_t *pgd = pgd_offset(mm, LDT_BASE_ADDR); - if (static_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) + if (boot_cpu_has(X86_FEATURE_PTI) && !mm->context.ldt) set_pgd(kernel_to_user_pgdp(pgd), *pgd); } @@ -208,7 +208,7 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) spinlock_t *ptl; int i, nr_pages; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return 0; /* @@ -271,7 +271,7 @@ static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) return; /* LDT map/unmap is only required for PTI */ - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); @@ -311,7 +311,7 @@ static void free_ldt_pgtables(struct mm_struct *mm) unsigned long start = LDT_BASE_ADDR; unsigned long end = LDT_END_ADDR; - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; tlb_gather_mmu(&tlb, mm, start, end); diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b052e883dd8c..cfa3106faee4 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -87,7 +87,7 @@ void *module_alloc(unsigned long size) p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR + get_module_load_offset(), MODULES_END, GFP_KERNEL, - PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE, + PAGE_KERNEL, 0, NUMA_NO_NODE, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { vfree(p); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 18bc9b51ac9b..3755d0310026 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -21,13 +21,14 @@ #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/export.h> +#include <linux/atomic.h> #include <linux/sched/clock.h> #if defined(CONFIG_EDAC) #include <linux/edac.h> #endif -#include <linux/atomic.h> +#include <asm/cpu_entry_area.h> #include <asm/traps.h> #include <asm/mach_traps.h> #include <asm/nmi.h> @@ -487,6 +488,23 @@ static DEFINE_PER_CPU(unsigned long, nmi_cr2); * switch back to the original IDT. */ static DEFINE_PER_CPU(int, update_debug_stack); + +static bool notrace is_debug_stack(unsigned long addr) +{ + struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks); + unsigned long top = CEA_ESTACK_TOP(cs, DB); + unsigned long bot = CEA_ESTACK_BOT(cs, DB1); + + if (__this_cpu_read(debug_stack_usage)) + return true; + /* + * Note, this covers the guard page between DB and DB1 as well to + * avoid two checks. But by all means @addr can never point into + * the guard page. 
+ */ + return addr >= bot && addr < top; +} +NOKPROBE_SYMBOL(is_debug_stack); #endif dotraplinkage notrace void diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c0e0101133f3..7bbaa6baf37f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -121,7 +121,7 @@ DEFINE_STATIC_KEY_TRUE(virt_spin_lock_key); void __init native_pv_lock_init(void) { - if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_disable(&virt_spin_lock_key); } diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index c06c4c16c6b6..07c30ee17425 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -59,18 +59,34 @@ static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { u64 perf_reg_value(struct pt_regs *regs, int idx) { + struct x86_perf_regs *perf_regs; + + if (idx >= PERF_REG_X86_XMM0 && idx < PERF_REG_X86_XMM_MAX) { + perf_regs = container_of(regs, struct x86_perf_regs, regs); + if (!perf_regs->xmm_regs) + return 0; + return perf_regs->xmm_regs[idx - PERF_REG_X86_XMM0]; + } + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) return 0; return regs_get_register(regs, pt_regs_offset[idx]); } -#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) - #ifdef CONFIG_X86_32 +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) + int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; @@ -96,10 +112,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, int perf_reg_validate(u64 mask) { - if (!mask || mask & REG_RESERVED) - return -EINVAL; - - if (mask & REG_NOSUPPORT) + if (!mask || (mask & REG_NOSUPPORT)) return -EINVAL; return 0; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 58ac7be52c7a..75fea0d48c0e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -101,7 +101,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.vm86 = NULL; #endif - return fpu__copy(&dst->thread.fpu, &src->thread.fpu); + return fpu__copy(dst, src); } /* @@ -236,7 +236,7 @@ static int get_cpuid_mode(void) static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled) { - if (!static_cpu_has(X86_FEATURE_CPUID_FAULT)) + if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT)) return -ENODEV; if (cpuid_enabled) @@ -426,6 +426,8 @@ static __always_inline void __speculation_ctrl_update(unsigned long tifp, u64 msr = x86_spec_ctrl_base; bool updmsr = false; + lockdep_assert_irqs_disabled(); + /* * If TIF_SSBD is different, select the proper mitigation * method. Note that if SSBD mitigation is disabled or permanentely @@ -477,10 +479,12 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) void speculation_ctrl_update(unsigned long tif) { + unsigned long flags; + /* Forced update. 
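/*
 * Editor's sketch, not part of this patch: __speculation_ctrl_update()
 * now asserts interrupts are disabled, so its forced-update caller below
 * switched from preempt_disable() to local_irq_save(). The caller-side
 * pattern, isolated:
 */
static void forced_speculation_update(unsigned long tif)
{
	unsigned long flags;

	local_irq_save(flags);	/* satisfies lockdep_assert_irqs_disabled() */
	__speculation_ctrl_update(~tif, tif);
	local_irq_restore(flags);
}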
Make sure all relevant TIF flags are different */ - preempt_disable(); + local_irq_save(flags); __speculation_ctrl_update(~tif, tif); - preempt_enable(); + local_irq_restore(flags); } /* Called from seccomp/prctl update */ @@ -666,7 +670,7 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) if (c->x86_vendor != X86_VENDOR_INTEL) return 0; - if (!cpu_has(c, X86_FEATURE_MWAIT) || static_cpu_has_bug(X86_BUG_MONITOR)) + if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR)) return 0; return 1; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e471d8e6f0b2..2399e910d109 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -127,6 +127,13 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, struct task_struct *tsk; int err; + /* + * For a new task use the RESET flags value since there is no before. + * All the status flags are zero; DF and all the system flags must also + * be 0, specifically IF must be 0 because we context switch to the new + * task with interrupts disabled. + */ + frame->flags = X86_EFLAGS_FIXED; frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; @@ -234,7 +241,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -267,9 +275,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before fpu__restore(), so the TS bit is up - * to date. + * the GDT and LDT are properly updated. */ arch_end_context_switch(next_p); @@ -290,10 +296,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, cpu); - this_cpu_write(current_task, next_p); + switch_fpu_finish(next_fpu); + /* Load the Intel cache allocation PQR MSR. */ resctrl_sched_in(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 6a62f4af9fcf..f8e1af380cdf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -392,6 +392,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; + frame->bp = 0; frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; @@ -520,7 +521,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(irq_count) != -1); - switch_fpu_prepare(prev_fpu, cpu); + if (!test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -538,9 +540,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Leave lazy mode, flushing any hypercalls made here. This * must be done after loading TLS entries in the GDT but before - * loading segments that might reference them, and and it must - * be done before fpu__restore(), so the TS bit is up to - * date. 
+ * loading segments that might reference them. */ arch_end_context_switch(next_p); @@ -568,14 +568,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) x86_fsgsbase_load(prev, next); - switch_fpu_finish(next_fpu, cpu); - /* * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); + switch_fpu_finish(next_fpu); + /* Reload sp0. */ update_task_stack(next_p); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 725624b6c0c0..09d6bded3c1e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -81,6 +81,19 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +/* + * Some machines don't handle the default ACPI reboot method and + * require the EFI reboot method: + */ +static int __init set_efi_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_EFI && !efi_runtime_disabled()) { + reboot_type = BOOT_EFI; + pr_info("%s series board detected. Selecting EFI-method for reboot.\n", d->ident); + } + return 0; +} + void __noreturn machine_real_restart(unsigned int type) { local_irq_disable(); @@ -108,7 +121,7 @@ void __noreturn machine_real_restart(unsigned int type) write_cr3(real_mode_header->trampoline_pgd); /* Exiting long mode will fail if CR4.PCIDE is set. */ - if (static_cpu_has(X86_FEATURE_PCID)) + if (boot_cpu_has(X86_FEATURE_PCID)) cr4_clear_bits(X86_CR4_PCIDE); #endif @@ -166,6 +179,14 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = { DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"), }, }, + { /* Handle reboot issue on Acer TravelMate X514-51T */ + .callback = set_efi_reboot, + .ident = "Acer TravelMate X514-51T", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate X514-51T"), + }, + }, /* Apple */ { /* Handle problems with rebooting on Apple MacBook5 */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3d872a527cd9..905dae880563 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -71,6 +71,7 @@ #include <linux/tboot.h> #include <linux/jiffies.h> #include <linux/mem_encrypt.h> +#include <linux/sizes.h> #include <linux/usb/xhci-dbgp.h> #include <video/edid.h> @@ -448,18 +449,17 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC_CORE /* 16M alignment for crash kernel regions */ -#define CRASH_ALIGN (16 << 20) +#define CRASH_ALIGN SZ_16M /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. - * On 64bit, old kexec-tools need to under 896MiB. */ #ifdef CONFIG_X86_32 -# define CRASH_ADDR_LOW_MAX (512 << 20) -# define CRASH_ADDR_HIGH_MAX (512 << 20) +# define CRASH_ADDR_LOW_MAX SZ_512M +# define CRASH_ADDR_HIGH_MAX SZ_512M #else -# define CRASH_ADDR_LOW_MAX (896UL << 20) +# define CRASH_ADDR_LOW_MAX SZ_4G # define CRASH_ADDR_HIGH_MAX MAXMEM #endif @@ -541,21 +541,27 @@ static void __init reserve_crashkernel(void) } /* 0 means: find the address automatically */ - if (crash_base <= 0) { + if (!crash_base) { /* * Set CRASH_ADDR_LOW_MAX upper bound for crash memory, - * as old kexec-tools loads bzImage below that, unless - * "crashkernel=size[KMG],high" is specified. + * crashkernel=x,high reserves memory over 4G, also allocates + * 256M extra low memory for DMA buffers and swiotlb. + * But the extra memory is not required for all machines. 
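/*
 * Editor's sketch, not part of this patch: the low-then-high fallback
 * reservation implemented above, isolated. The CRASH_* constants are the
 * ones defined earlier in this file.
 */
static phys_addr_t find_crash_base(unsigned long long crash_size, bool high)
{
	phys_addr_t base = 0;

	if (!high)	/* prefer low memory: no extra swiotlb buffers needed */
		base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_LOW_MAX,
					      crash_size, CRASH_ALIGN);
	if (!base)	/* fall back (or go straight) to high memory */
		base = memblock_find_in_range(CRASH_ALIGN, CRASH_ADDR_HIGH_MAX,
					      crash_size, CRASH_ALIGN);
	return base;	/* 0 means no suitable area was found */
}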
+ * So try low memory first and fall back to high memory + * unless "crashkernel=size[KMG],high" is specified. */ - crash_base = memblock_find_in_range(CRASH_ALIGN, - high ? CRASH_ADDR_HIGH_MAX - : CRASH_ADDR_LOW_MAX, - crash_size, CRASH_ALIGN); + if (!high) + crash_base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_LOW_MAX, + crash_size, CRASH_ALIGN); + if (!crash_base) + crash_base = memblock_find_in_range(CRASH_ALIGN, + CRASH_ADDR_HIGH_MAX, + crash_size, CRASH_ALIGN); if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } - } else { unsigned long long start; @@ -1005,13 +1011,11 @@ void __init setup_arch(char **cmdline_p) if (efi_enabled(EFI_BOOT)) efi_init(); - dmi_scan_machine(); - dmi_memdev_walk(); - dmi_set_dump_stack_arch_desc(); + dmi_setup(); /* * VMware detection requires dmi to be available, so this - * needs to be done after dmi_scan_machine(), for the boot CPU. + * needs to be done after dmi_setup(), for the boot CPU. */ init_hypervisor_platform(); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 4bf46575568a..86663874ef04 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -244,11 +244,6 @@ void __init setup_per_cpu_areas(void) per_cpu(x86_cpu_to_logical_apicid, cpu) = early_per_cpu_map(x86_cpu_to_logical_apicid, cpu); #endif -#ifdef CONFIG_X86_64 - per_cpu(irq_stack_ptr, cpu) = - per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE; -#endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 08dfd4c1a4f9..364813cea647 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -132,16 +132,6 @@ static int restore_sigcontext(struct pt_regs *regs, COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#ifdef CONFIG_X86_64 - /* - * Fix up SS if needed for the benefit of old DOSEMU and - * CRIU. - */ - if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && - user_64bit_mode(regs))) - force_valid_ss(regs); -#endif - get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ @@ -150,6 +140,15 @@ static int restore_sigcontext(struct pt_regs *regs, buf = (void __user *)buf_val; } get_user_catch(err); +#ifdef CONFIG_X86_64 + /* + * Fix up SS if needed for the benefit of old DOSEMU and + * CRIU. + */ + if (unlikely(!(uc_flags & UC_STRICT_RESTORE_SS) && user_64bit_mode(regs))) + force_valid_ss(regs); +#endif + err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32)); force_iret(); @@ -206,7 +205,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ - put_user_ex(fpstate, &sc->fpstate); + put_user_ex(fpstate, (unsigned long __user *)&sc->fpstate); /* non-iBCS2 extensions.. 
*/ put_user_ex(mask, &sc->oldmask); @@ -246,7 +245,7 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, unsigned long sp = regs->sp; unsigned long buf_fx = 0; int onsigstack = on_sig_stack(sp); - struct fpu *fpu = ¤t->thread.fpu; + int ret; /* redzone */ if (IS_ENABLED(CONFIG_X86_64)) @@ -265,11 +264,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, sp = (unsigned long) ka->sa.sa_restorer; } - if (fpu->initialized) { - sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), - &buf_fx, &math_size); - *fpstate = (void __user *)sp; - } + sp = fpu__alloc_mathframe(sp, IS_ENABLED(CONFIG_X86_32), + &buf_fx, &math_size); + *fpstate = (void __user *)sp; sp = align_sigframe(sp - frame_size); @@ -281,8 +278,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, return (void __user *)-1L; /* save i387 and extended state */ - if (fpu->initialized && - copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size) < 0) + ret = copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size); + if (ret < 0) return (void __user *)-1L; return (void __user *)sp; @@ -461,6 +458,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, { struct rt_sigframe __user *frame; void __user *fp = NULL; + unsigned long uc_flags; int err = 0; frame = get_sigframe(&ksig->ka, regs, sizeof(struct rt_sigframe), &fp); @@ -473,9 +471,11 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); save_altstack_ex(&frame->uc.uc_stack, regs->sp); @@ -541,6 +541,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, { #ifdef CONFIG_X86_X32_ABI struct rt_sigframe_x32 __user *frame; + unsigned long uc_flags; void __user *restorer; int err = 0; void __user *fpstate = NULL; @@ -555,9 +556,11 @@ static int x32_setup_rt_frame(struct ksignal *ksig, return -EFAULT; } + uc_flags = frame_uc_flags(regs); + put_user_try { /* Create the ucontext. */ - put_user_ex(frame_uc_flags(regs), &frame->uc.uc_flags); + put_user_ex(uc_flags, &frame->uc.uc_flags); put_user_ex(0, &frame->uc.uc_link); compat_save_altstack_ex(&frame->uc.uc_stack, regs->sp); put_user_ex(0, &frame->uc.uc__pad0); @@ -569,7 +572,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig, restorer = NULL; err |= -EFAULT; } - put_user_ex(restorer, &frame->pretcode); + put_user_ex(restorer, (unsigned long __user *)&frame->pretcode); } put_user_catch(err); err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, @@ -688,10 +691,7 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs) sigset_t *set = sigmask_to_save(); compat_sigset_t *cset = (compat_sigset_t *) set; - /* - * Increment event counter and perform fixup for the pre-signal - * frame. - */ + /* Perform fixup for the pre-signal frame. */ rseq_signal_deliver(ksig, regs); /* Set up the stack frame */ @@ -763,8 +763,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) /* * Ensure the signal handler starts with the new fpu state. 
*/ - if (fpu->initialized) - fpu__clear(fpu); + fpu__clear(fpu); } signal_setup_done(failed, ksig, stepping); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index ce1a67b70168..73e69aaaa117 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -455,7 +455,7 @@ static bool match_llc(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) * multicore group inside a NUMA node. If this happens, we will * discard the MC level of the topology later. */ -static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) +static bool match_pkg(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) { if (c->phys_proc_id == o->phys_proc_id) return true; @@ -546,7 +546,7 @@ void set_cpu_sibling_map(int cpu) for_each_cpu(i, cpu_sibling_setup_mask) { o = &cpu_data(i); - if ((i == cpu) || (has_mp && match_die(c, o))) { + if ((i == cpu) || (has_mp && match_pkg(c, o))) { link_mask(topology_core_cpumask, cpu, i); /* @@ -570,7 +570,7 @@ void set_cpu_sibling_map(int cpu) } else if (i != cpu && !c->booted_cores) c->booted_cores = cpu_data(i).booted_cores; } - if (match_die(c, o) && !topology_same_node(c, o)) + if (match_pkg(c, o) && !topology_same_node(c, o)) x86_has_numa_in_package = true; } @@ -935,20 +935,27 @@ out: return boot_error; } -void common_cpu_up(unsigned int cpu, struct task_struct *idle) +int common_cpu_up(unsigned int cpu, struct task_struct *idle) { + int ret; + /* Just in case we booted with a single CPU. */ alternatives_enable_smp(); per_cpu(current_task, cpu) = idle; + /* Initialize the interrupt stack(s) */ + ret = irq_init_percpu_irqstack(cpu); + if (ret) + return ret; + #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle); #else initial_gs = per_cpu_offset(cpu); #endif + return 0; } /* @@ -1106,7 +1113,9 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) /* the FPU context is blank, nobody can own it */ per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; - common_cpu_up(cpu, tidle); + err = common_cpu_up(cpu, tidle); + if (err) + return err; err = do_boot_cpu(apicid, cpu, tidle, &cpu0_nmi_registered); if (err) { diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 5c2d71a1dc06..2abf27d7df6b 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -12,78 +12,31 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -static int save_stack_address(struct stack_trace *trace, unsigned long addr, - bool nosched) -{ - if (nosched && in_sched_functions(addr)) - return 0; - - if (trace->skip > 0) { - trace->skip--; - return 0; - } - - if (trace->nr_entries >= trace->max_entries) - return -1; - - trace->entries[trace->nr_entries++] = addr; - return 0; -} - -static void noinline __save_stack_trace(struct stack_trace *trace, - struct task_struct *task, struct pt_regs *regs, - bool nosched) +void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, + struct task_struct *task, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; - if (regs) - save_stack_address(trace, regs->ip, nosched); + if (regs && !consume_entry(cookie, regs->ip, false)) + return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); - if (!addr || save_stack_address(trace, addr, nosched)) + if (!addr || !consume_entry(cookie, addr, false)) break; } - - if (trace->nr_entries < trace->max_entries) - 
trace->entries[trace->nr_entries++] = ULONG_MAX; } /* - * Save stack-backtrace addresses into a stack_trace buffer. + * This function returns an error if it detects any unreliable features of the + * stack. Otherwise it guarantees that the stack trace is reliable. + * + * If the task is not 'current', the caller *must* ensure the task is inactive. */ -void save_stack_trace(struct stack_trace *trace) -{ - trace->skip++; - __save_stack_trace(trace, current, NULL, false); -} -EXPORT_SYMBOL_GPL(save_stack_trace); - -void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) -{ - __save_stack_trace(trace, current, regs, false); -} - -void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) -{ - if (!try_get_task_stack(tsk)) - return; - - if (tsk == current) - trace->skip++; - __save_stack_trace(trace, tsk, NULL, true); - - put_task_stack(tsk); -} -EXPORT_SYMBOL_GPL(save_stack_trace_tsk); - -#ifdef CONFIG_HAVE_RELIABLE_STACKTRACE - -static int __always_inline -__save_stack_trace_reliable(struct stack_trace *trace, - struct task_struct *task) +int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; @@ -97,7 +50,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (regs) { /* Success path for user tasks */ if (user_mode(regs)) - goto success; + return 0; /* * Kernel mode registers on the stack indicate an @@ -120,7 +73,7 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!addr) return -EINVAL; - if (save_stack_address(trace, addr, false)) + if (!consume_entry(cookie, addr, false)) return -EINVAL; } @@ -132,39 +85,9 @@ __save_stack_trace_reliable(struct stack_trace *trace, if (!(task->flags & (PF_KTHREAD | PF_IDLE))) return -EINVAL; -success: - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; - return 0; } -/* - * This function returns an error if it detects any unreliable features of the - * stack. Otherwise it guarantees that the stack trace is reliable. - * - * If the task is not 'current', the caller *must* ensure the task is inactive. - */ -int save_stack_trace_tsk_reliable(struct task_struct *tsk, - struct stack_trace *trace) -{ - int ret; - - /* - * If the task doesn't have a stack (e.g., a zombie), the stack is - * "reliably" empty. 
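/*
 * Editor's sketch, not part of this patch: a consume_entry callback for
 * the new arch_stack_walk() interface above; returning false stops the
 * walk. 'struct trace_buf' is an illustrative cookie type.
 */
struct trace_buf {
	unsigned long *entries;
	unsigned int nr, max;
};

static bool store_entry(void *cookie, unsigned long addr, bool reliable)
{
	struct trace_buf *buf = cookie;

	if (buf->nr >= buf->max)
		return false;		/* buffer full, stop walking */
	buf->entries[buf->nr++] = addr;
	return true;
}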
- */ - if (!try_get_task_stack(tsk)) - return 0; - - ret = __save_stack_trace_reliable(trace, tsk); - - put_task_stack(tsk); - - return ret; -} -#endif /* CONFIG_HAVE_RELIABLE_STACKTRACE */ - /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { @@ -189,15 +112,15 @@ copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) return ret; } -static inline void __save_stack_trace_user(struct stack_trace *trace) +void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, + const struct pt_regs *regs) { - const struct pt_regs *regs = task_pt_regs(current); const void __user *fp = (const void __user *)regs->bp; - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = regs->ip; + if (!consume_entry(cookie, regs->ip, false)) + return; - while (trace->nr_entries < trace->max_entries) { + while (1) { struct stack_frame_user frame; frame.next_fp = NULL; @@ -207,8 +130,8 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) if ((unsigned long)fp < regs->sp) break; if (frame.ret_addr) { - trace->entries[trace->nr_entries++] = - frame.ret_addr; + if (!consume_entry(cookie, frame.ret_addr, false)) + return; } if (fp == frame.next_fp) break; @@ -216,14 +139,3 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) } } -void save_stack_trace_user(struct stack_trace *trace) -{ - /* - * Trace user stack if we are not a kernel thread - */ - if (current->mm) { - __save_stack_trace_user(trace); - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; -} diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 738bf42b0218..be5bc2e47c71 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -71,7 +71,7 @@ int _debug_hotplug_cpu(int cpu, int action) case 0: ret = cpu_down(cpu); if (!ret) { - pr_info("CPU %u is now offline\n", cpu); + pr_info("DEBUG_HOTPLUG_CPU0: CPU %u is now offline\n", cpu); dev->offline = true; kobject_uevent(&dev->kobj, KOBJ_OFFLINE); } else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d26f9e9c3d83..8b6d03e55d2f 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -456,7 +456,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) * which is all zeros which indicates MPX was not * responsible for the exception. */ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) goto exit_trap; diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3fae23834069..15b5e98a86f9 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -185,8 +185,7 @@ static void __init cyc2ns_init_boot_cpu(void) /* * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * speed as the bootup CPU. 
*/ static void __init cyc2ns_init_secondary_cpus(void) { @@ -283,6 +282,7 @@ int __init notsc_setup(char *str) __setup("notsc", notsc_setup); static int no_sched_irq_time; +static int no_tsc_watchdog; static int __init tsc_setup(char *str) { @@ -292,6 +292,8 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); + if (!strcmp(str, "nowatchdog")) + no_tsc_watchdog = 1; return 1; } @@ -937,12 +939,12 @@ void tsc_restore_sched_clock_state(void) } #ifdef CONFIG_CPU_FREQ -/* Frequency scaling support. Adjust the TSC based timer when the cpu frequency +/* + * Frequency scaling support. Adjust the TSC based timer when the CPU frequency * changes. * - * RED-PEN: On SMP we assume all CPUs run with the same frequency. It's - * not that important because current Opteron setups do not support - * scaling on SMP anyroads. + * NOTE: On SMP the situation is not fixable in general, so simply mark the TSC + * as unstable and give up in those cases. * * Should fix up last_tsc too. Currently gettimeofday in the * first tick after the change will be slightly wrong. @@ -956,22 +958,22 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) { struct cpufreq_freqs *freq = data; - unsigned long *lpj; - lpj = &boot_cpu_data.loops_per_jiffy; -#ifdef CONFIG_SMP - if (!(freq->flags & CPUFREQ_CONST_LOOPS)) - lpj = &cpu_data(freq->cpu).loops_per_jiffy; -#endif + if (num_online_cpus() > 1) { + mark_tsc_unstable("cpufreq changes on SMP"); + return 0; + } if (!ref_freq) { ref_freq = freq->old; - loops_per_jiffy_ref = *lpj; + loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; tsc_khz_ref = tsc_khz; } + if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || - (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + (val == CPUFREQ_POSTCHANGE && freq->old > freq->new)) { + boot_cpu_data.loops_per_jiffy = + cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) @@ -1349,7 +1351,7 @@ static int __init init_tsc_clocksource(void) if (tsc_unstable) goto unreg; - if (tsc_clocksource_reliable) + if (tsc_clocksource_reliable || no_tsc_watchdog) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3)) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index a092b6b40c6b..6a38717d179c 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -369,7 +369,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) preempt_disable(); tsk->thread.sp0 += 16; - if (static_cpu_has(X86_FEATURE_SEP)) { + if (boot_cpu_has(X86_FEATURE_SEP)) { tsk->thread.sysenter_cs = 0; refresh_sysenter_cs(&tsk->thread); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index bad8c51fee6e..0850b5149345 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -141,11 +141,11 @@ SECTIONS *(.text.__x86.indirect_thunk) __indirect_thunk_end = .; #endif - - /* End of text section */ - _etext = .; } :text = 0x9090 + /* End of text section */ + _etext = .; + NOTES :text :note EXCEPTION_TABLE(16) :text = 0x9090 @@ -362,7 +362,7 @@ SECTIONS .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; *(.bss..page_aligned) - *(.bss) + *(BSS_MAIN) BSS_DECRYPTED . 
= ALIGN(PAGE_SIZE); __bss_stop = .; @@ -403,7 +403,8 @@ SECTIONS */ #define INIT_PER_CPU(x) init_per_cpu__##x = ABSOLUTE(x) + __per_cpu_load INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); +INIT_PER_CPU(fixed_percpu_data); +INIT_PER_CPU(irq_stack_backing_store); /* * Build-time check on the image size: @@ -412,8 +413,8 @@ INIT_PER_CPU(irq_stack_union); "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((fixed_percpu_data == 0), + "fixed_percpu_data is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index c338984c850d..d0d5dd44b4f4 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2331,24 +2331,18 @@ static int em_lseg(struct x86_emulate_ctxt *ctxt) static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt) { +#ifdef CONFIG_X86_64 u32 eax, ebx, ecx, edx; eax = 0x80000001; ecx = 0; ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx, false); return edx & bit(X86_FEATURE_LM); +#else + return false; +#endif } -#define GET_SMSTATE(type, smbase, offset) \ - ({ \ - type __val; \ - int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val, \ - sizeof(__val)); \ - if (r != X86EMUL_CONTINUE) \ - return X86EMUL_UNHANDLEABLE; \ - __val; \ - }) - static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) { desc->g = (flags >> 23) & 1; @@ -2361,27 +2355,30 @@ static void rsm_set_desc_flags(struct desc_struct *desc, u32 flags) desc->type = (flags >> 8) & 15; } -static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +static int rsm_load_seg_32(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) { struct desc_struct desc; int offset; u16 selector; - selector = GET_SMSTATE(u32, smbase, 0x7fa8 + n * 4); + selector = GET_SMSTATE(u32, smstate, 0x7fa8 + n * 4); if (n < 3) offset = 0x7f84 + n * 12; else offset = 0x7f2c + (n - 3) * 12; - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, offset)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, offset)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, n); return X86EMUL_CONTINUE; } -static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) +#ifdef CONFIG_X86_64 +static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, const char *smstate, + int n) { struct desc_struct desc; int offset; @@ -2390,15 +2387,16 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) offset = 0x7e00 + n * 16; - selector = GET_SMSTATE(u16, smbase, offset); - rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smbase, offset + 2) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, offset + 4)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, offset + 8)); - base3 = GET_SMSTATE(u32, smbase, offset + 12); + selector = GET_SMSTATE(u16, smstate, offset); + rsm_set_desc_flags(&desc, GET_SMSTATE(u16, smstate, offset + 2) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, offset + 4)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, offset + 8)); + base3 = GET_SMSTATE(u32, smstate, offset + 12); ctxt->ops->set_segment(ctxt, selector, &desc, base3, n); return X86EMUL_CONTINUE; } +#endif static int rsm_enter_protected_mode(struct 
x86_emulate_ctxt *ctxt, u64 cr0, u64 cr3, u64 cr4) @@ -2445,7 +2443,8 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) +static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, + const char *smstate) { struct desc_struct desc; struct desc_ptr dt; @@ -2453,53 +2452,55 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) u32 val, cr0, cr3, cr4; int i; - cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; - ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); + cr0 = GET_SMSTATE(u32, smstate, 0x7ffc); + cr3 = GET_SMSTATE(u32, smstate, 0x7ff8); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7ff4) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u32, smstate, 0x7ff0); for (i = 0; i < 8; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u32, smbase, 0x7fd0 + i * 4); + *reg_write(ctxt, i) = GET_SMSTATE(u32, smstate, 0x7fd0 + i * 4); - val = GET_SMSTATE(u32, smbase, 0x7fcc); + val = GET_SMSTATE(u32, smstate, 0x7fcc); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7fc8); + val = GET_SMSTATE(u32, smstate, 0x7fc8); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - selector = GET_SMSTATE(u32, smbase, 0x7fc4); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f64)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f60)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f5c)); + selector = GET_SMSTATE(u32, smstate, 0x7fc4); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f64)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f60)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f5c)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_TR); - selector = GET_SMSTATE(u32, smbase, 0x7fc0); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7f80)); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7f7c)); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7f78)); + selector = GET_SMSTATE(u32, smstate, 0x7fc0); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7f80)); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7f7c)); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7f78)); ctxt->ops->set_segment(ctxt, selector, &desc, 0, VCPU_SREG_LDTR); - dt.address = GET_SMSTATE(u32, smbase, 0x7f74); - dt.size = GET_SMSTATE(u32, smbase, 0x7f70); + dt.address = GET_SMSTATE(u32, smstate, 0x7f74); + dt.size = GET_SMSTATE(u32, smstate, 0x7f70); ctxt->ops->set_gdt(ctxt, &dt); - dt.address = GET_SMSTATE(u32, smbase, 0x7f58); - dt.size = GET_SMSTATE(u32, smbase, 0x7f54); + dt.address = GET_SMSTATE(u32, smstate, 0x7f58); + dt.size = GET_SMSTATE(u32, smstate, 0x7f54); ctxt->ops->set_idt(ctxt, &dt); for (i = 0; i < 6; i++) { - int r = rsm_load_seg_32(ctxt, smbase, i); + int r = rsm_load_seg_32(ctxt, smstate, i); if (r != X86EMUL_CONTINUE) return r; } - cr4 = GET_SMSTATE(u32, smbase, 0x7f14); + cr4 = GET_SMSTATE(u32, smstate, 0x7f14); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7ef8)); return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } -static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) +#ifdef CONFIG_X86_64 +static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, + const char *smstate) { struct desc_struct desc; struct desc_ptr dt; @@ -2509,43 +2510,43 @@ static int 
rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) int i, r; for (i = 0; i < 16; i++) - *reg_write(ctxt, i) = GET_SMSTATE(u64, smbase, 0x7ff8 - i * 8); + *reg_write(ctxt, i) = GET_SMSTATE(u64, smstate, 0x7ff8 - i * 8); - ctxt->_eip = GET_SMSTATE(u64, smbase, 0x7f78); - ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7f70) | X86_EFLAGS_FIXED; + ctxt->_eip = GET_SMSTATE(u64, smstate, 0x7f78); + ctxt->eflags = GET_SMSTATE(u32, smstate, 0x7f70) | X86_EFLAGS_FIXED; - val = GET_SMSTATE(u32, smbase, 0x7f68); + val = GET_SMSTATE(u32, smstate, 0x7f68); ctxt->ops->set_dr(ctxt, 6, (val & DR6_VOLATILE) | DR6_FIXED_1); - val = GET_SMSTATE(u32, smbase, 0x7f60); + val = GET_SMSTATE(u32, smstate, 0x7f60); ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); - cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - cr3 = GET_SMSTATE(u64, smbase, 0x7f50); - cr4 = GET_SMSTATE(u64, smbase, 0x7f48); - ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); - val = GET_SMSTATE(u64, smbase, 0x7ed0); + cr0 = GET_SMSTATE(u64, smstate, 0x7f58); + cr3 = GET_SMSTATE(u64, smstate, 0x7f50); + cr4 = GET_SMSTATE(u64, smstate, 0x7f48); + ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smstate, 0x7f00)); + val = GET_SMSTATE(u64, smstate, 0x7ed0); ctxt->ops->set_msr(ctxt, MSR_EFER, val & ~EFER_LMA); - selector = GET_SMSTATE(u32, smbase, 0x7e90); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e92) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e94)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e98)); - base3 = GET_SMSTATE(u32, smbase, 0x7e9c); + selector = GET_SMSTATE(u32, smstate, 0x7e90); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e92) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e94)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e98)); + base3 = GET_SMSTATE(u32, smstate, 0x7e9c); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_TR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e84); - dt.address = GET_SMSTATE(u64, smbase, 0x7e88); + dt.size = GET_SMSTATE(u32, smstate, 0x7e84); + dt.address = GET_SMSTATE(u64, smstate, 0x7e88); ctxt->ops->set_idt(ctxt, &dt); - selector = GET_SMSTATE(u32, smbase, 0x7e70); - rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smbase, 0x7e72) << 8); - set_desc_limit(&desc, GET_SMSTATE(u32, smbase, 0x7e74)); - set_desc_base(&desc, GET_SMSTATE(u32, smbase, 0x7e78)); - base3 = GET_SMSTATE(u32, smbase, 0x7e7c); + selector = GET_SMSTATE(u32, smstate, 0x7e70); + rsm_set_desc_flags(&desc, GET_SMSTATE(u32, smstate, 0x7e72) << 8); + set_desc_limit(&desc, GET_SMSTATE(u32, smstate, 0x7e74)); + set_desc_base(&desc, GET_SMSTATE(u32, smstate, 0x7e78)); + base3 = GET_SMSTATE(u32, smstate, 0x7e7c); ctxt->ops->set_segment(ctxt, selector, &desc, base3, VCPU_SREG_LDTR); - dt.size = GET_SMSTATE(u32, smbase, 0x7e64); - dt.address = GET_SMSTATE(u64, smbase, 0x7e68); + dt.size = GET_SMSTATE(u32, smstate, 0x7e64); + dt.address = GET_SMSTATE(u64, smstate, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); @@ -2553,37 +2554,49 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) return r; for (i = 0; i < 6; i++) { - r = rsm_load_seg_64(ctxt, smbase, i); + r = rsm_load_seg_64(ctxt, smstate, i); if (r != X86EMUL_CONTINUE) return r; } return X86EMUL_CONTINUE; } +#endif static int em_rsm(struct x86_emulate_ctxt *ctxt) { unsigned long cr0, cr4, efer; + char buf[512]; u64 smbase; int ret; if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); + smbase = 
ctxt->ops->get_smbase(ctxt); + + ret = ctxt->ops->read_phys(ctxt, smbase + 0xfe00, buf, sizeof(buf)); + if (ret != X86EMUL_CONTINUE) + return X86EMUL_UNHANDLEABLE; + + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + ctxt->ops->set_nmi_mask(ctxt, false); + + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); + /* * Get back to real mode, to prepare a safe state in which to load * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU * supports long mode. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); if (emulator_has_longmode(ctxt)) { struct desc_struct cs_desc; /* Zero CR4.PCIDE before CR0.PG. */ - if (cr4 & X86_CR4_PCIDE) { + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PCIDE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - cr4 &= ~X86_CR4_PCIDE; - } /* A 32-bit code segment is required to clear EFER.LMA. */ memset(&cs_desc, 0, sizeof(cs_desc)); @@ -2597,39 +2610,39 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + if (emulator_has_longmode(ctxt)) { + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - smbase = ctxt->ops->get_smbase(ctxt); + /* And finally go back to 32-bit mode. */ + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + } /* * Give pre_leave_smm() a chance to make ISA-specific changes to the * vCPU state (e.g. enter guest mode) before loading state from the SMM * state-save area. */ - if (ctxt->ops->pre_leave_smm(ctxt, smbase)) + if (ctxt->ops->pre_leave_smm(ctxt, buf)) return X86EMUL_UNHANDLEABLE; +#ifdef CONFIG_X86_64 if (emulator_has_longmode(ctxt)) - ret = rsm_load_state_64(ctxt, smbase + 0x8000); + ret = rsm_load_state_64(ctxt, buf); else - ret = rsm_load_state_32(ctxt, smbase + 0x8000); +#endif + ret = rsm_load_state_32(ctxt, buf); if (ret != X86EMUL_CONTINUE) { /* FIXME: should triple fault */ return X86EMUL_UNHANDLEABLE; } - if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) - ctxt->ops->set_nmi_mask(ctxt, false); + ctxt->ops->post_leave_smm(ctxt); - ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & - ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 421899f6ad7b..cc24b3a32c44 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1371,7 +1371,16 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *current_vcpu, u64 ingpa, valid_bank_mask = BIT_ULL(0); sparse_banks[0] = flush.processor_mask; - all_cpus = flush.flags & HV_FLUSH_ALL_PROCESSORS; + + /* + * Work around possible WS2012 bug: it sends hypercalls + * with processor_mask = 0x0 and HV_FLUSH_ALL_PROCESSORS clear, + * while also expecting us to flush something and crashing if + * we don't. Let's treat processor_mask == 0 same as + * HV_FLUSH_ALL_PROCESSORS. 
+ */ + all_cpus = (flush.flags & HV_FLUSH_ALL_PROCESSORS) || + flush.processor_mask == 0; } else { if (unlikely(kvm_read_guest(kvm, ingpa, &flush_ex, sizeof(flush_ex)))) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 991fdf7fc17f..bd13fdddbdc4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -70,7 +70,6 @@ #define APIC_BROADCAST 0xFF #define X2APIC_BROADCAST 0xFFFFFFFFul -static bool lapic_timer_advance_adjust_done = false; #define LAPIC_TIMER_ADVANCE_ADJUST_DONE 100 /* step-by-step approximation to mitigate fluctuation */ #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8 @@ -138,6 +137,7 @@ static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map, if (offset <= max_apic_id) { u8 cluster_size = min(max_apic_id - offset + 1, 16U); + offset = array_index_nospec(offset, map->max_apic_id + 1); *cluster = &map->phys_map[offset]; *mask = dest_id & (0xffff >> (16 - cluster_size)); } else { @@ -901,7 +901,8 @@ static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm, if (irq->dest_id > map->max_apic_id) { *bitmap = 0; } else { - *dst = &map->phys_map[irq->dest_id]; + u32 dest_id = array_index_nospec(irq->dest_id, map->max_apic_id + 1); + *dst = &map->phys_map[dest_id]; *bitmap = 1; } return true; @@ -1480,14 +1481,32 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu) return false; } +static inline void __wait_lapic_expire(struct kvm_vcpu *vcpu, u64 guest_cycles) +{ + u64 timer_advance_ns = vcpu->arch.apic->lapic_timer.timer_advance_ns; + + /* + * If the guest TSC is running at a different ratio than the host, then + * convert the delay to nanoseconds to achieve an accurate delay. Note + * that __delay() uses delay_tsc whenever the hardware has TSC, thus + * always for VMX enabled hardware. + */ + if (vcpu->arch.tsc_scaling_ratio == kvm_default_tsc_scaling_ratio) { + __delay(min(guest_cycles, + nsec_to_cycles(vcpu, timer_advance_ns))); + } else { + u64 delay_ns = guest_cycles * 1000000ULL; + do_div(delay_ns, vcpu->arch.virtual_tsc_khz); + ndelay(min_t(u32, delay_ns, timer_advance_ns)); + } +} + void wait_lapic_expire(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns; u64 guest_tsc, tsc_deadline, ns; - if (!lapic_in_kernel(vcpu)) - return; - if (apic->lapic_timer.expired_tscdeadline == 0) return; @@ -1499,33 +1518,37 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); - /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ if (guest_tsc < tsc_deadline) - __delay(min(tsc_deadline - guest_tsc, - nsec_to_cycles(vcpu, lapic_timer_advance_ns))); + __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc); - if (!lapic_timer_advance_adjust_done) { + if (!apic->lapic_timer.timer_advance_adjust_done) { /* too early */ if (guest_tsc < tsc_deadline) { ns = (tsc_deadline - guest_tsc) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns -= min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns -= min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } else { /* too late */ ns = (guest_tsc - tsc_deadline) * 1000000ULL; do_div(ns, vcpu->arch.virtual_tsc_khz); - lapic_timer_advance_ns += min((unsigned int)ns, - lapic_timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); + timer_advance_ns += min((u32)ns, + timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP); } if (abs(guest_tsc - tsc_deadline) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) - lapic_timer_advance_adjust_done = true; + apic->lapic_timer.timer_advance_adjust_done = true; + if (unlikely(timer_advance_ns > 5000)) { + timer_advance_ns = 0; + apic->lapic_timer.timer_advance_adjust_done = true; + } + apic->lapic_timer.timer_advance_ns = timer_advance_ns; } } static void start_sw_tscdeadline(struct kvm_lapic *apic) { - u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + struct kvm_timer *ktimer = &apic->lapic_timer; + u64 guest_tsc, tscdeadline = ktimer->tscdeadline; u64 ns = 0; ktime_t expire; struct kvm_vcpu *vcpu = apic->vcpu; @@ -1540,13 +1563,15 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); - if (likely(tscdeadline > guest_tsc)) { - ns = (tscdeadline - guest_tsc) * 1000000ULL; - do_div(ns, this_tsc_khz); + + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + + if (likely(tscdeadline > guest_tsc) && + likely(ns > apic->lapic_timer.timer_advance_ns)) { expire = ktime_add_ns(now, ns); - expire = ktime_sub_ns(expire, lapic_timer_advance_ns); - hrtimer_start(&apic->lapic_timer.timer, - expire, HRTIMER_MODE_ABS_PINNED); + expire = ktime_sub_ns(expire, ktimer->timer_advance_ns); + hrtimer_start(&ktimer->timer, expire, HRTIMER_MODE_ABS_PINNED); } else apic_timer_expired(apic); @@ -2253,7 +2278,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) return HRTIMER_NORESTART; } -int kvm_create_lapic(struct kvm_vcpu *vcpu) +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) { struct kvm_lapic *apic; @@ -2277,6 +2302,14 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu) hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); apic->lapic_timer.timer.function = apic_timer_fn; + if (timer_advance_ns == -1) { + apic->lapic_timer.timer_advance_ns = 1000; + apic->lapic_timer.timer_advance_adjust_done = false; + } else { + apic->lapic_timer.timer_advance_ns = timer_advance_ns; + apic->lapic_timer.timer_advance_adjust_done = true; + } + /* * APIC is created enabled. 
This will prevent kvm_lapic_set_base from diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index ff6ef9c3d760..d6d049ba3045 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,8 +31,10 @@ struct kvm_timer { u32 timer_mode_mask; u64 tscdeadline; u64 expired_tscdeadline; + u32 timer_advance_ns; atomic_t pending; /* accumulated triggered timers */ bool hv_timer_in_use; + bool timer_advance_adjust_done; }; struct kvm_lapic { @@ -62,7 +64,7 @@ struct kvm_lapic { struct dest_map; -int kvm_create_lapic(struct kvm_vcpu *vcpu); +int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns); void kvm_free_lapic(struct kvm_vcpu *vcpu); int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index eee455a8a612..d9c7b45d231f 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2007,7 +2007,7 @@ static int is_empty_shadow_page(u64 *spt) * aggregate version in order to make the slab shrinker * faster */ -static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr) +static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr) { kvm->arch.n_used_mmu_pages += nr; percpu_counter_add(&kvm_total_used_mmu_pages, nr); @@ -2238,7 +2238,7 @@ static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm, struct list_head *invalid_list, bool remote_flush) { - if (!remote_flush && !list_empty(invalid_list)) + if (!remote_flush && list_empty(invalid_list)) return false; if (!list_empty(invalid_list)) @@ -2763,7 +2763,7 @@ static bool prepare_zap_oldest_mmu_page(struct kvm *kvm, * Changing the number of mmu pages allocated to the vm * Note: if goal_nr_mmu_pages is too small, you will get dead lock */ -void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages) +void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long goal_nr_mmu_pages) { LIST_HEAD(invalid_list); @@ -4781,6 +4781,7 @@ static union kvm_mmu_extended_role kvm_calc_mmu_role_ext(struct kvm_vcpu *vcpu) union kvm_mmu_extended_role ext = {0}; ext.cr0_pg = !!is_paging(vcpu); + ext.cr4_pae = !!is_pae(vcpu); ext.cr4_smep = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMEP); ext.cr4_smap = !!kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); ext.cr4_pse = !!is_pse(vcpu); @@ -6031,10 +6032,10 @@ out: /* * Calculate mmu pages needed for kvm. 
*/ -unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) +unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) { - unsigned int nr_mmu_pages; - unsigned int nr_pages = 0; + unsigned long nr_mmu_pages; + unsigned long nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int i; @@ -6047,8 +6048,7 @@ unsigned int kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm) } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; - nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index bbdc60f2fae8..54c2a377795b 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -64,7 +64,7 @@ bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu); int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code, u64 fault_address, char *insn, int insn_len); -static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) +static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm) { if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages) return kvm->arch.n_max_mmu_pages - diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 58ead7db71a3..e39741997893 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -281,9 +281,13 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data) { bool fast_mode = idx & (1u << 31); + struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); struct kvm_pmc *pmc; u64 ctr_val; + if (!pmu->version) + return 1; + if (is_vmware_backdoor_pmc(idx)) return kvm_pmu_rdpmc_vmware(vcpu, idx, data); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e0a791c3d4fc..406b558abfef 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -262,6 +262,7 @@ struct amd_svm_iommu_ir { }; #define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31 #define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) #define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) @@ -2692,6 +2693,7 @@ static int npf_interception(struct vcpu_svm *svm) static int db_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + struct kvm_vcpu *vcpu = &svm->vcpu; if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && @@ -2702,6 +2704,8 @@ static int db_interception(struct vcpu_svm *svm) if (svm->nmi_singlestep) { disable_nmi_singlestep(svm); + /* Make sure we check for pending NMIs upon entry */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } if (svm->vcpu.guest_debug & @@ -4517,14 +4521,25 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; struct kvm_lapic *apic = svm->vcpu.arch.apic; /* - * Update ICR high and low, then emulate sending IPI, - * which is handled when writing APIC_ICR. + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu. 
*/ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & KVM_APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & KVM_APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } break; } case AVIC_IPI_FAILURE_INVALID_TARGET: @@ -4596,7 +4611,7 @@ static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); if (entry) - WRITE_ONCE(*entry, (u32) ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK); + clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); } static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) @@ -5621,6 +5636,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) svm->vmcb->save.cr2 = vcpu->arch.cr2; clgi(); + kvm_load_guest_xcr0(vcpu); /* * If this vCPU has touched SPEC_CTRL, restore the guest's value if @@ -5766,6 +5782,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(&svm->vcpu); + kvm_put_guest_xcr0(vcpu); stgi(); /* Any pending NMI will happen here */ @@ -6215,32 +6232,24 @@ static int svm_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate) return 0; } -static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb *nested_vmcb; struct page *page; - struct { - u64 guest; - u64 vmcb; - } svm_state_save; - int ret; + u64 guest; + u64 vmcb; - ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save, - sizeof(svm_state_save)); - if (ret) - return ret; + guest = GET_SMSTATE(u64, smstate, 0x7ed8); + vmcb = GET_SMSTATE(u64, smstate, 0x7ee0); - if (svm_state_save.guest) { - vcpu->arch.hflags &= ~HF_SMM_MASK; - nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page); - if (nested_vmcb) - enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page); - else - ret = 1; - vcpu->arch.hflags |= HF_SMM_MASK; + if (guest) { + nested_vmcb = nested_svm_map(svm, vmcb, &page); + if (!nested_vmcb) + return 1; + enter_svm_guest_mode(svm, vmcb, nested_vmcb, page); } - return ret; + return 0; } static int enable_smi_window(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 6432d08c7de7..4d47a2631d1f 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -438,13 +438,13 @@ TRACE_EVENT(kvm_apic_ipi, ); TRACE_EVENT(kvm_apic_accept_irq, - TP_PROTO(__u32 apicid, __u16 dm, __u8 tm, __u8 vec), + TP_PROTO(__u32 apicid, __u16 dm, __u16 tm, __u8 vec), TP_ARGS(apicid, dm, tm, vec), TP_STRUCT__entry( __field( __u32, apicid ) __field( __u16, dm ) - __field( __u8, tm ) + __field( __u16, tm ) __field( __u8, vec ) ), diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 7ec9bb1dd723..0c601d079cd2 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2873,20 +2873,27 @@ static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu) /* * If translation failed, VM entry will fail because * prepare_vmcs02 set VIRTUAL_APIC_PAGE_ADDR to -1ull. - * Failing the vm entry is _not_ what the processor - * does but it's basically the only possibility we - * have. 
We could still enter the guest if CR8 load - * exits are enabled, CR8 store exits are enabled, and - * virtualize APIC access is disabled; in this case - * the processor would never use the TPR shadow and we - * could simply clear the bit from the execution - * control. But such a configuration is useless, so - * let's keep the code simple. */ if (!is_error_page(page)) { vmx->nested.virtual_apic_page = page; hpa = page_to_phys(vmx->nested.virtual_apic_page); vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, hpa); + } else if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING) && + nested_cpu_has(vmcs12, CPU_BASED_CR8_STORE_EXITING) && + !nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { + /* + * The processor will never use the TPR shadow, simply + * clear the bit from the execution control. Such a + * configuration is useless, but it happens in tests. + * For any other configuration, failing the vm entry is + * _not_ what the processor does but it's basically the + * only possibility we have. + */ + vmcs_clear_bits(CPU_BASED_VM_EXEC_CONTROL, + CPU_BASED_TPR_SHADOW); + } else { + printk("bad virtual-APIC page address\n"); + dump_vmcs(); } } @@ -3789,8 +3796,18 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) vmx_set_cr4(vcpu, vmcs_readl(CR4_READ_SHADOW)); nested_ept_uninit_mmu_context(vcpu); - vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); - __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + + /* + * This is only valid if EPT is in use, otherwise the vmcs01 GUEST_CR3 + * points to shadow pages! Fortunately we only get here after a WARN_ON + * if EPT is disabled, so a VMabort is perfectly fine. + */ + if (enable_ept) { + vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } else { + nested_vmx_abort(vcpu, VMX_ABORT_VMCS_CORRUPTED); + } /* * Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs @@ -5406,7 +5423,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, return ret; /* Empty 'VMXON' state is permitted */ - if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12)) + if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) return 0; if (kvm_state->vmx.vmcs_pa != -1ull) { @@ -5450,7 +5467,7 @@ static int vmx_set_nested_state(struct kvm_vcpu *vcpu, vmcs12->vmcs_link_pointer != -1ull) { struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu); - if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12)) + if (kvm_state->size < sizeof(*kvm_state) + 2 * sizeof(*vmcs12)) return -EINVAL; if (copy_from_user(shadow_vmcs12, @@ -5738,6 +5755,14 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *)) { int i; + /* + * Without EPT it is not possible to restore L1's CR3 and PDPTR on + * VMfail, because they are not available in vmcs01. Just always + * use hardware checks. + */ + if (!enable_ept) + nested_early_check = 1; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) { diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 7b272738c576..d4cb1945b2e3 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -3,6 +3,7 @@ #include <asm/asm.h> #include <asm/bitsperlong.h> #include <asm/kvm_vcpu_regs.h> +#include <asm/nospec-branch.h> #define WORD_SIZE (BITS_PER_LONG / 8) @@ -77,6 +78,17 @@ ENDPROC(vmx_vmenter) * referred to by VMCS.HOST_RIP. */ ENTRY(vmx_vmexit) +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE + /* Preserve guest's RAX, it's used to stuff the RSB. 
*/ + push %_ASM_AX + + /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */ + FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE + + pop %_ASM_AX +.Lvmexit_skip_rsb: +#endif ret ENDPROC(vmx_vmexit) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index ab432a930ae8..9663d41cc2bc 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -5603,7 +5603,7 @@ static void vmx_dump_dtsel(char *name, uint32_t limit) vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT)); } -static void dump_vmcs(void) +void dump_vmcs(void) { u32 vmentry_ctl = vmcs_read32(VM_ENTRY_CONTROLS); u32 vmexit_ctl = vmcs_read32(VM_EXIT_CONTROLS); @@ -6410,6 +6410,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) vmx_set_interrupt_shadow(vcpu, 0); + kvm_load_guest_xcr0(vcpu); + if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE) && vcpu->arch.pkru != vmx->host_pkru) @@ -6460,9 +6462,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); - /* Eliminate branch target predictions from guest mode */ - vmexit_fill_RSB(); - /* All fields are clean at this point */ if (static_branch_unlikely(&enable_evmcs)) current_evmcs->hv_clean_fields |= @@ -6501,11 +6500,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu) */ if (static_cpu_has(X86_FEATURE_PKU) && kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) { - vcpu->arch.pkru = __read_pkru(); + vcpu->arch.pkru = rdpkru(); if (vcpu->arch.pkru != vmx->host_pkru) __write_pkru(vmx->host_pkru); } + kvm_put_guest_xcr0(vcpu); + vmx->nested.nested_run_pending = 0; vmx->idt_vectoring_info = 0; @@ -6852,6 +6853,30 @@ static void nested_vmx_entry_exit_ctls_update(struct kvm_vcpu *vcpu) } } +static bool guest_cpuid_has_pmu(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *entry; + union cpuid10_eax eax; + + entry = kvm_find_cpuid_entry(vcpu, 0xa, 0); + if (!entry) + return false; + + eax.full = entry->eax; + return (eax.split.version_id > 0); +} + +static void nested_vmx_procbased_ctls_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + bool pmu_enabled = guest_cpuid_has_pmu(vcpu); + + if (pmu_enabled) + vmx->nested.msrs.procbased_ctls_high |= CPU_BASED_RDPMC_EXITING; + else + vmx->nested.msrs.procbased_ctls_high &= ~CPU_BASED_RDPMC_EXITING; +} + static void update_intel_pt_cfg(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -6940,6 +6965,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) if (nested_vmx_allowed(vcpu)) { nested_vmx_cr_fixed1_bits_update(vcpu); nested_vmx_entry_exit_ctls_update(vcpu); + nested_vmx_procbased_ctls_update(vcpu); } if (boot_cpu_has(X86_FEATURE_INTEL_PT) && @@ -7003,6 +7029,7 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) { struct vcpu_vmx *vmx; u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles; + struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer; if (kvm_mwait_in_guest(vcpu->kvm)) return -EOPNOTSUPP; @@ -7011,7 +7038,8 @@ static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc) tscl = rdtsc(); guest_tscl = kvm_read_l1_tsc(vcpu, tscl); delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl; - lapic_timer_advance_cycles = nsec_to_cycles(vcpu, lapic_timer_advance_ns); + lapic_timer_advance_cycles = nsec_to_cycles(vcpu, + ktimer->timer_advance_ns); if (delta_tsc > lapic_timer_advance_cycles) delta_tsc -= lapic_timer_advance_cycles; @@ -7369,7 +7397,7 @@ static int vmx_pre_enter_smm(struct 
kvm_vcpu *vcpu, char *smstate) return 0; } -static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) +static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate) { struct vcpu_vmx *vmx = to_vmx(vcpu); int ret; @@ -7380,9 +7408,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase) } if (vmx->nested.smm.guest_mode) { - vcpu->arch.hflags &= ~HF_SMM_MASK; ret = nested_vmx_enter_non_root_mode(vcpu, false); - vcpu->arch.hflags |= HF_SMM_MASK; if (ret) return ret; diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index a1e00d0a2482..f879529906b4 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -517,4 +517,6 @@ static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx) vmcs_write64(TSC_MULTIPLIER, vmx->current_tsc_ratio); } +void dump_vmcs(void); + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 099b851dabaf..d75bb97b983c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -136,10 +136,14 @@ EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio); static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); -/* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int __read_mostly lapic_timer_advance_ns = 1000; +/* + * lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables + * adaptive tuning starting from default advancement of 1000ns. '0' disables + * advancement entirely. Any other value is used as-is and disables adaptive + * tuning, i.e. allows privileged userspace to set an exact advancement time. + */ +static int __read_mostly lapic_timer_advance_ns = -1; module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR); -EXPORT_SYMBOL_GPL(lapic_timer_advance_ns); static bool __read_mostly vector_hashing = true; module_param(vector_hashing, bool, S_IRUGO); @@ -800,7 +804,7 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) } EXPORT_SYMBOL_GPL(kvm_lmsw); -static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) { if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) && !vcpu->guest_xcr0_loaded) { @@ -810,8 +814,9 @@ static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu) vcpu->guest_xcr0_loaded = 1; } } +EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0); -static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) +void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) { if (vcpu->guest_xcr0_loaded) { if (vcpu->arch.xcr0 != host_xcr0) @@ -819,6 +824,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) vcpu->guest_xcr0_loaded = 0; } } +EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0); static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr) { @@ -3093,7 +3099,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) break; case KVM_CAP_NESTED_STATE: r = kvm_x86_ops->get_nested_state ? 
- kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0; + kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0; break; default: break; @@ -3528,7 +3534,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, memset(&events->reserved, 0, sizeof(events->reserved)); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags); +static void kvm_smm_changed(struct kvm_vcpu *vcpu); static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) @@ -3588,12 +3594,13 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, vcpu->arch.apic->sipi_vector = events->sipi_vector; if (events->flags & KVM_VCPUEVENT_VALID_SMM) { - u32 hflags = vcpu->arch.hflags; - if (events->smi.smm) - hflags |= HF_SMM_MASK; - else - hflags &= ~HF_SMM_MASK; - kvm_set_hflags(vcpu, hflags); + if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { + if (events->smi.smm) + vcpu->arch.hflags |= HF_SMM_MASK; + else + vcpu->arch.hflags &= ~HF_SMM_MASK; + kvm_smm_changed(vcpu); + } vcpu->arch.smi_pending = events->smi.pending; @@ -3674,15 +3681,15 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *src = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *src = get_xsave_addr(xsave, xfeature_nr); if (src) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(dest + offset, &vcpu->arch.pkru, sizeof(vcpu->arch.pkru)); else @@ -3690,7 +3697,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu) } - valid -= feature; + valid -= xfeature_mask; } } @@ -3717,22 +3724,22 @@ static void load_xsave(struct kvm_vcpu *vcpu, u8 *src) */ valid = xstate_bv & ~XFEATURE_MASK_FPSSE; while (valid) { - u64 feature = valid & -valid; - int index = fls64(feature) - 1; - void *dest = get_xsave_addr(xsave, feature); + u64 xfeature_mask = valid & -valid; + int xfeature_nr = fls64(xfeature_mask) - 1; + void *dest = get_xsave_addr(xsave, xfeature_nr); if (dest) { u32 size, offset, ecx, edx; - cpuid_count(XSTATE_CPUID, index, + cpuid_count(XSTATE_CPUID, xfeature_nr, &size, &offset, &ecx, &edx); - if (feature == XFEATURE_MASK_PKRU) + if (xfeature_nr == XFEATURE_PKRU) memcpy(&vcpu->arch.pkru, src + offset, sizeof(vcpu->arch.pkru)); else memcpy(dest, src + offset, size); } - valid -= feature; + valid -= xfeature_mask; } } @@ -4270,7 +4277,7 @@ static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, } static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, - u32 kvm_nr_mmu_pages) + unsigned long kvm_nr_mmu_pages) { if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) return -EINVAL; @@ -4284,7 +4291,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, return 0; } -static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) +static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) { return kvm->arch.n_max_mmu_pages; } @@ -5958,12 +5965,18 @@ static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) { - kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); + emul_to_vcpu(ctxt)->arch.hflags = emul_flags; +} + +static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, + const char *smstate) +{ + 
return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate); } -static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt, u64 smbase) +static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt) { - return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smbase); + kvm_smm_changed(emul_to_vcpu(ctxt)); } static const struct x86_emulate_ops emulate_ops = { @@ -6006,6 +6019,7 @@ static const struct x86_emulate_ops emulate_ops = { .get_hflags = emulator_get_hflags, .set_hflags = emulator_set_hflags, .pre_leave_smm = emulator_pre_leave_smm, + .post_leave_smm = emulator_post_leave_smm, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -6247,16 +6261,6 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); } -static void kvm_set_hflags(struct kvm_vcpu *vcpu, unsigned emul_flags) -{ - unsigned changed = vcpu->arch.hflags ^ emul_flags; - - vcpu->arch.hflags = emul_flags; - - if (changed & HF_SMM_MASK) - kvm_smm_changed(vcpu); -} - static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, unsigned long *db) { @@ -6535,6 +6539,12 @@ int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); +static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pio.count = 0; + return 1; +} + static int complete_fast_pio_out(struct kvm_vcpu *vcpu) { vcpu->arch.pio.count = 0; @@ -6551,12 +6561,23 @@ static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, size, port, &val, 1); + if (ret) + return ret; - if (!ret) { + /* + * Work around userspace that relies on old KVM behavior of %rip being + * incremented prior to exiting to userspace to handle "OUT 0x7e". 
+ */ + if (port == 0x7e && + kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) { + vcpu->arch.complete_userspace_io = + complete_fast_pio_out_port_0x7e; + kvm_skip_emulated_instruction(vcpu); + } else { vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); vcpu->arch.complete_userspace_io = complete_fast_pio_out; } - return ret; + return 0; } static int complete_fast_pio_in(struct kvm_vcpu *vcpu) @@ -7441,9 +7462,9 @@ static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf) put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase); } +#ifdef CONFIG_X86_64 static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) { -#ifdef CONFIG_X86_64 struct desc_ptr dt; struct kvm_segment seg; unsigned long val; @@ -7493,10 +7514,8 @@ static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf) for (i = 0; i < 6; i++) enter_smm_save_seg_64(vcpu, buf, i); -#else - WARN_ON_ONCE(1); -#endif } +#endif static void enter_smm(struct kvm_vcpu *vcpu) { @@ -7507,9 +7526,11 @@ static void enter_smm(struct kvm_vcpu *vcpu) trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true); memset(buf, 0, 512); +#ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) enter_smm_save_state_64(vcpu, buf); else +#endif enter_smm_save_state_32(vcpu, buf); /* @@ -7567,8 +7588,10 @@ static void enter_smm(struct kvm_vcpu *vcpu) kvm_set_segment(vcpu, &ds, VCPU_SREG_GS); kvm_set_segment(vcpu, &ds, VCPU_SREG_SS); +#ifdef CONFIG_X86_64 if (guest_cpuid_has(vcpu, X86_FEATURE_LM)) kvm_x86_ops->set_efer(vcpu, 0); +#endif kvm_update_cpuid(vcpu); kvm_mmu_reset_context(vcpu); @@ -7865,18 +7888,21 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) goto cancel_injection; } - kvm_load_guest_xcr0(vcpu); - if (req_immediate_exit) { kvm_make_request(KVM_REQ_EVENT, vcpu); kvm_x86_ops->request_immediate_exit(vcpu); } trace_kvm_entry(vcpu->vcpu_id); - if (lapic_timer_advance_ns) + if (lapic_in_kernel(vcpu) && + vcpu->arch.apic->lapic_timer.timer_advance_ns) wait_lapic_expire(vcpu); guest_enter_irqoff(); + fpregs_assert_state_consistent(); + if (test_thread_flag(TIF_NEED_FPU_LOAD)) + switch_fpu_return(); + if (unlikely(vcpu->arch.switch_db_regs)) { set_debugreg(0, 7); set_debugreg(vcpu->arch.eff_db[0], 0); @@ -7919,8 +7945,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); - kvm_put_guest_xcr0(vcpu); - kvm_before_interrupt(vcpu); kvm_x86_ops->handle_external_intr(vcpu); kvm_after_interrupt(vcpu); @@ -8137,22 +8161,30 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu) /* Swap (qemu) user FPU context for the guest FPU context. */ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(&current->thread.fpu); /* PKRU is separately restored in kvm_x86_ops->run. */ __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state, ~XFEATURE_MASK_PKRU); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + trace_kvm_fpu(1); } /* When vcpu_run ends, restore user space FPU context. 
*/ static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - preempt_disable(); + fpregs_lock(); + copy_fpregs_to_fpstate(vcpu->arch.guest_fpu); copy_kernel_to_fpregs(&current->thread.fpu.state); - preempt_enable(); + + fpregs_mark_activate(); + fpregs_unlock(); + ++vcpu->stat.fpu_reload; trace_kvm_fpu(0); } @@ -8850,11 +8882,11 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (init_event) kvm_put_guest_fpu(vcpu); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDREGS); + XFEATURE_BNDREGS); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state)); mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave, - XFEATURE_MASK_BNDCSR); + XFEATURE_BNDCSR); if (mpx_state_buffer) memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); if (init_event) @@ -9063,7 +9095,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu); - r = kvm_create_lapic(vcpu); + r = kvm_create_lapic(vcpu, lapic_timer_advance_ns); if (r < 0) goto fail_mmu_destroy; } else diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 28406aa1136d..534d3f28bb01 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -294,8 +294,6 @@ extern u64 kvm_supported_xcr0(void); extern unsigned int min_timer_period_us; -extern unsigned int lapic_timer_advance_ns; - extern bool enable_vmware_backdoor; extern struct static_key kvm_no_apic_vcpu; @@ -347,4 +345,6 @@ static inline void kvm_after_interrupt(struct kvm_vcpu *vcpu) __this_cpu_write(current_vcpu, NULL); } +void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu); +void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 140e61843a07..5246db42de45 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -6,6 +6,18 @@ # Produces uninteresting flaky coverage. 
KCOV_INSTRUMENT_delay.o := n +# Early boot use of cmdline; don't instrument it +ifdef CONFIG_AMD_MEM_ENCRYPT +KCOV_INSTRUMENT_cmdline.o := n +KASAN_SANITIZE_cmdline.o := n + +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_cmdline.o = -pg +endif + +CFLAGS_cmdline.o := $(call cc-option, -fno-stack-protector) +endif + inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ @@ -23,7 +35,6 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o lib-y := delay.o misc.o cmdline.o cpu.o lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o -lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index db4e5aa0858b..b2f1822084ae 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,6 +16,30 @@ #include <asm/smap.h> #include <asm/export.h> +.macro ALIGN_DESTINATION + /* check for bad alignment of destination */ + movl %edi,%ecx + andl $7,%ecx + jz 102f /* already aligned */ + subl $8,%ecx + negl %ecx + subl %ecx,%edx +100: movb (%rsi),%al +101: movb %al,(%rdi) + incq %rsi + incq %rdi + decl %ecx + jnz 100b +102: + .section .fixup,"ax" +103: addl %ecx,%edx /* ecx is zerorest also */ + jmp copy_user_handle_tail + .previous + + _ASM_EXTABLE_UA(100b, 103b) + _ASM_EXTABLE_UA(101b, 103b) + .endm + /* * copy_user_generic_unrolled - memory copy with exception handling. * This version is for CPUs like P4 that don't have efficient micro @@ -194,6 +218,30 @@ ENDPROC(copy_user_enhanced_fast_string) EXPORT_SYMBOL(copy_user_enhanced_fast_string) /* + * Try to copy last bytes and clear the rest if needed. + * Since protection fault in copy_from/to_user is not a normal situation, + * it is not necessary to optimize tail handling. + * + * Input: + * rdi destination + * rsi source + * rdx count + * + * Output: + * eax uncopied bytes or 0 if successful. + */ +ALIGN; +copy_user_handle_tail: + movl %edx,%ecx +1: rep movsb +2: mov %ecx,%eax + ASM_CLAC + ret + + _ASM_EXTABLE_UA(1b, 2b) +ENDPROC(copy_user_handle_tail) + +/* * copy_user_nocache - Uncached memory copy with exception handling * This will force destination out of cache for more performance. * diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f5b7f1b3b6d7..b7375dc6898f 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -162,7 +162,7 @@ void __delay(unsigned long loops) } EXPORT_SYMBOL(__delay); -void __const_udelay(unsigned long xloops) +noinline void __const_udelay(unsigned long xloops) { unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy; int d0; diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c index 3cdf06128d13..be5b5fb1598b 100644 --- a/arch/x86/lib/error-inject.c +++ b/arch/x86/lib/error-inject.c @@ -6,6 +6,7 @@ asmlinkage void just_return_func(void); asm( + ".text\n" ".type just_return_func, @function\n" ".globl just_return_func\n" "just_return_func:\n" diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 3b24dc05251c..9d05572370ed 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -257,6 +257,7 @@ ENTRY(__memcpy_mcsafe) /* Copy successful. 
Return zero */ .L_done_memcpy_trap: xorl %eax, %eax +.L_done: ret ENDPROC(__memcpy_mcsafe) EXPORT_SYMBOL_GPL(__memcpy_mcsafe) @@ -273,7 +274,7 @@ EXPORT_SYMBOL_GPL(__memcpy_mcsafe) addl %edx, %ecx .E_trailing_bytes: mov %ecx, %eax - ret + jmp .L_done /* * For write fault handling, given the destination is unaligned, diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S deleted file mode 100644 index dc2ab6ea6768..000000000000 --- a/arch/x86/lib/rwsem.S +++ /dev/null @@ -1,156 +0,0 @@ -/* - * x86 semaphore implementation. - * - * (C) Copyright 1999 Linus Torvalds - * - * Portions Copyright 1999 Red Hat, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org> - */ - -#include <linux/linkage.h> -#include <asm/alternative-asm.h> -#include <asm/frame.h> - -#define __ASM_HALF_REG(reg) __ASM_SEL(reg, e##reg) -#define __ASM_HALF_SIZE(inst) __ASM_SEL(inst##w, inst##l) - -#ifdef CONFIG_X86_32 - -/* - * The semaphore operations have a special calling sequence that - * allow us to do a simpler in-line version of them. These routines - * need to convert that sequence back into the C sequence when - * there is contention on the semaphore. - * - * %eax contains the semaphore pointer on entry. Save the C-clobbered - * registers (%eax, %edx and %ecx) except %eax which is either a return - * value or just gets clobbered. Same is true for %edx so make sure GCC - * reloads it after the slow path, by making it hold a temporary, for - * example see ____down_write(). - */ - -#define save_common_regs \ - pushl %ecx - -#define restore_common_regs \ - popl %ecx - - /* Avoid uglifying the argument copying x86-64 needs to do. */ - .macro movq src, dst - .endm - -#else - -/* - * x86-64 rwsem wrappers - * - * This interfaces the inline asm code to the slow-path - * C routines. We need to save the call-clobbered regs - * that the asm does not mark as clobbered, and move the - * argument from %rax to %rdi. - * - * NOTE! We don't need to save %rax, because the functions - * will always return the semaphore pointer in %rax (which - * is also the input argument to these helpers) - * - * The following can clobber %rdx because the asm clobbers it: - * call_rwsem_down_write_failed - * call_rwsem_wake - * but %rdi, %rsi, %rcx, %r8-r11 always need saving. 
- */ - -#define save_common_regs \ - pushq %rdi; \ - pushq %rsi; \ - pushq %rcx; \ - pushq %r8; \ - pushq %r9; \ - pushq %r10; \ - pushq %r11 - -#define restore_common_regs \ - popq %r11; \ - popq %r10; \ - popq %r9; \ - popq %r8; \ - popq %rcx; \ - popq %rsi; \ - popq %rdi - -#endif - -/* Fix up special calling conventions */ -ENTRY(call_rwsem_down_read_failed) - FRAME_BEGIN - save_common_regs - __ASM_SIZE(push,) %__ASM_REG(dx) - movq %rax,%rdi - call rwsem_down_read_failed - __ASM_SIZE(pop,) %__ASM_REG(dx) - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_read_failed) - -ENTRY(call_rwsem_down_read_failed_killable) - FRAME_BEGIN - save_common_regs - __ASM_SIZE(push,) %__ASM_REG(dx) - movq %rax,%rdi - call rwsem_down_read_failed_killable - __ASM_SIZE(pop,) %__ASM_REG(dx) - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_read_failed_killable) - -ENTRY(call_rwsem_down_write_failed) - FRAME_BEGIN - save_common_regs - movq %rax,%rdi - call rwsem_down_write_failed - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_write_failed) - -ENTRY(call_rwsem_down_write_failed_killable) - FRAME_BEGIN - save_common_regs - movq %rax,%rdi - call rwsem_down_write_failed_killable - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_down_write_failed_killable) - -ENTRY(call_rwsem_wake) - FRAME_BEGIN - /* do nothing if still outstanding active readers */ - __ASM_HALF_SIZE(dec) %__ASM_HALF_REG(dx) - jnz 1f - save_common_regs - movq %rax,%rdi - call rwsem_wake - restore_common_regs -1: FRAME_END - ret -ENDPROC(call_rwsem_wake) - -ENTRY(call_rwsem_downgrade_wake) - FRAME_BEGIN - save_common_regs - __ASM_SIZE(push,) %__ASM_REG(dx) - movq %rax,%rdi - call rwsem_downgrade_wake - __ASM_SIZE(pop,) %__ASM_REG(dx) - restore_common_regs - FRAME_END - ret -ENDPROC(call_rwsem_downgrade_wake) diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index ee42bb0cbeb3..9952a01cad24 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -55,26 +55,6 @@ unsigned long clear_user(void __user *to, unsigned long n) EXPORT_SYMBOL(clear_user); /* - * Try to copy last bytes and clear the rest if needed. - * Since protection fault in copy_from/to_user is not a normal situation, - * it is not necessary to optimize tail handling. - */ -__visible unsigned long -copy_user_handle_tail(char *to, char *from, unsigned len) -{ - for (; len; --len, to++) { - char c; - - if (__get_user_nocheck(c, from++, sizeof(char))) - break; - if (__put_user_nocheck(c, to, sizeof(char))) - break; - } - clac(); - return len; -} - -/* * Similar to copy_user_handle_tail, probe for the write fault point, * but reuse __memcpy_mcsafe in case a new read error is encountered. * clac() is handled in _copy_to_iter_mcsafe(). 
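A brief aside on the usercopy change above: the C copy_user_handle_tail() removed from usercopy_64.c and the assembly version added to copy_user_64.S implement the same contract, namely copy as much of the remaining tail as possible and return the number of bytes left uncopied. The sketch below is a hypothetical user-space C model of that contract (the names and the fault stand-in are invented for illustration; this is not kernel code):

#include <stdio.h>

/*
 * Model of the tail-handling contract: copy byte by byte, stop at the
 * first "fault", and report how many bytes were left uncopied.
 */
static int copy_byte(char *to, const char *from)
{
	*to = *from;	/* stand-in; the kernel traps real faults here */
	return 0;	/* 0 = success, nonzero would signal a fault */
}

static unsigned long model_handle_tail(char *to, const char *from,
				       unsigned long len)
{
	for (; len; --len, to++, from++) {
		if (copy_byte(to, from))
			break;
	}
	return len;	/* 0 means the whole tail was copied */
}

int main(void)
{
	char src[8] = "tail", dst[8] = { 0 };

	printf("uncopied: %lu, dst: %s\n",
	       model_handle_tail(dst, src, sizeof(src)), dst);
	return 0;
}

In the assembly version shown earlier, the same contract falls out of the exception table entry _ASM_EXTABLE_UA(1b, 2b): a fault during the "rep movsb" at label 1 resumes at label 2, where the remaining byte count in %ecx becomes the return value in %eax.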
diff --git a/arch/x86/math-emu/fpu_entry.c b/arch/x86/math-emu/fpu_entry.c index 9e2ba7e667f6..a873da6b46d6 100644 --- a/arch/x86/math-emu/fpu_entry.c +++ b/arch/x86/math-emu/fpu_entry.c @@ -113,9 +113,6 @@ void math_emulate(struct math_emu_info *info) unsigned long code_base = 0; unsigned long code_limit = 0; /* Initialized to stop compiler warnings */ struct desc_struct code_descriptor; - struct fpu *fpu = ¤t->thread.fpu; - - fpu__initialize(fpu); #ifdef RE_ENTRANT_CHECKING if (emulating) { diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c index 19c6abf9ea31..752ad11d6868 100644 --- a/arch/x86/mm/cpu_entry_area.c +++ b/arch/x86/mm/cpu_entry_area.c @@ -13,8 +13,8 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage); #ifdef CONFIG_X86_64 -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); +static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks); +DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks); #endif struct cpu_entry_area *get_cpu_entry_area(int cpu) @@ -52,10 +52,10 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot) cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot); } -static void __init percpu_setup_debug_store(int cpu) +static void __init percpu_setup_debug_store(unsigned int cpu) { #ifdef CONFIG_CPU_SUP_INTEL - int npages; + unsigned int npages; void *cea; if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) @@ -78,9 +78,43 @@ static void __init percpu_setup_debug_store(int cpu) #endif } +#ifdef CONFIG_X86_64 + +#define cea_map_stack(name) do { \ + npages = sizeof(estacks->name## _stack) / PAGE_SIZE; \ + cea_map_percpu_pages(cea->estacks.name## _stack, \ + estacks->name## _stack, npages, PAGE_KERNEL); \ + } while (0) + +static void __init percpu_setup_exception_stacks(unsigned int cpu) +{ + struct exception_stacks *estacks = per_cpu_ptr(&exception_stacks, cpu); + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); + unsigned int npages; + + BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); + + per_cpu(cea_exception_stacks, cpu) = &cea->estacks; + + /* + * The exceptions stack mappings in the per cpu area are protected + * by guard pages so each stack must be mapped separately. DB2 is + * not mapped; it just exists to catch triple nesting of #DB. + */ + cea_map_stack(DF); + cea_map_stack(NMI); + cea_map_stack(DB1); + cea_map_stack(DB); + cea_map_stack(MCE); +} +#else +static inline void percpu_setup_exception_stacks(unsigned int cpu) {} +#endif + /* Setup the fixmap mappings only once per-processor */ -static void __init setup_cpu_entry_area(int cpu) +static void __init setup_cpu_entry_area(unsigned int cpu) { + struct cpu_entry_area *cea = get_cpu_entry_area(cpu); #ifdef CONFIG_X86_64 /* On 64-bit systems, we use a read-only fixmap GDT and TSS. 
*/ pgprot_t gdt_prot = PAGE_KERNEL_RO; @@ -101,10 +135,9 @@ static void __init setup_cpu_entry_area(int cpu) pgprot_t tss_prot = PAGE_KERNEL; #endif - cea_set_pte(&get_cpu_entry_area(cpu)->gdt, get_cpu_gdt_paddr(cpu), - gdt_prot); + cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->entry_stack_page, + cea_map_percpu_pages(&cea->entry_stack_page, per_cpu_ptr(&entry_stack_storage, cpu), 1, PAGE_KERNEL); @@ -128,22 +161,15 @@ static void __init setup_cpu_entry_area(int cpu) BUILD_BUG_ON((offsetof(struct tss_struct, x86_tss) ^ offsetofend(struct tss_struct, x86_tss)) & PAGE_MASK); BUILD_BUG_ON(sizeof(struct tss_struct) % PAGE_SIZE != 0); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->tss, - &per_cpu(cpu_tss_rw, cpu), + cea_map_percpu_pages(&cea->tss, &per_cpu(cpu_tss_rw, cpu), sizeof(struct tss_struct) / PAGE_SIZE, tss_prot); #ifdef CONFIG_X86_32 - per_cpu(cpu_entry_area, cpu) = get_cpu_entry_area(cpu); + per_cpu(cpu_entry_area, cpu) = cea; #endif -#ifdef CONFIG_X86_64 - BUILD_BUG_ON(sizeof(exception_stacks) % PAGE_SIZE != 0); - BUILD_BUG_ON(sizeof(exception_stacks) != - sizeof(((struct cpu_entry_area *)0)->exception_stacks)); - cea_map_percpu_pages(&get_cpu_entry_area(cpu)->exception_stacks, - &per_cpu(exception_stacks, cpu), - sizeof(exception_stacks) / PAGE_SIZE, PAGE_KERNEL); -#endif + percpu_setup_exception_stacks(cpu); + percpu_setup_debug_store(cpu); } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index ee8f8ab46941..6a7302d1161f 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -259,7 +259,8 @@ static void note_wx(struct pg_state *st) #endif /* Account the WX pages */ st->wx_pages += npages; - WARN_ONCE(1, "x86/mm: Found insecure W+X mapping at address %pS\n", + WARN_ONCE(__supported_pte_mask & _PAGE_NX, + "x86/mm: Found insecure W+X mapping at address %pS\n", (void *)st->start_address); } @@ -577,7 +578,7 @@ void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd) void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user) { #ifdef CONFIG_PAGE_TABLE_ISOLATION - if (user && static_cpu_has(X86_FEATURE_PTI)) + if (user && boot_cpu_has(X86_FEATURE_PTI)) pgd = kernel_to_user_pgdp(pgd); #endif ptdump_walk_pgd_level_core(m, pgd, false, false); @@ -590,7 +591,7 @@ void ptdump_walk_user_pgd_level_checkwx(void) pgd_t *pgd = INIT_PGD; if (!(__supported_pte_mask & _PAGE_NX) || - !static_cpu_has(X86_FEATURE_PTI)) + !boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("x86/mm: Checking user space page tables\n"); diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 3c4568f8fb28..b0a2de8d2f9e 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -145,7 +145,7 @@ __visible bool ex_handler_rdmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, regs->ip, (void *)regs->ip)) show_stack_regs(regs); @@ -162,7 +162,7 @@ __visible bool ex_handler_wrmsr_unsafe(const struct exception_table_entry *fixup unsigned long error_code, unsigned long fault_addr) { - if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pF)\n", + if (pr_warn_once("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n", (unsigned int)regs->cx, 
(unsigned int)regs->dx, (unsigned int)regs->ax, regs->ip, (void *)regs->ip)) show_stack_regs(regs); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 667f1da36208..46df4c6aae46 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -28,6 +28,7 @@ #include <asm/mmu_context.h> /* vma_pkey() */ #include <asm/efi.h> /* efi_recover_from_page_fault()*/ #include <asm/desc.h> /* store_idt(), ... */ +#include <asm/cpu_entry_area.h> /* exception stack */ #define CREATE_TRACE_POINTS #include <asm/trace/exceptions.h> @@ -359,8 +360,6 @@ static noinline int vmalloc_fault(unsigned long address) if (!(address >= VMALLOC_START && address < VMALLOC_END)) return -1; - WARN_ON_ONCE(in_nmi()); - /* * Copy kernel mappings over when needed. This can also * happen within a race in page table update. In the later @@ -603,24 +602,9 @@ static void show_ldttss(const struct desc_ptr *gdt, const char *name, u16 index) name, index, addr, (desc.limit0 | (desc.limit1 << 16))); } -/* - * This helper function transforms the #PF error_code bits into - * "[PROT] [USER]" type of descriptive, almost human-readable error strings: - */ -static void err_str_append(unsigned long error_code, char *buf, unsigned long mask, const char *txt) -{ - if (error_code & mask) { - if (buf[0]) - strcat(buf, " "); - strcat(buf, txt); - } -} - static void show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address) { - char err_txt[64]; - if (!oops_may_print()) return; @@ -644,31 +628,29 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long ad from_kuid(&init_user_ns, current_uid())); } - pr_alert("BUG: unable to handle kernel %s at %px\n", - address < PAGE_SIZE ? "NULL pointer dereference" : "paging request", - (void *)address); - - err_txt[0] = 0; - - /* - * Note: length of these appended strings including the separation space and the - * zero delimiter must fit into err_txt[]. - */ - err_str_append(error_code, err_txt, X86_PF_PROT, "[PROT]" ); - err_str_append(error_code, err_txt, X86_PF_WRITE, "[WRITE]"); - err_str_append(error_code, err_txt, X86_PF_USER, "[USER]" ); - err_str_append(error_code, err_txt, X86_PF_RSVD, "[RSVD]" ); - err_str_append(error_code, err_txt, X86_PF_INSTR, "[INSTR]"); - err_str_append(error_code, err_txt, X86_PF_PK, "[PK]" ); - - pr_alert("#PF error: %s\n", error_code ? err_txt : "[normal kernel read fault]"); + if (address < PAGE_SIZE && !user_mode(regs)) + pr_alert("BUG: kernel NULL pointer dereference, address: %px\n", + (void *)address); + else + pr_alert("BUG: unable to handle page fault for address: %px\n", + (void *)address); + + pr_alert("#PF: %s %s in %s mode\n", + (error_code & X86_PF_USER) ? "user" : "supervisor", + (error_code & X86_PF_INSTR) ? "instruction fetch" : + (error_code & X86_PF_WRITE) ? "write access" : + "read access", + user_mode(regs) ? "user" : "kernel"); + pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code, + !(error_code & X86_PF_PROT) ? "not-present page" : + (error_code & X86_PF_RSVD) ? "reserved bit violation" : + (error_code & X86_PF_PK) ? "protection keys violation" : + "permissions violation"); if (!(error_code & X86_PF_USER) && user_mode(regs)) { struct desc_ptr idt, gdt; u16 ldtr, tr; - pr_alert("This was a system access from user code\n"); - /* * This can happen for quite a few reasons. The more obvious * ones are faults accessing the GDT, or LDT. 
Perhaps @@ -793,7 +775,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, if (is_vmalloc_addr((void *)address) && (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { - unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *); /* * We're likely to be running with very little stack space * left. It's plausible that we'd hit this condition but diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index f905a2371080..fd10d91a6115 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -5,6 +5,8 @@ #include <linux/memblock.h> #include <linux/swapfile.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> +#include <linux/sched/task.h> #include <asm/set_memory.h> #include <asm/e820/api.h> @@ -22,6 +24,7 @@ #include <asm/hypervisor.h> #include <asm/cpufeature.h> #include <asm/pti.h> +#include <asm/text-patching.h> /* * We need to define the tracepoints somewhere, and tlb.c @@ -701,6 +704,41 @@ void __init init_mem_mapping(void) } /* + * Initialize an mm_struct to be used during poking and a pointer to be used + * during patching. + */ +void __init poking_init(void) +{ + spinlock_t *ptl; + pte_t *ptep; + + poking_mm = copy_init_mm(); + BUG_ON(!poking_mm); + + /* + * Randomize the poking address, but make sure that the following page + * will be mapped at the same PMD. We need 2 pages, so find space for 3, + * and adjust the address if the PMD ends after the first one. + */ + poking_addr = TASK_UNMAPPED_BASE; + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + poking_addr += (kaslr_get_random_long("Poking") & PAGE_MASK) % + (TASK_SIZE - TASK_UNMAPPED_BASE - 3 * PAGE_SIZE); + + if (((poking_addr + PAGE_SIZE) & ~PMD_MASK) == 0) + poking_addr += PAGE_SIZE; + + /* + * We need to trigger the allocation of the page-tables that will be + * needed for poking now. Later, poking may be performed in an atomic + * section, which might cause allocation to fail. + */ + ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + BUG_ON(!ptep); + pte_unmap_unlock(ptep, ptl); +} + +/* * devmem_is_allowed() checks to see if /dev/mem access to a certain address * is valid. The argument is a physical page number. * @@ -766,6 +804,11 @@ void free_init_pages(const char *what, unsigned long begin, unsigned long end) if (debug_pagealloc_enabled()) { pr_info("debug: unmapping init [mem %#010lx-%#010lx]\n", begin, end - 1); + /* + * Inform kmemleak about the hole in the memory since the + * corresponding pages will be unmapped. 
+ */ + kmemleak_free_part((void *)begin, end - begin); set_memory_np(begin, (end - begin) >> PAGE_SHIFT); } else { /* diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 0029604af8a4..dd73d5d74393 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -825,7 +825,7 @@ void __init __early_set_fixmap(enum fixed_addresses idx, pte = early_ioremap_pte(addr); /* Sanitize 'prot' against any unsupported bits: */ - pgprot_val(flags) &= __default_kernel_pte_mask; + pgprot_val(flags) &= __supported_pte_mask; if (pgprot_val(flags)) set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 3f452ffed7e9..dc3f058bdf9b 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -94,7 +94,7 @@ void __init kernel_randomize_memory(void) if (!kaslr_memory_enabled()) return; - kaslr_regions[0].size_tb = 1 << (__PHYSICAL_MASK_SHIFT - TB_SHIFT); + kaslr_regions[0].size_tb = 1 << (MAX_PHYSMEM_BITS - TB_SHIFT); kaslr_regions[1].size_tb = VMALLOC_SIZE_TB; /* @@ -125,10 +125,7 @@ void __init kernel_randomize_memory(void) */ entropy = remain_entropy / (ARRAY_SIZE(kaslr_regions) - i); prandom_bytes_state(&rand_state, &rand, sizeof(rand)); - if (pgtable_l5_enabled()) - entropy = (rand % (entropy + 1)) & P4D_MASK; - else - entropy = (rand % (entropy + 1)) & PUD_MASK; + entropy = (rand % (entropy + 1)) & PUD_MASK; vaddr += entropy; *kaslr_regions[i].base = vaddr; @@ -137,84 +134,71 @@ void __init kernel_randomize_memory(void) * randomization alignment. */ vaddr += get_padding(&kaslr_regions[i]); - if (pgtable_l5_enabled()) - vaddr = round_up(vaddr + 1, P4D_SIZE); - else - vaddr = round_up(vaddr + 1, PUD_SIZE); + vaddr = round_up(vaddr + 1, PUD_SIZE); remain_entropy -= entropy; } } static void __meminit init_trampoline_pud(void) { - unsigned long paddr, paddr_next; + pud_t *pud_page_tramp, *pud, *pud_tramp; + p4d_t *p4d_page_tramp, *p4d, *p4d_tramp; + unsigned long paddr, vaddr; pgd_t *pgd; - pud_t *pud_page, *pud_page_tramp; - int i; pud_page_tramp = alloc_low_page(); + /* + * There are two mappings for the low 1MB area, the direct mapping + * and the 1:1 mapping for the real mode trampoline: + * + * Direct mapping: virt_addr = phys_addr + PAGE_OFFSET + * 1:1 mapping: virt_addr = phys_addr + */ paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - pud_page = (pud_t *) pgd_page_vaddr(*pgd); - - for (i = pud_index(paddr); i < PTRS_PER_PUD; i++, paddr = paddr_next) { - pud_t *pud, *pud_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + vaddr = (unsigned long)__va(paddr); + pgd = pgd_offset_k(vaddr); - pud_tramp = pud_page_tramp + pud_index(paddr); - pud = pud_page + pud_index(vaddr); - paddr_next = (paddr & PUD_MASK) + PUD_SIZE; - - *pud_tramp = *pud; - } + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); -} - -static void __meminit init_trampoline_p4d(void) -{ - unsigned long paddr, paddr_next; - pgd_t *pgd; - p4d_t *p4d_page, *p4d_page_tramp; - int i; + pud_tramp = pud_page_tramp + pud_index(paddr); + *pud_tramp = *pud; - p4d_page_tramp = alloc_low_page(); - - paddr = 0; - pgd = pgd_offset_k((unsigned long)__va(paddr)); - p4d_page = (p4d_t *) pgd_page_vaddr(*pgd); - - for (i = p4d_index(paddr); i < PTRS_PER_P4D; i++, paddr = paddr_next) { - p4d_t *p4d, *p4d_tramp; - unsigned long vaddr = (unsigned long)__va(paddr); + if (pgtable_l5_enabled()) { + p4d_page_tramp = alloc_low_page(); p4d_tramp = p4d_page_tramp + p4d_index(paddr); - p4d 
= p4d_page + p4d_index(vaddr); - paddr_next = (paddr & P4D_MASK) + P4D_SIZE; - *p4d_tramp = *p4d; - } + set_p4d(p4d_tramp, + __p4d(_KERNPG_TABLE | __pa(pud_page_tramp))); - set_pgd(&trampoline_pgd_entry, - __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(p4d_page_tramp))); + } else { + set_pgd(&trampoline_pgd_entry, + __pgd(_KERNPG_TABLE | __pa(pud_page_tramp))); + } } /* - * Create PGD aligned trampoline table to allow real mode initialization - * of additional CPUs. Consume only 1 low memory page. + * The real mode trampoline, which is required for bootstrapping CPUs + * occupies only a small area under the low 1MB. See reserve_real_mode() + * for details. + * + * If KASLR is disabled the first PGD entry of the direct mapping is copied + * to map the real mode trampoline. + * + * If KASLR is enabled, copy only the PUD which covers the low 1MB + * area. This limits the randomization granularity to 1GB for both 4-level + * and 5-level paging. */ void __meminit init_trampoline(void) { - if (!kaslr_memory_enabled()) { init_trampoline_default(); return; } - if (pgtable_l5_enabled()) - init_trampoline_p4d(); - else - init_trampoline_pud(); + init_trampoline_pud(); } diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index c805db6236b4..59726aaf4671 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -142,7 +142,7 @@ int mpx_fault_info(struct mpx_fault_info *info, struct pt_regs *regs) goto err_out; } /* get bndregs field from current task's xsave area */ - bndregs = get_xsave_field_ptr(XFEATURE_MASK_BNDREGS); + bndregs = get_xsave_field_ptr(XFEATURE_BNDREGS); if (!bndregs) { err = -EINVAL; goto err_out; @@ -190,7 +190,7 @@ static __user void *mpx_get_bounds_dir(void) * The bounds directory pointer is stored in a register * only accessible if we first do an xsave. 
*/ - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return MPX_INVALID_BOUNDS_DIR; @@ -376,7 +376,7 @@ static int do_mpx_bt_fault(void) const struct mpx_bndcsr *bndcsr; struct mm_struct *mm = current->mm; - bndcsr = get_xsave_field_ptr(XFEATURE_MASK_BNDCSR); + bndcsr = get_xsave_field_ptr(XFEATURE_BNDCSR); if (!bndcsr) return -EINVAL; /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4c570612e24e..daf4d645e537 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -2209,8 +2209,6 @@ int set_pages_rw(struct page *page, int numpages) return set_memory_rw(addr, numpages); } -#ifdef CONFIG_DEBUG_PAGEALLOC - static int __set_pages_p(struct page *page, int numpages) { unsigned long tempaddr = (unsigned long) page_address(page); @@ -2249,6 +2247,16 @@ static int __set_pages_np(struct page *page, int numpages) return __change_page_attr_set_clr(&cpa, 0); } +int set_direct_map_invalid_noflush(struct page *page) +{ + return __set_pages_np(page, 1); +} + +int set_direct_map_default_noflush(struct page *page) +{ + return __set_pages_p(page, 1); +} + void __kernel_map_pages(struct page *page, int numpages, int enable) { if (PageHighMem(page)) @@ -2282,7 +2290,6 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) } #ifdef CONFIG_HIBERNATION - bool kernel_page_present(struct page *page) { unsigned int level; @@ -2294,11 +2301,8 @@ bool kernel_page_present(struct page *page) pte = lookup_address((unsigned long)page_address(page), &level); return (pte_val(*pte) & _PAGE_PRESENT); } - #endif /* CONFIG_HIBERNATION */ -#endif /* CONFIG_DEBUG_PAGEALLOC */ - int __init kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags) { diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 7bd01709a091..1f67b1e15bf6 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ static void pgd_dtor(pgd_t *pgd) * when PTI is enabled. We need them to map the per-process LDT into the * user-space page-table. */ -#define PREALLOCATED_USER_PMDS (static_cpu_has(X86_FEATURE_PTI) ? \ +#define PREALLOCATED_USER_PMDS (boot_cpu_has(X86_FEATURE_PTI) ? \ KERNEL_PGD_PTRS : 0) #define MAX_PREALLOCATED_USER_PMDS KERNEL_PGD_PTRS @@ -292,7 +292,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) #ifdef CONFIG_PAGE_TABLE_ISOLATION - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; pgdp = kernel_to_user_pgdp(pgdp); @@ -373,14 +373,14 @@ static void pgd_prepopulate_user_pmd(struct mm_struct *mm, static struct kmem_cache *pgd_cache; -static int __init pgd_cache_init(void) +void __init pgd_cache_init(void) { /* * When PAE kernel is running as a Xen domain, it does not use * shared kernel pmd. And this requires a whole page for pgd. 
*/ if (!SHARED_KERNEL_PMD) - return 0; + return; /* * when PAE kernel is not running as a Xen domain, it uses @@ -390,9 +390,7 @@ static int __init pgd_cache_init(void) */ pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN, SLAB_PANIC, NULL); - return 0; } -core_initcall(pgd_cache_init); static inline pgd_t *_pgd_alloc(void) { @@ -420,6 +418,10 @@ static inline void _pgd_free(pgd_t *pgd) } #else +void __init pgd_cache_init(void) +{ +} + static inline pgd_t *_pgd_alloc(void) { return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index 047a77f6a10c..1dcfc91c8f0c 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -18,6 +18,7 @@ #include <asm/cpufeature.h> /* boot_cpu_has, ... */ #include <asm/mmu_context.h> /* vma_pkey() */ +#include <asm/fpu/internal.h> /* init_fpstate */ int __execute_only_pkey(struct mm_struct *mm) { @@ -39,17 +40,12 @@ int __execute_only_pkey(struct mm_struct *mm) * dance to set PKRU if we do not need to. Check it * first and assume that if the execute-only pkey is * write-disabled that we do not have to set it - * ourselves. We need preempt off so that nobody - * can make fpregs inactive. + * ourselves. */ - preempt_disable(); if (!need_to_set_mm_pkey && - current->thread.fpu.initialized && !__pkru_allows_read(read_pkru(), execute_only_pkey)) { - preempt_enable(); return execute_only_pkey; } - preempt_enable(); /* * Set up PKRU so that it denies access for everything @@ -131,7 +127,6 @@ int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey * in the process's lifetime will not accidentally get access * to data which is pkey-protected later on. */ -static u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | PKRU_AD_KEY( 4) | PKRU_AD_KEY( 5) | PKRU_AD_KEY( 6) | PKRU_AD_KEY( 7) | PKRU_AD_KEY( 8) | PKRU_AD_KEY( 9) | @@ -148,13 +143,6 @@ void copy_init_pkru_to_fpregs(void) { u32 init_pkru_value_snapshot = READ_ONCE(init_pkru_value); /* - * Any write to PKRU takes it out of the XSAVE 'init - * state' which increases context switch cost. Avoid - * writing 0 when PKRU was already 0. - */ - if (!init_pkru_value_snapshot && !read_pkru()) - return; - /* * Override the PKRU state that came from 'init_fpstate' * with the baseline from the process. 
*/ @@ -174,6 +162,7 @@ static ssize_t init_pkru_read_file(struct file *file, char __user *user_buf, static ssize_t init_pkru_write_file(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { + struct pkru_state *pk; char buf[32]; ssize_t len; u32 new_init_pkru; @@ -196,6 +185,10 @@ static ssize_t init_pkru_write_file(struct file *file, return -EINVAL; WRITE_ONCE(init_pkru_value, new_init_pkru); + pk = get_xsave_addr(&init_fpstate.xsave, XFEATURE_PKRU); + if (!pk) + return -EINVAL; + pk->pkru = new_init_pkru; return count; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 139b28a01ce4..9c2463bc158f 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -35,6 +35,7 @@ #include <linux/spinlock.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <linux/cpu.h> #include <asm/cpufeature.h> #include <asm/hypervisor.h> @@ -115,7 +116,8 @@ void __init pti_check_boottime_disable(void) } } - if (cmdline_find_option_bool(boot_command_line, "nopti")) { + if (cmdline_find_option_bool(boot_command_line, "nopti") || + cpu_mitigations_off()) { pti_mode = PTI_FORCE_OFF; pti_print_if_insecure("disabled on command line."); return; @@ -626,7 +628,7 @@ static void pti_set_kernel_image_nonglobal(void) */ void __init pti_init(void) { - if (!static_cpu_has(X86_FEATURE_PTI)) + if (!boot_cpu_has(X86_FEATURE_PTI)) return; pr_info("enabled\n"); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bc4bc7b2f075..7f61431c75fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -634,7 +634,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f, this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen); } -static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason) +static void flush_tlb_func_local(const void *info, enum tlb_flush_reason reason) { const struct flush_tlb_info *f = info; @@ -722,43 +722,81 @@ void native_flush_tlb_others(const struct cpumask *cpumask, */ unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM +static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx); +#endif + +static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, + unsigned long start, unsigned long end, + unsigned int stride_shift, bool freed_tables, + u64 new_tlb_gen) +{ + struct flush_tlb_info *info = this_cpu_ptr(&flush_tlb_info); + +#ifdef CONFIG_DEBUG_VM + /* + * Ensure that the following code is non-reentrant and flush_tlb_info + * is not overwritten. This means no TLB flushing is initiated by + * interrupt handlers and machine-check exception handlers. + */ + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); +#endif + + info->start = start; + info->end = end; + info->mm = mm; + info->stride_shift = stride_shift; + info->freed_tables = freed_tables; + info->new_tlb_gen = new_tlb_gen; + + return info; +} + +static inline void put_flush_tlb_info(void) +{ +#ifdef CONFIG_DEBUG_VM + /* Complete reentrency prevention checks */ + barrier(); + this_cpu_dec(flush_tlb_info_idx); +#endif +} + void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift, bool freed_tables) { + struct flush_tlb_info *info; + u64 new_tlb_gen; int cpu; - struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { - .mm = mm, - .stride_shift = stride_shift, - .freed_tables = freed_tables, - }; - cpu = get_cpu(); - /* This is also a barrier that synchronizes with switch_mm(). 
*/ - info.new_tlb_gen = inc_mm_tlb_gen(mm); - /* Should we flush just the requested range? */ - if ((end != TLB_FLUSH_ALL) && - ((end - start) >> stride_shift) <= tlb_single_page_flush_ceiling) { - info.start = start; - info.end = end; - } else { - info.start = 0UL; - info.end = TLB_FLUSH_ALL; + if ((end == TLB_FLUSH_ALL) || + ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { + start = 0; + end = TLB_FLUSH_ALL; } + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); + + info = get_flush_tlb_info(mm, start, end, stride_shift, freed_tables, + new_tlb_gen); + if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { - VM_WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_MM_SHOOTDOWN); + flush_tlb_func_local(info, TLB_LOCAL_MM_SHOOTDOWN); local_irq_enable(); } if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), &info); + flush_tlb_others(mm_cpumask(mm), info); + put_flush_tlb_info(); put_cpu(); } @@ -787,38 +825,48 @@ static void do_kernel_range_flush(void *info) void flush_tlb_kernel_range(unsigned long start, unsigned long end) { - /* Balance as user space task's flush, a bit conservative */ if (end == TLB_FLUSH_ALL || (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { on_each_cpu(do_flush_tlb_all, NULL, 1); } else { - struct flush_tlb_info info; - info.start = start; - info.end = end; - on_each_cpu(do_kernel_range_flush, &info, 1); + struct flush_tlb_info *info; + + preempt_disable(); + info = get_flush_tlb_info(NULL, start, end, 0, false, 0); + + on_each_cpu(do_kernel_range_flush, info, 1); + + put_flush_tlb_info(); + preempt_enable(); } } +/* + * arch_tlbbatch_flush() performs a full TLB flush regardless of the active mm. + * This means that the 'struct flush_tlb_info' that describes which mappings to + * flush is actually fixed. We therefore set a single fixed struct and use it in + * arch_tlbbatch_flush(). 
+ */ +static const struct flush_tlb_info full_flush_tlb_info = { + .mm = NULL, + .start = 0, + .end = TLB_FLUSH_ALL, +}; + void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) { - struct flush_tlb_info info = { - .mm = NULL, - .start = 0UL, - .end = TLB_FLUSH_ALL, - }; - int cpu = get_cpu(); if (cpumask_test_cpu(cpu, &batch->cpumask)) { - VM_WARN_ON(irqs_disabled()); + lockdep_assert_irqs_enabled(); local_irq_disable(); - flush_tlb_func_local(&info, TLB_LOCAL_SHOOTDOWN); + flush_tlb_func_local(&full_flush_tlb_info, TLB_LOCAL_SHOOTDOWN); local_irq_enable(); } if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) - flush_tlb_others(&batch->cpumask, &info); + flush_tlb_others(&batch->cpumask, &full_flush_tlb_info); cpumask_clear(&batch->cpumask); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 2c53b0f19329..1297e185b8c8 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -2133,14 +2133,19 @@ static int __init summarize_uvhub_sockets(int nuvhubs, */ static int __init init_per_cpu(int nuvhubs, int base_part_pnode) { - unsigned char *uvhub_mask; struct uvhub_desc *uvhub_descs; + unsigned char *uvhub_mask = NULL; if (is_uv3_hub() || is_uv2_hub() || is_uv1_hub()) timeout_us = calculate_destination_timeout(); uvhub_descs = kcalloc(nuvhubs, sizeof(struct uvhub_desc), GFP_KERNEL); + if (!uvhub_descs) + goto fail; + uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL); + if (!uvhub_mask) + goto fail; if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask)) goto fail; diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index bcddf09b5aa3..4845b8c7be7f 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -90,7 +90,6 @@ static int get_e820_md5(struct e820_table *table, void *buf) } desc->tfm = tfm; - desc->flags = 0; size = offsetof(struct e820_table, entries) + sizeof(struct e820_entry) * table->nr_entries; diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index b629f6992d9f..ce7188cbdae5 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -11,7 +11,9 @@ #define Elf_Shdr ElfW(Shdr) #define Elf_Sym ElfW(Sym) -static Elf_Ehdr ehdr; +static Elf_Ehdr ehdr; +static unsigned long shnum; +static unsigned int shstrndx; struct relocs { uint32_t *offset; @@ -241,9 +243,9 @@ static const char *sec_name(unsigned shndx) { const char *sec_strtab; const char *name; - sec_strtab = secs[ehdr.e_shstrndx].strtab; + sec_strtab = secs[shstrndx].strtab; name = "<noname>"; - if (shndx < ehdr.e_shnum) { + if (shndx < shnum) { name = sec_strtab + secs[shndx].shdr.sh_name; } else if (shndx == SHN_ABS) { @@ -271,7 +273,7 @@ static const char *sym_name(const char *sym_strtab, Elf_Sym *sym) static Elf_Sym *sym_lookup(const char *symname) { int i; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; long nsyms; char *strtab; @@ -366,27 +368,41 @@ static void read_ehdr(FILE *fp) ehdr.e_shnum = elf_half_to_cpu(ehdr.e_shnum); ehdr.e_shstrndx = elf_half_to_cpu(ehdr.e_shstrndx); - if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) { + shnum = ehdr.e_shnum; + shstrndx = ehdr.e_shstrndx; + + if ((ehdr.e_type != ET_EXEC) && (ehdr.e_type != ET_DYN)) die("Unsupported ELF header type\n"); - } - if (ehdr.e_machine != ELF_MACHINE) { + if (ehdr.e_machine != ELF_MACHINE) die("Not for %s\n", ELF_MACHINE_NAME); - } - if (ehdr.e_version != EV_CURRENT) { + if (ehdr.e_version != EV_CURRENT) die("Unknown ELF version\n"); - } - if (ehdr.e_ehsize != 
sizeof(Elf_Ehdr)) { + if (ehdr.e_ehsize != sizeof(Elf_Ehdr)) die("Bad Elf header size\n"); - } - if (ehdr.e_phentsize != sizeof(Elf_Phdr)) { + if (ehdr.e_phentsize != sizeof(Elf_Phdr)) die("Bad program header entry\n"); - } - if (ehdr.e_shentsize != sizeof(Elf_Shdr)) { + if (ehdr.e_shentsize != sizeof(Elf_Shdr)) die("Bad section header entry\n"); + + + if (shnum == SHN_UNDEF || shstrndx == SHN_XINDEX) { + Elf_Shdr shdr; + + if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) + die("Seek to %d failed: %s\n", ehdr.e_shoff, strerror(errno)); + + if (fread(&shdr, sizeof(shdr), 1, fp) != 1) + die("Cannot read initial ELF section header: %s\n", strerror(errno)); + + if (shnum == SHN_UNDEF) + shnum = elf_xword_to_cpu(shdr.sh_size); + + if (shstrndx == SHN_XINDEX) + shstrndx = elf_word_to_cpu(shdr.sh_link); } - if (ehdr.e_shstrndx >= ehdr.e_shnum) { + + if (shstrndx >= shnum) die("String table index out of bounds\n"); - } } static void read_shdrs(FILE *fp) @@ -394,20 +410,20 @@ static void read_shdrs(FILE *fp) int i; Elf_Shdr shdr; - secs = calloc(ehdr.e_shnum, sizeof(struct section)); + secs = calloc(shnum, sizeof(struct section)); if (!secs) { die("Unable to allocate %d section headers\n", - ehdr.e_shnum); + shnum); } if (fseek(fp, ehdr.e_shoff, SEEK_SET) < 0) { die("Seek to %d failed: %s\n", ehdr.e_shoff, strerror(errno)); } - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (fread(&shdr, sizeof(shdr), 1, fp) != 1) die("Cannot read ELF section headers %d/%d: %s\n", - i, ehdr.e_shnum, strerror(errno)); + i, shnum, strerror(errno)); sec->shdr.sh_name = elf_word_to_cpu(shdr.sh_name); sec->shdr.sh_type = elf_word_to_cpu(shdr.sh_type); sec->shdr.sh_flags = elf_xword_to_cpu(shdr.sh_flags); @@ -418,7 +434,7 @@ static void read_shdrs(FILE *fp) sec->shdr.sh_info = elf_word_to_cpu(shdr.sh_info); sec->shdr.sh_addralign = elf_xword_to_cpu(shdr.sh_addralign); sec->shdr.sh_entsize = elf_xword_to_cpu(shdr.sh_entsize); - if (sec->shdr.sh_link < ehdr.e_shnum) + if (sec->shdr.sh_link < shnum) sec->link = &secs[sec->shdr.sh_link]; } @@ -427,7 +443,7 @@ static void read_shdrs(FILE *fp) static void read_strtabs(FILE *fp) { int i; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (sec->shdr.sh_type != SHT_STRTAB) { continue; @@ -452,7 +468,7 @@ static void read_strtabs(FILE *fp) static void read_symtabs(FILE *fp) { int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (sec->shdr.sh_type != SHT_SYMTAB) { continue; @@ -485,7 +501,7 @@ static void read_symtabs(FILE *fp) static void read_relocs(FILE *fp) { int i,j; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; if (sec->shdr.sh_type != SHT_REL_TYPE) { continue; @@ -528,7 +544,7 @@ static void print_absolute_symbols(void) printf("Absolute symbols\n"); printf(" Num: Value Size Type Bind Visibility Name\n"); - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; char *sym_strtab; int j; @@ -566,7 +582,7 @@ static void print_absolute_relocs(void) else format = "%08"PRIx32" %08"PRIx32" %10s %08"PRIx32" %s\n"; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { struct section *sec = &secs[i]; struct section *sec_applies, *sec_symtab; char *sym_strtab; @@ -650,7 +666,7 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, { int i; /* Walk through the relocations */ - for (i 
= 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { char *sym_strtab; Elf_Sym *sh_symtab; struct section *sec_applies, *sec_symtab; @@ -706,7 +722,7 @@ static Elf_Addr per_cpu_load_addr; static void percpu_init(void) { int i; - for (i = 0; i < ehdr.e_shnum; i++) { + for (i = 0; i < shnum; i++) { ElfW(Sym) *sym; if (strcmp(sec_name(i), ".data..percpu")) continue; @@ -738,7 +754,7 @@ static void percpu_init(void) * __per_cpu_load * * The "gold" linker incorrectly associates: - * init_per_cpu__irq_stack_union + * init_per_cpu__fixed_percpu_data * init_per_cpu__gdt_page */ static int is_percpu_sym(ElfW(Sym) *sym, const char *symname) diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index a9e80e44178c..a8985e1f7432 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -32,12 +32,6 @@ config ARCH_DEFCONFIG default "arch/um/configs/i386_defconfig" if X86_32 default "arch/um/configs/x86_64_defconfig" if X86_64 -config RWSEM_XCHGADD_ALGORITHM - def_bool 64BIT - -config RWSEM_GENERIC_SPINLOCK - def_bool !RWSEM_XCHGADD_ALGORITHM - config 3_LEVEL_PGTABLES bool "Three-level pagetables" if !64BIT default 64BIT diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index 2d686ae54681..33c51c064c77 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -21,14 +21,12 @@ obj-y += checksum_32.o syscalls_32.o obj-$(CONFIG_ELF_CORE) += elfcore.o subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o -subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o else obj-y += syscalls_64.o vdso/ -subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o \ - ../lib/rwsem.o +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../entry/thunk_64.o endif diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile index bf94060fc06f..0caddd6acb22 100644 --- a/arch/x86/um/vdso/Makefile +++ b/arch/x86/um/vdso/Makefile @@ -62,7 +62,7 @@ quiet_cmd_vdso = VDSO $@ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +VDSO_LDFLAGS = -fPIC -shared -Wl,--hash-style=sysv GCOV_PROFILE := n # diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index a21e1734fc1f..beb44e22afdf 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -2318,8 +2318,6 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) #elif defined(CONFIG_X86_VSYSCALL_EMULATION) case VSYSCALL_PAGE: #endif - case FIX_TEXT_POKE0: - case FIX_TEXT_POKE1: /* All local page mappings */ pte = pfn_pte(phys, prot); break; diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 0766a08bdf45..07054572297f 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -105,7 +105,7 @@ void xen_mc_flush(void) for (i = 0; i < b->mcidx; i++) { if (b->entries[i].result < 0) { #if MC_DEBUG - pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pF\n", + pr_err(" call %2d: op=%lu arg=[%lx] result=%ld\t%pS\n", i + 1, b->debug[i].op, b->debug[i].args[0], diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c index 145506f9fdbe..590fcf863006 100644 --- a/arch/x86/xen/smp_pv.c +++ b/arch/x86/xen/smp_pv.c @@ -361,7 +361,9 @@ static int xen_pv_cpu_up(unsigned int cpu, struct task_struct *idle) { int rc; - common_cpu_up(cpu, idle); + rc = common_cpu_up(cpu, idle); + if (rc) + return rc; xen_setup_runstate_info(cpu); diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 5077ead5e59c..c1d8b90aa4e2 100644 
--- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -40,13 +40,13 @@ ENTRY(startup_xen) #ifdef CONFIG_X86_64 /* Set up %gs. * - * The base of %gs always points to the bottom of the irqstack - * union. If the stack protector canary is enabled, it is - * located at %gs:40. Note that, on SMP, the boot cpu uses - * init data section till per cpu areas are set up. + * The base of %gs always points to fixed_percpu_data. If the + * stack protector canary is enabled, it is located at %gs:40. + * Note that, on SMP, the boot cpu uses init data section until + * the per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq $INIT_PER_CPU_VAR(irq_stack_union),%rax + movq $INIT_PER_CPU_VAR(fixed_percpu_data),%rax cdq wrmsr #endif diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 4b9aafe766c5..35c8d91e6106 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -46,9 +46,6 @@ config XTENSA with reasonable minimum requirements. The Xtensa Linux project has a home page at <http://www.linux-xtensa.org/>. -config RWSEM_XCHGADD_ALGORITHM - def_bool y - config GENERIC_HWEIGHT def_bool y diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 3843198e03d4..35f83c4bf239 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -20,12 +20,12 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += mmiowb.h generic-y += param.h generic-y += percpu.h generic-y += preempt.h generic-y += qrwlock.h generic-y += qspinlock.h -generic-y += rwsem.h generic-y += sections.h generic-y += socket.h generic-y += topology.h diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index f7dd895b2353..0c14018d1c26 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -187,15 +187,18 @@ struct thread_struct { /* Clearing a0 terminates the backtrace. 
*/ #define start_thread(regs, new_pc, new_sp) \ - memset(regs, 0, sizeof(*regs)); \ - regs->pc = new_pc; \ - regs->ps = USER_PS_VALUE; \ - regs->areg[1] = new_sp; \ - regs->areg[0] = 0; \ - regs->wmask = 1; \ - regs->depc = 0; \ - regs->windowbase = 0; \ - regs->windowstart = 1; + do { \ + memset((regs), 0, sizeof(*(regs))); \ + (regs)->pc = (new_pc); \ + (regs)->ps = USER_PS_VALUE; \ + (regs)->areg[1] = (new_sp); \ + (regs)->areg[0] = 0; \ + (regs)->wmask = 1; \ + (regs)->depc = 0; \ + (regs)->windowbase = 0; \ + (regs)->windowstart = 1; \ + (regs)->syscall = NO_SYSCALL; \ + } while (0) /* Forward declaration */ struct task_struct; diff --git a/arch/xtensa/include/asm/tlb.h b/arch/xtensa/include/asm/tlb.h index 0d766f9c1083..50889935138a 100644 --- a/arch/xtensa/include/asm/tlb.h +++ b/arch/xtensa/include/asm/tlb.h @@ -14,32 +14,6 @@ #include <asm/cache.h> #include <asm/page.h> -#if (DCACHE_WAY_SIZE <= PAGE_SIZE) - -/* Note, read http://lkml.org/lkml/2004/1/15/6 */ - -# define tlb_start_vma(tlb,vma) do { } while (0) -# define tlb_end_vma(tlb,vma) do { } while (0) - -#else - -# define tlb_start_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_cache_range(vma, vma->vm_start, vma->vm_end); \ - } while(0) - -# define tlb_end_vma(tlb, vma) \ - do { \ - if (!tlb->fullmm) \ - flush_tlb_range(vma, vma->vm_start, vma->vm_end); \ - } while(0) - -#endif - -#define __tlb_remove_tlb_entry(tlb,pte,addr) do { } while (0) -#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) - #include <asm-generic/tlb.h> #define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index e50f5124dc6f..e54af8b7e0f8 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -1860,6 +1860,8 @@ ENTRY(system_call) l32i a7, a2, PT_SYSCALL 1: + s32i a7, a1, 4 + /* syscall = sys_call_table[syscall_nr] */ movi a4, sys_call_table @@ -1893,8 +1895,12 @@ ENTRY(system_call) retw 1: + l32i a4, a1, 4 + l32i a3, a2, PT_SYSCALL + s32i a4, a2, PT_SYSCALL mov a6, a2 call4 do_syscall_trace_leave + s32i a3, a2, PT_SYSCALL retw ENDPROC(system_call) diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c index 174c11f13bba..b9f82510c650 100644 --- a/arch/xtensa/kernel/stacktrace.c +++ b/arch/xtensa/kernel/stacktrace.c @@ -253,10 +253,14 @@ static int return_address_cb(struct stackframe *frame, void *data) return 1; } +/* + * level == 0 is for the return address from the caller of this function, + * not from this function itself. 
+ */ unsigned long return_address(unsigned level) { struct return_addr_data r = { - .skip = level + 1, + .skip = level, }; walk_stackframe(stack_pointer(NULL), return_address_cb, &r); return r.addr; } diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 6af49929de85..30084eaf8422 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -394,3 +394,7 @@ 421 common rt_sigtimedwait_time64 sys_rt_sigtimedwait 422 common futex_time64 sys_futex 423 common sched_rr_get_interval_time64 sys_sched_rr_get_interval +424 common pidfd_send_signal sys_pidfd_send_signal +425 common io_uring_setup sys_io_uring_setup +426 common io_uring_enter sys_io_uring_enter +427 common io_uring_register sys_io_uring_register diff --git a/arch/xtensa/mm/mmu.c b/arch/xtensa/mm/mmu.c index 2fb7d1172228..03678c4afc39 100644 --- a/arch/xtensa/mm/mmu.c +++ b/arch/xtensa/mm/mmu.c @@ -33,7 +33,7 @@ static void * __init init_pmd(unsigned long vaddr, unsigned long n_pages) pte = memblock_alloc_low(n_pages * sizeof(pte_t), PAGE_SIZE); if (!pte) - panic("%s: Failed to allocate %zu bytes align=%lx\n", + panic("%s: Failed to allocate %lu bytes align=%lx\n", __func__, n_pages * sizeof(pte_t), PAGE_SIZE); for (i = 0; i < n_pages; ++i)