From c771320e9357c9b85634002daedfe5c8988f27a6 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Thu, 5 Oct 2017 08:44:26 +0200 Subject: s390/mm,kvm: improve detection of KVM guest faults The identification of guest fault currently relies on the PF_VCPU flag. This is set in guest_entry_irqoff and cleared in guest_exit_irqoff. Both functions are called by __vcpu_run, the PF_VCPU flag is set for quite a lot of kernel code outside of the guest execution. Replace the PF_VCPU scheme with the PIF_GUEST_FAULT in the pt_regs and make the program check handler code in entry.S set the bit only for exception that occurred between the .Lsie_gmap and .Lsie_done labels. Reviewed-by: Christian Borntraeger Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- arch/s390/include/asm/ptrace.h | 2 ++ arch/s390/kernel/entry.S | 7 +++++-- arch/s390/mm/fault.c | 6 +++--- 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index 2f84e77f1f1b..a3788dafc0e1 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -13,10 +13,12 @@ #define PIF_SYSCALL 0 /* inside a system call */ #define PIF_PER_TRAP 1 /* deliver sigtrap on return to user */ #define PIF_SYSCALL_RESTART 2 /* restart the current system call */ +#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ #define _PIF_SYSCALL _BITUL(PIF_SYSCALL) #define _PIF_PER_TRAP _BITUL(PIF_PER_TRAP) #define _PIF_SYSCALL_RESTART _BITUL(PIF_SYSCALL_RESTART) +#define _PIF_GUEST_FAULT _BITUL(PIF_GUEST_FAULT) #ifndef __ASSEMBLY__ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index f498d201f98d..ee53ac7b1ab8 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -518,6 +518,7 @@ ENTRY(pgm_check_handler) stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK lg %r12,__LC_CURRENT + lghi %r11,0 larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # test problem state bit @@ -532,6 +533,7 @@ ENTRY(pgm_check_handler) ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit + lghi %r11,_PIF_GUEST_FAULT #endif 0: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 1f # -> enabled, can't be a double fault @@ -549,13 +551,14 @@ ENTRY(pgm_check_handler) jz 3f mvc __THREAD_trap_tdb(256,%r14),0(%r13) 3: stg %r10,__THREAD_last_break(%r14) -4: la %r11,STACK_FRAME_OVERHEAD(%r15) +4: lgr %r13,%r11 + la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + stg %r13,__PT_FLAGS(%r11) stg %r10,__PT_ARGS(%r11) tm __LC_PGM_ILC+3,0x80 # check for per exception jz 5f diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 242b78c0a9ec..be974b3eb7e4 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -117,7 +117,7 @@ static inline int user_space_fault(struct pt_regs *regs) return 1; if (trans_exc_code == 2) /* secondary space -> set_fs */ return current->thread.mm_segment.ar4; - if (current->flags & PF_VCPU) + if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) return 1; return 0; } @@ -209,7 +209,7 @@ static void dump_fault_info(struct pt_regs *regs) pr_cont("kernel "); } #ifdef CONFIG_PGSTE - else if ((current->flags & PF_VCPU) && S390_lowcore.gmap) { + else if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) { struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; asce = gmap->asce; pr_cont("gmap "); @@ -438,7 +438,7 @@ static inline int do_exception(struct pt_regs *regs, int access) down_read(&mm->mmap_sem); #ifdef CONFIG_PGSTE - gmap = (current->flags & PF_VCPU) ? + gmap = test_pt_regs_flag(regs, PIF_GUEST_FAULT) ? (struct gmap *) S390_lowcore.gmap : NULL; if (gmap) { current->thread.gmap_addr = address; -- cgit v1.2.3 From 0aaba41b58bc5f3074c0c0a6136b9500b5e29e19 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Tue, 22 Aug 2017 12:08:22 +0200 Subject: s390: remove all code using the access register mode The vdso code for the getcpu() and the clock_gettime() call use the access register mode to access the per-CPU vdso data page with the current code. An alternative to the complicated AR mode is to use the secondary space mode. This makes the vdso faster and quite a bit simpler. The downside is that the uaccess code has to be changed quite a bit. Which instructions are used depends on the machine and what kind of uaccess operation is requested. The instruction dictates which ASCE value needs to be loaded into %cr1 and %cr7. The different cases: * User copy with MVCOS for z10 and newer machines The MVCOS instruction can copy between the primary space (aka user) and the home space (aka kernel) directly. For set_fs(KERNEL_DS) the kernel ASCE is loaded into %cr1. For set_fs(USER_DS) the user space is already loaded in %cr1. * User copy with MVCP/MVCS for older machines To be able to execute the MVCP/MVCS instructions the kernel needs to switch to primary mode. The control register %cr1 has to be set to the kernel ASCE and %cr7 to either the kernel ASCE or the user ASCE dependent on set_fs(KERNEL_DS) vs set_fs(USER_DS). * Data access in the user address space for strnlen / futex To use "normal" instruction with data from the user address space the secondary space mode is used. The kernel needs to switch to primary mode, %cr1 has to contain the kernel ASCE and %cr7 either the user ASCE or the kernel ASCE, dependent on set_fs. To load a new value into %cr1 or %cr7 is an expensive operation, the kernel tries to be lazy about it. E.g. for multiple user copies in a row with MVCP/MVCS the replacement of the vdso ASCE in %cr7 with the user ASCE is done only once. On return to user space a CPU bit is checked that loads the vdso ASCE again. To enable and disable the data access via the secondary space two new functions are added, enable_sacf_uaccess and disable_sacf_uaccess. The fact that a context is in secondary space uaccess mode is stored in the mm_segment_t value for the task. The code of an interrupt may use set_fs as long as it returns to the previous state it got with get_fs with another call to set_fs. The code in finish_arch_post_lock_switch simply has to do a set_fs with the current mm_segment_t value for the task. For CPUs with MVCOS: CPU running in | %cr1 ASCE | %cr7 ASCE | --------------------------------------|-----------|-----------| user space | user | vdso | kernel, USER_DS, normal-mode | user | vdso | kernel, USER_DS, normal-mode, lazy | user | user | kernel, USER_DS, sacf-mode | kernel | user | kernel, KERNEL_DS, normal-mode | kernel | vdso | kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel | kernel, KERNEL_DS, sacf-mode | kernel | kernel | For CPUs without MVCOS: CPU running in | %cr1 ASCE | %cr7 ASCE | --------------------------------------|-----------|-----------| user space | user | vdso | kernel, USER_DS, normal-mode | user | vdso | kernel, USER_DS, normal-mode lazy | kernel | user | kernel, USER_DS, sacf-mode | kernel | user | kernel, KERNEL_DS, normal-mode | kernel | vdso | kernel, KERNEL_DS, normal-mode, lazy | kernel | kernel | kernel, KERNEL_DS, sacf-mode | kernel | kernel | The lines with "lazy" refer to the state after a copy via the secondary space with a delayed reload of %cr1 and %cr7. There are three hardware address spaces that can cause a DAT exception, primary, secondary and home space. The exception can be related to four different fault types: user space fault, vdso fault, kernel fault, and the gmap faults. Dependent on the set_fs state and normal vs. sacf mode there are a number of fault combinations: 1) user address space fault via the primary ASCE 2) gmap address space fault via the primary ASCE 3) kernel address space fault via the primary ASCE for machines with MVCOS and set_fs(KERNEL_DS) 4) vdso address space faults via the secondary ASCE with an invalid address while running in secondary space in problem state 5) user address space fault via the secondary ASCE for user-copy based on the secondary space mode, e.g. futex_ops or strnlen_user 6) kernel address space fault via the secondary ASCE for user-copy with secondary space mode with set_fs(KERNEL_DS) 7) kernel address space fault via the primary ASCE for user-copy with secondary space mode with set_fs(USER_DS) on machines without MVCOS. 8) kernel address space fault via the home space ASCE Replace user_space_fault() with a new function get_fault_type() that can distinguish all four different fault types. With these changes the futex atomic ops from the kernel and the strnlen_user will get a little bit slower, as well as the old style uaccess with MVCP/MVCS. All user accesses based on MVCOS will be as fast as before. On the positive side, the user space vdso code is a lot faster and Linux ceases to use the complicated AR mode. Reviewed-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Heiko Carstens --- arch/s390/include/asm/futex.h | 9 ++- arch/s390/include/asm/lowcore.h | 33 +++++----- arch/s390/include/asm/mmu_context.h | 36 +++++------ arch/s390/include/asm/processor.h | 4 +- arch/s390/include/asm/uaccess.h | 29 +++------ arch/s390/kernel/asm-offsets.c | 2 +- arch/s390/kernel/entry.S | 26 ++++++-- arch/s390/kernel/head64.S | 2 +- arch/s390/kernel/vdso.c | 44 ++----------- arch/s390/kernel/vdso32/getcpu.S | 16 +---- arch/s390/kernel/vdso64/clock_gettime.S | 19 ++---- arch/s390/kernel/vdso64/getcpu.S | 15 +---- arch/s390/lib/uaccess.c | 90 +++++++++++++++++++++++--- arch/s390/mm/fault.c | 108 +++++++++++++++++++------------- arch/s390/mm/init.c | 1 + arch/s390/mm/pgalloc.c | 4 +- 16 files changed, 228 insertions(+), 210 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/futex.h b/arch/s390/include/asm/futex.h index 9b5a3469fed9..5e97a4353147 100644 --- a/arch/s390/include/asm/futex.h +++ b/arch/s390/include/asm/futex.h @@ -26,9 +26,9 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { int oldval = 0, newval, ret; + mm_segment_t old_fs; - load_kernel_asce(); - + old_fs = enable_sacf_uaccess(); pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -55,6 +55,7 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, ret = -ENOSYS; } pagefault_enable(); + disable_sacf_uaccess(old_fs); if (!ret) *oval = oldval; @@ -65,9 +66,10 @@ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { + mm_segment_t old_fs; int ret; - load_kernel_asce(); + old_fs = enable_sacf_uaccess(); asm volatile( " sacf 256\n" "0: cs %1,%4,0(%5)\n" @@ -77,6 +79,7 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, : "=d" (ret), "+d" (oldval), "=m" (*uaddr) : "0" (-EFAULT), "d" (newval), "a" (uaddr), "m" (*uaddr) : "cc", "memory"); + disable_sacf_uaccess(old_fs); *uval = oldval; return ret; } diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 9eb36a1592c7..2306fa17f6cd 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -115,33 +115,28 @@ struct lowcore { /* Address space pointer. */ __u64 kernel_asce; /* 0x0378 */ __u64 user_asce; /* 0x0380 */ + __u64 vdso_asce; /* 0x0388 */ /* * The lpp and current_pid fields form a * 64-bit value that is set as program * parameter with the LPP instruction. */ - __u32 lpp; /* 0x0388 */ - __u32 current_pid; /* 0x038c */ + __u32 lpp; /* 0x0390 */ + __u32 current_pid; /* 0x0394 */ /* SMP info area */ - __u32 cpu_nr; /* 0x0390 */ - __u32 softirq_pending; /* 0x0394 */ - __u64 percpu_offset; /* 0x0398 */ - __u64 vdso_per_cpu_data; /* 0x03a0 */ - __u64 machine_flags; /* 0x03a8 */ - __u32 preempt_count; /* 0x03b0 */ - __u8 pad_0x03b4[0x03b8-0x03b4]; /* 0x03b4 */ - __u64 gmap; /* 0x03b8 */ - __u32 spinlock_lockval; /* 0x03c0 */ - __u32 spinlock_index; /* 0x03c4 */ - __u32 fpu_flags; /* 0x03c8 */ - __u8 pad_0x03cc[0x0400-0x03cc]; /* 0x03cc */ - - /* Per cpu primary space access list */ - __u32 paste[16]; /* 0x0400 */ - - __u8 pad_0x04c0[0x0e00-0x0440]; /* 0x0440 */ + __u32 cpu_nr; /* 0x0398 */ + __u32 softirq_pending; /* 0x039c */ + __u32 preempt_count; /* 0x03a0 */ + __u32 spinlock_lockval; /* 0x03a4 */ + __u32 spinlock_index; /* 0x03a8 */ + __u32 fpu_flags; /* 0x03ac */ + __u64 percpu_offset; /* 0x03b0 */ + __u64 vdso_per_cpu_data; /* 0x03b8 */ + __u64 machine_flags; /* 0x03c0 */ + __u64 gmap; /* 0x03c8 */ + __u8 pad_0x03d0[0x0e00-0x03d0]; /* 0x03d0 */ /* * 0xe00 contains the address of the IPL Parameter Information diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 43607bb12cc2..6133aa376b7c 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -71,41 +71,38 @@ static inline int init_new_context(struct task_struct *tsk, static inline void set_user_asce(struct mm_struct *mm) { S390_lowcore.user_asce = mm->context.asce; - if (current->thread.mm_segment.ar4) - __ctl_load(S390_lowcore.user_asce, 7, 7); - set_cpu_flag(CIF_ASCE_PRIMARY); + __ctl_load(S390_lowcore.user_asce, 1, 1); + clear_cpu_flag(CIF_ASCE_PRIMARY); } static inline void clear_user_asce(void) { S390_lowcore.user_asce = S390_lowcore.kernel_asce; - - __ctl_load(S390_lowcore.user_asce, 1, 1); - __ctl_load(S390_lowcore.user_asce, 7, 7); -} - -static inline void load_kernel_asce(void) -{ - unsigned long asce; - - __ctl_store(asce, 1, 1); - if (asce != S390_lowcore.kernel_asce) - __ctl_load(S390_lowcore.kernel_asce, 1, 1); + __ctl_load(S390_lowcore.kernel_asce, 1, 1); set_cpu_flag(CIF_ASCE_PRIMARY); } +mm_segment_t enable_sacf_uaccess(void); +void disable_sacf_uaccess(mm_segment_t old_fs); + static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { int cpu = smp_processor_id(); - S390_lowcore.user_asce = next->context.asce; if (prev == next) return; + S390_lowcore.user_asce = next->context.asce; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - /* Clear old ASCE by loading the kernel ASCE. */ - __ctl_load(S390_lowcore.kernel_asce, 1, 1); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); + /* Clear previous user-ASCE from CR1 and CR7 */ + if (!test_cpu_flag(CIF_ASCE_PRIMARY)) { + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + set_cpu_flag(CIF_ASCE_PRIMARY); + } + if (test_cpu_flag(CIF_ASCE_SECONDARY)) { + __ctl_load(S390_lowcore.vdso_asce, 7, 7); + clear_cpu_flag(CIF_ASCE_SECONDARY); + } cpumask_clear_cpu(cpu, &prev->context.cpu_attach_mask); } @@ -115,7 +112,6 @@ static inline void finish_arch_post_lock_switch(void) struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - load_kernel_asce(); if (mm) { preempt_disable(); while (atomic_read(&mm->context.flush_count)) diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index f25bfe888933..709351bce80e 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -109,9 +109,7 @@ extern void execve_tail(void); #define HAVE_ARCH_PICK_MMAP_LAYOUT -typedef struct { - __u32 ar4; -} mm_segment_t; +typedef unsigned int mm_segment_t; /* * Thread structure diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index cdd0f0d999e2..ad6b91013a05 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -16,7 +16,7 @@ #include #include #include - +#include /* * The fs value determines whether argument validity checking should be @@ -26,27 +26,16 @@ * For historical reasons, these macros are grossly misnamed. */ -#define MAKE_MM_SEG(a) ((mm_segment_t) { (a) }) - - -#define KERNEL_DS MAKE_MM_SEG(0) -#define USER_DS MAKE_MM_SEG(1) +#define KERNEL_DS (0) +#define KERNEL_DS_SACF (1) +#define USER_DS (2) +#define USER_DS_SACF (3) #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.mm_segment) -#define segment_eq(a,b) ((a).ar4 == (b).ar4) +#define segment_eq(a,b) (((a) & 2) == ((b) & 2)) -static inline void set_fs(mm_segment_t fs) -{ - current->thread.mm_segment = fs; - if (uaccess_kernel()) { - set_cpu_flag(CIF_ASCE_SECONDARY); - __ctl_load(S390_lowcore.kernel_asce, 7, 7); - } else { - clear_cpu_flag(CIF_ASCE_SECONDARY); - __ctl_load(S390_lowcore.user_asce, 7, 7); - } -} +void set_fs(mm_segment_t fs); static inline int __range_ok(unsigned long addr, unsigned long size) { @@ -95,7 +84,7 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n); static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) { - unsigned long spec = 0x810000UL; + unsigned long spec = 0x010000UL; int rc; switch (size) { @@ -125,7 +114,7 @@ static inline int __put_user_fn(void *x, void __user *ptr, unsigned long size) static inline int __get_user_fn(void *x, const void __user *ptr, unsigned long size) { - unsigned long spec = 0x81UL; + unsigned long spec = 0x01UL; int rc; switch (size) { diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 33ec80df7ed4..587b195b588d 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -171,6 +171,7 @@ int main(void) OFFSET(__LC_RESTART_DATA, lowcore, restart_data); OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source); OFFSET(__LC_USER_ASCE, lowcore, user_asce); + OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce); OFFSET(__LC_LPP, lowcore, lpp); OFFSET(__LC_CURRENT_PID, lowcore, current_pid); OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); @@ -178,7 +179,6 @@ int main(void) OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); - OFFSET(__LC_PASTE, lowcore, paste); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); /* hardware defined lowcore locations 0x1000 - 0x18ff */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index ee53ac7b1ab8..a316cd6999ad 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -379,13 +379,21 @@ ENTRY(system_call) jg s390_handle_mcck # TIF bit will be cleared by handler # -# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce +# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce # .Lsysc_asce: + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY + lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce + TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY + jz .Lsysc_return +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES + tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? + jnz .Lsysc_set_fs_fixup ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY - jz .Lsysc_return + j .Lsysc_return +.Lsysc_set_fs_fixup: +#endif larl %r14,.Lsysc_return jg set_fs_fixup @@ -741,10 +749,18 @@ ENTRY(io_int_handler) # _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce # .Lio_asce: + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY + lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce + TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY + jz .Lio_return +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES + tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? + jnz .Lio_set_fs_fixup ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY - jz .Lio_return + j .Lio_return +.Lio_set_fs_fixup: +#endif larl %r14,.Lio_return jg set_fs_fixup diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 172002da7075..38a973ccf501 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -28,7 +28,7 @@ ENTRY(startup_continue) lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area # move IPL device to lowcore - lghi %r0,__LC_PASTE + larl %r0,boot_vdso_data stg %r0,__LC_VDSO_PER_CPU # # Setup stack diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 0520854a4dab..39a218703c50 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -158,16 +158,9 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) { unsigned long segment_table, page_table, page_frame; struct vdso_per_cpu_data *vd; - u32 *psal, *aste; - int i; - - lowcore->vdso_per_cpu_data = __LC_PASTE; - - if (!vdso_enabled) - return 0; segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); - page_table = get_zeroed_page(GFP_KERNEL | GFP_DMA); + page_table = get_zeroed_page(GFP_KERNEL); page_frame = get_zeroed_page(GFP_KERNEL); if (!segment_table || !page_table || !page_frame) goto out; @@ -179,25 +172,15 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) vd->cpu_nr = lowcore->cpu_nr; vd->node_id = cpu_to_node(vd->cpu_nr); - /* Set up access register mode page table */ + /* Set up page table for the vdso address space */ memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES); memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE); *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; *(unsigned long *) page_table = _PAGE_PROTECT + page_frame; - psal = (u32 *) (page_table + 256*sizeof(unsigned long)); - aste = psal + 32; - - for (i = 4; i < 32; i += 4) - psal[i] = 0x80000000; - - lowcore->paste[4] = (u32)(addr_t) psal; - psal[0] = 0x02000000; - psal[2] = (u32)(addr_t) aste; - *(unsigned long *) (aste + 2) = segment_table + + lowcore->vdso_asce = segment_table + _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT; - aste[4] = (u32)(addr_t) psal; lowcore->vdso_per_cpu_data = page_frame; return 0; @@ -212,14 +195,8 @@ out: void vdso_free_per_cpu(struct lowcore *lowcore) { unsigned long segment_table, page_table, page_frame; - u32 *psal, *aste; - - if (!vdso_enabled) - return; - psal = (u32 *)(addr_t) lowcore->paste[4]; - aste = (u32 *)(addr_t) psal[2]; - segment_table = *(unsigned long *)(aste + 2) & PAGE_MASK; + segment_table = lowcore->vdso_asce & PAGE_MASK; page_table = *(unsigned long *) segment_table; page_frame = *(unsigned long *) page_table; @@ -228,16 +205,6 @@ void vdso_free_per_cpu(struct lowcore *lowcore) free_pages(segment_table, SEGMENT_ORDER); } -static void vdso_init_cr5(void) -{ - unsigned long cr5; - - if (!vdso_enabled) - return; - cr5 = offsetof(struct lowcore, paste); - __ctl_load(cr5, 5, 5); -} - /* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree @@ -314,8 +281,6 @@ static int __init vdso_init(void) { int i; - if (!vdso_enabled) - return 0; vdso_init_data(vdso_data); #ifdef CONFIG_COMPAT /* Calculate the size of the 32 bit vDSO */ @@ -354,7 +319,6 @@ static int __init vdso_init(void) vdso64_pagelist[vdso64_pages] = NULL; if (vdso_alloc_per_cpu(&S390_lowcore)) BUG(); - vdso_init_cr5(); get_page(virt_to_page(vdso_data)); diff --git a/arch/s390/kernel/vdso32/getcpu.S b/arch/s390/kernel/vdso32/getcpu.S index 6e30769dd017..5477a2c112fb 100644 --- a/arch/s390/kernel/vdso32/getcpu.S +++ b/arch/s390/kernel/vdso32/getcpu.S @@ -15,23 +15,11 @@ .type __kernel_getcpu,@function __kernel_getcpu: .cfi_startproc - ear %r1,%a4 - lhi %r4,1 - sll %r4,24 - sar %a4,%r4 la %r4,0 - epsw %r0,0 - sacf 512 + sacf 256 l %r5,__VDSO_CPU_NR(%r4) l %r4,__VDSO_NODE_ID(%r4) - tml %r0,0x4000 - jo 1f - tml %r0,0x8000 - jno 0f - sacf 256 - j 1f -0: sacf 0 -1: sar %a4,%r1 + sacf 0 ltr %r2,%r2 jz 2f st %r5,0(%r2) diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S index 9c3b12626dba..5d7b56b49458 100644 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -114,23 +114,12 @@ __kernel_clock_gettime: br %r14 /* CPUCLOCK_VIRT for this thread */ -9: icm %r0,15,__VDSO_ECTG_OK(%r5) +9: lghi %r4,0 + icm %r0,15,__VDSO_ECTG_OK(%r5) jz 12f - ear %r2,%a4 - llilh %r4,0x0100 - sar %a4,%r4 - lghi %r4,0 - epsw %r5,0 - sacf 512 /* Magic ectg instruction */ + sacf 256 /* Magic ectg instruction */ .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 - tml %r5,0x4000 - jo 11f - tml %r5,0x8000 - jno 10f - sacf 256 - j 11f -10: sacf 0 -11: sar %a4,%r2 + sacf 0 algr %r1,%r0 /* r1 = cputime as TOD value */ mghi %r1,1000 /* convert to nanoseconds */ srlg %r1,%r1,12 /* r1 = cputime in nanosec */ diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S index 43983764b959..e9c34364d97b 100644 --- a/arch/s390/kernel/vdso64/getcpu.S +++ b/arch/s390/kernel/vdso64/getcpu.S @@ -15,22 +15,11 @@ .type __kernel_getcpu,@function __kernel_getcpu: .cfi_startproc - ear %r1,%a4 - llilh %r4,0x0100 - sar %a4,%r4 la %r4,0 - epsw %r0,0 - sacf 512 + sacf 256 l %r5,__VDSO_CPU_NR(%r4) l %r4,__VDSO_NODE_ID(%r4) - tml %r0,0x4000 - jo 1f - tml %r0,0x8000 - jno 0f - sacf 256 - j 1f -0: sacf 0 -1: sar %a4,%r1 + sacf 0 ltgr %r2,%r2 jz 2f st %r5,0(%r2) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 802903c50de1..cae5a1e16cbd 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -40,10 +40,67 @@ static inline int copy_with_mvcos(void) } #endif +void set_fs(mm_segment_t fs) +{ + current->thread.mm_segment = fs; + if (fs == USER_DS) { + __ctl_load(S390_lowcore.user_asce, 1, 1); + clear_cpu_flag(CIF_ASCE_PRIMARY); + } else { + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + set_cpu_flag(CIF_ASCE_PRIMARY); + } + if (fs & 1) { + if (fs == USER_DS_SACF) + __ctl_load(S390_lowcore.user_asce, 7, 7); + else + __ctl_load(S390_lowcore.kernel_asce, 7, 7); + set_cpu_flag(CIF_ASCE_SECONDARY); + } +} +EXPORT_SYMBOL(set_fs); + +mm_segment_t enable_sacf_uaccess(void) +{ + mm_segment_t old_fs; + unsigned long asce, cr; + + old_fs = current->thread.mm_segment; + if (old_fs & 1) + return old_fs; + current->thread.mm_segment |= 1; + asce = S390_lowcore.kernel_asce; + if (likely(old_fs == USER_DS)) { + __ctl_store(cr, 1, 1); + if (cr != S390_lowcore.kernel_asce) { + __ctl_load(S390_lowcore.kernel_asce, 1, 1); + set_cpu_flag(CIF_ASCE_PRIMARY); + } + asce = S390_lowcore.user_asce; + } + __ctl_store(cr, 7, 7); + if (cr != asce) { + __ctl_load(asce, 7, 7); + set_cpu_flag(CIF_ASCE_SECONDARY); + } + return old_fs; +} +EXPORT_SYMBOL(enable_sacf_uaccess); + +void disable_sacf_uaccess(mm_segment_t old_fs) +{ + if (old_fs == USER_DS && test_facility(27)) { + __ctl_load(S390_lowcore.user_asce, 1, 1); + clear_cpu_flag(CIF_ASCE_PRIMARY); + } + current->thread.mm_segment = old_fs; +} +EXPORT_SYMBOL(disable_sacf_uaccess); + static inline unsigned long copy_from_user_mvcos(void *x, const void __user *ptr, unsigned long size) { - register unsigned long reg0 asm("0") = 0x81UL; + register unsigned long reg0 asm("0") = 0x01UL; unsigned long tmp1, tmp2; tmp1 = -4096UL; @@ -74,8 +131,9 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, unsigned long size) { unsigned long tmp1, tmp2; + mm_segment_t old_fs; - load_kernel_asce(); + old_fs = enable_sacf_uaccess(); tmp1 = -256UL; asm volatile( " sacf 0\n" @@ -102,6 +160,7 @@ static inline unsigned long copy_from_user_mvcp(void *x, const void __user *ptr, EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) : : "cc", "memory"); + disable_sacf_uaccess(old_fs); return size; } @@ -116,7 +175,7 @@ EXPORT_SYMBOL(raw_copy_from_user); static inline unsigned long copy_to_user_mvcos(void __user *ptr, const void *x, unsigned long size) { - register unsigned long reg0 asm("0") = 0x810000UL; + register unsigned long reg0 asm("0") = 0x010000UL; unsigned long tmp1, tmp2; tmp1 = -4096UL; @@ -147,8 +206,9 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, unsigned long size) { unsigned long tmp1, tmp2; + mm_segment_t old_fs; - load_kernel_asce(); + old_fs = enable_sacf_uaccess(); tmp1 = -256UL; asm volatile( " sacf 0\n" @@ -175,6 +235,7 @@ static inline unsigned long copy_to_user_mvcs(void __user *ptr, const void *x, EX_TABLE(7b,3b) EX_TABLE(8b,3b) EX_TABLE(9b,6b) : "+a" (size), "+a" (ptr), "+a" (x), "+a" (tmp1), "=a" (tmp2) : : "cc", "memory"); + disable_sacf_uaccess(old_fs); return size; } @@ -189,7 +250,7 @@ EXPORT_SYMBOL(raw_copy_to_user); static inline unsigned long copy_in_user_mvcos(void __user *to, const void __user *from, unsigned long size) { - register unsigned long reg0 asm("0") = 0x810081UL; + register unsigned long reg0 asm("0") = 0x010001UL; unsigned long tmp1, tmp2; tmp1 = -4096UL; @@ -212,9 +273,10 @@ static inline unsigned long copy_in_user_mvcos(void __user *to, const void __use static inline unsigned long copy_in_user_mvc(void __user *to, const void __user *from, unsigned long size) { + mm_segment_t old_fs; unsigned long tmp1; - load_kernel_asce(); + old_fs = enable_sacf_uaccess(); asm volatile( " sacf 256\n" " aghi %0,-1\n" @@ -238,6 +300,7 @@ static inline unsigned long copy_in_user_mvc(void __user *to, const void __user EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) : "+a" (size), "+a" (to), "+a" (from), "=a" (tmp1) : : "cc", "memory"); + disable_sacf_uaccess(old_fs); return size; } @@ -251,7 +314,7 @@ EXPORT_SYMBOL(raw_copy_in_user); static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size) { - register unsigned long reg0 asm("0") = 0x810000UL; + register unsigned long reg0 asm("0") = 0x010000UL; unsigned long tmp1, tmp2; tmp1 = -4096UL; @@ -279,9 +342,10 @@ static inline unsigned long clear_user_mvcos(void __user *to, unsigned long size static inline unsigned long clear_user_xc(void __user *to, unsigned long size) { + mm_segment_t old_fs; unsigned long tmp1, tmp2; - load_kernel_asce(); + old_fs = enable_sacf_uaccess(); asm volatile( " sacf 256\n" " aghi %0,-1\n" @@ -310,6 +374,7 @@ static inline unsigned long clear_user_xc(void __user *to, unsigned long size) EX_TABLE(1b,6b) EX_TABLE(2b,0b) EX_TABLE(4b,0b) : "+a" (size), "+a" (to), "=a" (tmp1), "=a" (tmp2) : : "cc", "memory"); + disable_sacf_uaccess(old_fs); return size; } @@ -345,10 +410,15 @@ static inline unsigned long strnlen_user_srst(const char __user *src, unsigned long __strnlen_user(const char __user *src, unsigned long size) { + mm_segment_t old_fs; + unsigned long len; + if (unlikely(!size)) return 0; - load_kernel_asce(); - return strnlen_user_srst(src, size); + old_fs = enable_sacf_uaccess(); + len = strnlen_user_srst(src, size); + disable_sacf_uaccess(old_fs); + return len; } EXPORT_SYMBOL(__strnlen_user); diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index be974b3eb7e4..14654007dce4 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -50,6 +50,13 @@ #define VM_FAULT_SIGNAL 0x080000 #define VM_FAULT_PFAULT 0x100000 +enum fault_type { + KERNEL_FAULT, + USER_FAULT, + VDSO_FAULT, + GMAP_FAULT, +}; + static unsigned long store_indication __read_mostly; static int __init fault_init(void) @@ -99,27 +106,34 @@ void bust_spinlocks(int yes) } /* - * Returns the address space associated with the fault. - * Returns 0 for kernel space and 1 for user space. + * Find out which address space caused the exception. + * Access register mode is impossible, ignore space == 3. */ -static inline int user_space_fault(struct pt_regs *regs) +static inline enum fault_type get_fault_type(struct pt_regs *regs) { unsigned long trans_exc_code; - /* - * The lowest two bits of the translation exception - * identification indicate which paging table was used. - */ trans_exc_code = regs->int_parm_long & 3; - if (trans_exc_code == 3) /* home space -> kernel */ - return 0; - if (user_mode(regs)) - return 1; - if (trans_exc_code == 2) /* secondary space -> set_fs */ - return current->thread.mm_segment.ar4; - if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) - return 1; - return 0; + if (likely(trans_exc_code == 0)) { + /* primary space exception */ + if (IS_ENABLED(CONFIG_PGSTE) && + test_pt_regs_flag(regs, PIF_GUEST_FAULT)) + return GMAP_FAULT; + if (current->thread.mm_segment == USER_DS) + return USER_FAULT; + return KERNEL_FAULT; + } + if (trans_exc_code == 2) { + /* secondary space exception */ + if (current->thread.mm_segment & 1) { + if (current->thread.mm_segment == USER_DS_SACF) + return USER_FAULT; + return KERNEL_FAULT; + } + return VDSO_FAULT; + } + /* home space exception -> access via kernel ASCE */ + return KERNEL_FAULT; } static int bad_address(void *p) @@ -204,20 +218,23 @@ static void dump_fault_info(struct pt_regs *regs) break; } pr_cont("mode while using "); - if (!user_space_fault(regs)) { - asce = S390_lowcore.kernel_asce; - pr_cont("kernel "); - } -#ifdef CONFIG_PGSTE - else if (test_pt_regs_flag(regs, PIF_GUEST_FAULT)) { - struct gmap *gmap = (struct gmap *)S390_lowcore.gmap; - asce = gmap->asce; - pr_cont("gmap "); - } -#endif - else { + switch (get_fault_type(regs)) { + case USER_FAULT: asce = S390_lowcore.user_asce; pr_cont("user "); + break; + case VDSO_FAULT: + asce = S390_lowcore.vdso_asce; + pr_cont("vdso "); + break; + case GMAP_FAULT: + asce = ((struct gmap *) S390_lowcore.gmap)->asce; + pr_cont("gmap "); + break; + case KERNEL_FAULT: + asce = S390_lowcore.kernel_asce; + pr_cont("kernel "); + break; } pr_cont("ASCE.\n"); dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK); @@ -273,7 +290,7 @@ static noinline void do_no_context(struct pt_regs *regs) * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ - if (!user_space_fault(regs)) + if (get_fault_type(regs) == KERNEL_FAULT) printk(KERN_ALERT "Unable to handle kernel pointer dereference" " in virtual kernel address space\n"); else @@ -395,12 +412,11 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, int fault) */ static inline int do_exception(struct pt_regs *regs, int access) { -#ifdef CONFIG_PGSTE struct gmap *gmap; -#endif struct task_struct *tsk; struct mm_struct *mm; struct vm_area_struct *vma; + enum fault_type type; unsigned long trans_exc_code; unsigned long address; unsigned int flags; @@ -425,8 +441,19 @@ static inline int do_exception(struct pt_regs *regs, int access) * user context. */ fault = VM_FAULT_BADCONTEXT; - if (unlikely(!user_space_fault(regs) || faulthandler_disabled() || !mm)) + type = get_fault_type(regs); + switch (type) { + case KERNEL_FAULT: + goto out; + case VDSO_FAULT: + fault = VM_FAULT_BADMAP; goto out; + case USER_FAULT: + case GMAP_FAULT: + if (faulthandler_disabled() || !mm) + goto out; + break; + } address = trans_exc_code & __FAIL_ADDR_MASK; perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); @@ -437,10 +464,9 @@ static inline int do_exception(struct pt_regs *regs, int access) flags |= FAULT_FLAG_WRITE; down_read(&mm->mmap_sem); -#ifdef CONFIG_PGSTE - gmap = test_pt_regs_flag(regs, PIF_GUEST_FAULT) ? - (struct gmap *) S390_lowcore.gmap : NULL; - if (gmap) { + gmap = NULL; + if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) { + gmap = (struct gmap *) S390_lowcore.gmap; current->thread.gmap_addr = address; current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE); current->thread.gmap_int_code = regs->int_code & 0xffff; @@ -452,7 +478,6 @@ static inline int do_exception(struct pt_regs *regs, int access) if (gmap->pfault_enabled) flags |= FAULT_FLAG_RETRY_NOWAIT; } -#endif retry: fault = VM_FAULT_BADMAP; @@ -507,15 +532,14 @@ retry: regs, address); } if (fault & VM_FAULT_RETRY) { -#ifdef CONFIG_PGSTE - if (gmap && (flags & FAULT_FLAG_RETRY_NOWAIT)) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap && + (flags & FAULT_FLAG_RETRY_NOWAIT)) { /* FAULT_FLAG_RETRY_NOWAIT has been set, * mmap_sem has not been released */ current->thread.gmap_pfault = 1; fault = VM_FAULT_PFAULT; goto out_up; } -#endif /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk * of starvation. */ flags &= ~(FAULT_FLAG_ALLOW_RETRY | @@ -525,8 +549,7 @@ retry: goto retry; } } -#ifdef CONFIG_PGSTE - if (gmap) { + if (IS_ENABLED(CONFIG_PGSTE) && gmap) { address = __gmap_link(gmap, current->thread.gmap_addr, address); if (address == -EFAULT) { @@ -538,7 +561,6 @@ retry: goto out_up; } } -#endif fault = 0; out_up: up_read(&mm->mmap_sem); diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 817c9e16e83e..671535e64aba 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -95,6 +95,7 @@ void __init paging_init(void) } init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; S390_lowcore.kernel_asce = init_mm.context.asce; + S390_lowcore.user_asce = S390_lowcore.kernel_asce; crst_table_init((unsigned long *) init_mm.pgd, pgd_type); vmem_map_init(); diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 4ad4c4f77b4d..434a9564917b 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -71,10 +71,8 @@ static void __crst_table_upgrade(void *arg) { struct mm_struct *mm = arg; - if (current->active_mm == mm) { - clear_user_asce(); + if (current->active_mm == mm) set_user_asce(mm); - } __tlb_flush_local(); } -- cgit v1.2.3 From 11776eaa6568f5357542bf41b0c7bb90854137cc Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 13 Nov 2017 16:37:33 +0100 Subject: s390: correct some inline assembly constraints Inline assembly code changed in this patch should really use "Q" constraint "Memory reference without index register and with short displacement". The kernel does not compile with kasan support enabled otherwise (due to stack instrumentation). Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/include/asm/cpu_mf.h | 2 +- arch/s390/include/asm/lowcore.h | 4 ++-- arch/s390/include/asm/processor.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 05480e4cc5ca..7364130a29c8 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -167,7 +167,7 @@ static inline int lcctl(u64 ctl) " .insn s,0xb2840000,%1\n" " ipm %0\n" " srl %0,28\n" - : "=d" (cc) : "m" (ctl) : "cc"); + : "=d" (cc) : "Q" (ctl) : "cc"); return cc; } diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 2306fa17f6cd..ec6592e8ba36 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -188,14 +188,14 @@ extern struct lowcore *lowcore_ptr[]; static inline void set_prefix(__u32 address) { - asm volatile("spx %0" : : "m" (address) : "memory"); + asm volatile("spx %0" : : "Q" (address) : "memory"); } static inline __u32 store_prefix(void) { __u32 address; - asm volatile("stpx %0" : "=m" (address)); + asm volatile("stpx %0" : "=Q" (address)); return address; } diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 709351bce80e..bfbfad482289 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -245,7 +245,7 @@ static inline unsigned short stap(void) { unsigned short cpu_address; - asm volatile("stap %0" : "=m" (cpu_address)); + asm volatile("stap %0" : "=Q" (cpu_address)); return cpu_address; } -- cgit v1.2.3 From dfd4c4935de8ac39f22e0f65972140405fd27942 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Mon, 13 Nov 2017 16:58:34 +0100 Subject: s390/kbuild: get rid of a warning when compiling with KCOV This change fixes the following warning: warning: (KCOV) selects GCC_PLUGINS which has unmet direct dependencies (HAVE_GCC_PLUGINS && !COMPILE_TEST) Signed-off-by: Vasily Gorbik Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/s390') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 863a62a6de3c..f5beccbe74d8 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -148,6 +148,7 @@ config S390 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_FUTEX_CMPXCHG if FUTEX + select HAVE_GCC_PLUGINS select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 -- cgit v1.2.3 From 3c6153e8145f74870bad11fa4344fd20f1ad3aaf Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Nov 2017 13:35:14 +0100 Subject: s390/vdso: add missing boot_vdso_data declaration sparse says: arch/s390/kernel/vdso.c:150:18: warning: symbol 'boot_vdso_data' was not declared. Should it be static? Signed-off-by: Heiko Carstens --- arch/s390/include/asm/vdso.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index ae6261ef97d5..169d7604eb80 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -46,6 +46,7 @@ struct vdso_per_cpu_data { }; extern struct vdso_data *vdso_data; +extern struct vdso_data boot_vdso_data; void vdso_alloc_boot_cpu(struct lowcore *lowcore); int vdso_alloc_per_cpu(struct lowcore *lowcore); -- cgit v1.2.3 From 78ca4fe3bb166e913d278e504d93f09a8ba3139e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Nov 2017 13:47:00 +0100 Subject: s390/spinlock: fix indentation checkpatch: WARNING: Statements should start on a tabstop #9499: FILE: arch/s390/lib/spinlock.c:231: + return; sparse: arch/s390/lib/spinlock.c:81 arch_load_niai4() warn: inconsistent indenting Signed-off-by: Heiko Carstens --- arch/s390/lib/spinlock.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index 84c0faeaf7ea..30a7c8c29964 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -78,7 +78,7 @@ static inline int arch_load_niai4(int *lock) ALTERNATIVE("", ".long 0xb2fa0040", 49) /* NIAI 4 */ " l %0,%1\n" : "=d" (owner) : "Q" (*lock) : "memory"); - return owner; + return owner; } static inline int arch_cmpxchg_niai8(int *lock, int old, int new) @@ -226,9 +226,10 @@ static inline void arch_spin_lock_classic(arch_spinlock_t *lp) /* Try to get the lock if it is free. */ if (!owner) { new = (old & _Q_TAIL_MASK) | lockval; - if (arch_cmpxchg_niai8(&lp->lock, old, new)) + if (arch_cmpxchg_niai8(&lp->lock, old, new)) { /* Got the lock */ - return; + return; + } continue; } if (count-- >= 0) -- cgit v1.2.3 From 2be1da8d4d3fd7b09f5c6ab952bff5cef0677ade Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Nov 2017 14:50:08 +0100 Subject: s390/mm: remove unused code Signed-off-by: Heiko Carstens --- arch/s390/mm/gmap.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 2f66290c9b92..b2c140193b0a 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1187,12 +1187,11 @@ static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr, unsigned long *sgt) { - unsigned long asce, *pgt; + unsigned long *pgt; struct page *page; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _SEGMENT_SIZE) { if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN)) continue; @@ -1245,12 +1244,11 @@ static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr, unsigned long *r3t) { - unsigned long asce, *sgt; + unsigned long *sgt; struct page *page; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r3t | _ASCE_TYPE_REGION3; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION3_SIZE) { if (!(r3t[i] & _REGION_ENTRY_ORIGIN)) continue; @@ -1303,12 +1301,11 @@ static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr) static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr, unsigned long *r2t) { - unsigned long asce, *r3t; + unsigned long *r3t; struct page *page; int i; BUG_ON(!gmap_is_shadow(sg)); - asce = (unsigned long) r2t | _ASCE_TYPE_REGION2; for (i = 0; i < _CRST_ENTRIES; i++, raddr += _REGION2_SIZE) { if (!(r2t[i] & _REGION_ENTRY_ORIGIN)) continue; -- cgit v1.2.3 From a6de0a91d93a47f812cf43b96ba6e639de6df6d5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Nov 2017 14:50:38 +0100 Subject: s390/nmi: remove unused code Signed-off-by: Heiko Carstens --- arch/s390/kernel/nmi.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 3f3cda41f32a..6ff169253cae 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -191,7 +191,6 @@ static int notrace s390_check_registers(union mci mci, int umode) { union ctlreg2 cr2; int kill_task; - void *fpt_save_area; kill_task = 0; @@ -224,7 +223,6 @@ static int notrace s390_check_registers(union mci mci, int umode) if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } - fpt_save_area = &S390_lowcore.floating_pt_save_area; if (!mci.fc) { /* * Floating point control register can't be restored. -- cgit v1.2.3 From 049a2c2d486e8cc82c5cd79fa479c5b105b109e9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 14 Nov 2017 15:20:24 +0100 Subject: s390: enable CPU alternatives unconditionally Remove the CPU_ALTERNATIVES config option and enable the code unconditionally. The config option was only added to avoid a conflict with the named saved segment support. Since that code is gone there is no reason to keep the CPU_ALTERNATIVES config option. Just enable it unconditionally to also reduce the number of config options and make it less likely that something breaks. Signed-off-by: Heiko Carstens --- arch/s390/Kconfig | 16 ---------------- arch/s390/include/asm/alternative.h | 20 +++----------------- arch/s390/kernel/Makefile | 3 +-- arch/s390/kernel/module.c | 15 ++++++--------- 4 files changed, 10 insertions(+), 44 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f5beccbe74d8..84767046daff 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -539,22 +539,6 @@ config ARCH_RANDOM If unsure, say Y. -config ALTERNATIVES - def_bool y - prompt "Patch optimized instructions for running CPU type" - help - When enabled the kernel code is compiled with additional - alternative instructions blocks optimized for newer CPU types. - These alternative instructions blocks are patched at kernel boot - time when running CPU supports them. This mechanism is used to - optimize some critical code paths (i.e. spinlocks) for newer CPUs - even if kernel is build to support older machine generations. - - This mechanism could be disabled by appending "noaltinstr" - option to the kernel command line. - - If unsure, say Y. - endmenu menu "Memory setup" diff --git a/arch/s390/include/asm/alternative.h b/arch/s390/include/asm/alternative.h index 6c268f6a51d3..a72002056b54 100644 --- a/arch/s390/include/asm/alternative.h +++ b/arch/s390/include/asm/alternative.h @@ -15,14 +15,9 @@ struct alt_instr { u8 replacementlen; /* length of new instruction */ } __packed; -#ifdef CONFIG_ALTERNATIVES -extern void apply_alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); -#else -static inline void apply_alternative_instructions(void) {}; -static inline void apply_alternatives(struct alt_instr *start, - struct alt_instr *end) {}; -#endif +void apply_alternative_instructions(void); +void apply_alternatives(struct alt_instr *start, struct alt_instr *end); + /* * |661: |662: |6620 |663: * +-----------+---------------------+ @@ -109,7 +104,6 @@ static inline void apply_alternatives(struct alt_instr *start, b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \ INSTR_LEN_SANITY_CHECK(altinstr_len(num)) -#ifdef CONFIG_ALTERNATIVES /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, altinstr, facility) \ ".pushsection .altinstr_replacement, \"ax\"\n" \ @@ -130,14 +124,6 @@ static inline void apply_alternatives(struct alt_instr *start, ALTINSTR_ENTRY(facility1, 1) \ ALTINSTR_ENTRY(facility2, 2) \ ".popsection\n" -#else -/* Alternative instructions are disabled, let's put just oldinstr in */ -#define ALTERNATIVE(oldinstr, altinstr, facility) \ - oldinstr "\n" - -#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \ - oldinstr "\n" -#endif /* * Alternative instructions for different CPU types or capabilities. diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 83bc82001c06..0319f4e81ea4 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -59,7 +59,7 @@ obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o als.o obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o +obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o extra-y += head.o head64.o vmlinux.lds @@ -77,7 +77,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UPROBES) += uprobes.o -obj-$(CONFIG_ALTERNATIVES) += alternative.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 6d9f73bb4142..7b87991416fd 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -433,16 +433,13 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s; char *secstrings; - if (IS_ENABLED(CONFIG_ALTERNATIVES)) { - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".altinstructions", - secstrings + s->sh_name)) { - /* patch .altinstructions */ - void *aseg = (void *)s->sh_addr; + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".altinstructions", secstrings + s->sh_name)) { + /* patch .altinstructions */ + void *aseg = (void *)s->sh_addr; - apply_alternatives(aseg, aseg + s->sh_size); - } + apply_alternatives(aseg, aseg + s->sh_size); } } -- cgit v1.2.3 From 6470c0cc4840c229c6f1c270ebbcacb8e14f477d Mon Sep 17 00:00:00 2001 From: Michael Holzheu Date: Wed, 15 Nov 2017 17:06:30 +0100 Subject: s390: Remove CONFIG_HARDENED_USERCOPY When running the crash tool on a s390 live system we get a kernel panic for reading memory within the kernel image: # uname -a Linux r3545011 4.14.0-rc8-00066-g1c9dbd4615fd #45 SMP PREEMPT Fri Nov 10 16:16:22 CET 2017 s390x s390x s390x GNU/Linux # crash /boot/vmlinux-devel /dev/mem # crash> rd 0x100000 usercopy: kernel memory exposure attempt detected from 0000000000100000 () (8 bytes) ------------[ cut here ]------------ kernel BUG at mm/usercopy.c:72! illegal operation: 0001 ilc:1 [#1] PREEMPT SMP. Modules linked in: CPU: 0 PID: 1461 Comm: crash Not tainted 4.14.0-rc8-00066-g1c9dbd4615fd-dirty #46 Hardware name: IBM 2827 H66 706 (z/VM 6.3.0) task: 000000001ad10100 task.stack: 000000001df78000 Krnl PSW : 0704d00180000000 000000000038165c (__check_object_size+0x164/0x1d0) R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:3 CC:1 PM:0 RI:0 EA:3 Krnl GPRS: 0000000012440e1d 0000000080000000 0000000000000061 00000000001cabc0 00000000001cc6d6 0000000000000000 0000000000cc4ed2 0000000000001000 000003ffc22fdd20 0000000000000008 0000000000100008 0000000000000001 0000000000000008 0000000000100000 0000000000381658 000000001df7bc90 Krnl Code: 000000000038164c: c020004a1c4a larl %r2,cc4ee0 0000000000381652: c0e5fff2581b brasl %r14,1cc688 #0000000000381658: a7f40001 brc 15,38165a >000000000038165c: eb42000c000c srlg %r4,%r2,12 0000000000381662: eb32001c000c srlg %r3,%r2,28 0000000000381668: c0110003ffff lgfi %r1,262143 000000000038166e: ec31ff752065 clgrj %r3,%r1,2,381558 0000000000381674: a7f4ff67 brc 15,381542 Call Trace: ([<0000000000381658>] __check_object_size+0x160/0x1d0) [<000000000082263a>] read_mem+0xaa/0x130. [<0000000000386182>] __vfs_read+0x42/0x168. [<000000000038632e>] vfs_read+0x86/0x140. [<0000000000386a26>] SyS_read+0x66/0xc0. [<0000000000ace6a4>] system_call+0xc4/0x2b0. INFO: lockdep is turned off. Last Breaking-Event-Address: [<0000000000381658>] __check_object_size+0x160/0x1d0 Kernel panic - not syncing: Fatal exception: panic_on_oops With CONFIG_HARDENED_USERCOPY copy_to_user() checks in __check_object_size() if the source address is within the kernel image. When the crash tool reads from 0x100000, this check leads to the kernel BUG(). So disable the kernel config option until this bug is fixed. Corresponding bug report on LKML: https://lkml.org/lkml/2017/11/10/341 Signed-off-by: Michael Holzheu Signed-off-by: Martin Schwidefsky --- arch/s390/configs/default_defconfig | 5 +---- arch/s390/configs/gcov_defconfig | 4 ---- arch/s390/configs/performance_defconfig | 4 ---- 3 files changed, 1 insertion(+), 12 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 84eccc88c065..5af8458951cf 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -629,6 +629,7 @@ CONFIG_STACK_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_FUNCTION_PROFILER=y CONFIG_HIST_TRIGGERS=y +CONFIG_DMA_API_DEBUG=y CONFIG_LKDTM=m CONFIG_TEST_LIST_SORT=y CONFIG_TEST_SORT=y @@ -637,14 +638,12 @@ CONFIG_RBTREE_TEST=y CONFIG_INTERVAL_TREE_TEST=m CONFIG_PERCPU_TEST=m CONFIG_ATOMIC64_SELFTEST=y -CONFIG_DMA_API_DEBUG=y CONFIG_TEST_BPF=m CONFIG_BUG_ON_DATA_CORRUPTION=y CONFIG_S390_PTDUMP=y CONFIG_ENCRYPTED_KEYS=m CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y -CONFIG_HARDENED_USERCOPY=y CONFIG_FORTIFY_SOURCE=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y @@ -660,13 +659,11 @@ CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m CONFIG_CRYPTO_GCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index f7202358e6d7..d52eafe57ae8 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -587,7 +587,6 @@ CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y -CONFIG_HARDENED_USERCOPY=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 @@ -605,13 +604,10 @@ CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 03100fe74ea8..20ed149e1137 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -585,7 +585,6 @@ CONFIG_BIG_KEYS=y CONFIG_ENCRYPTED_KEYS=m CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y -CONFIG_HARDENED_USERCOPY=y CONFIG_SECURITY_SELINUX=y CONFIG_SECURITY_SELINUX_BOOTPARAM=y CONFIG_SECURITY_SELINUX_BOOTPARAM_VALUE=0 @@ -603,13 +602,10 @@ CONFIG_CRYPTO_PCRYPT=m CONFIG_CRYPTO_CRYPTD=m CONFIG_CRYPTO_MCRYPTD=m CONFIG_CRYPTO_TEST=m -CONFIG_CRYPTO_CCM=m -CONFIG_CRYPTO_GCM=m CONFIG_CRYPTO_CHACHA20POLY1305=m CONFIG_CRYPTO_LRW=m CONFIG_CRYPTO_PCBC=m CONFIG_CRYPTO_KEYWRAP=m -CONFIG_CRYPTO_CMAC=m CONFIG_CRYPTO_XCBC=m CONFIG_CRYPTO_VMAC=m CONFIG_CRYPTO_CRC32=m -- cgit v1.2.3 From b192571d1ae375e0bbe0aa3ccfa1a3c3704454b9 Mon Sep 17 00:00:00 2001 From: Vasily Gorbik Date: Wed, 15 Nov 2017 14:15:36 +0100 Subject: s390/disassembler: increase show_code buffer size Current buffer size of 64 is too small. objdump shows that there are instructions which would require up to 75 bytes buffer (with current formating). 128 bytes "ought to be enough for anybody". Also replaces 8 spaces with a single tab to reduce the memory footprint. Fixes the following KASAN finding: BUG: KASAN: stack-out-of-bounds in number+0x3fe/0x538 Write of size 1 at addr 000000005a4a75a0 by task bash/1282 CPU: 1 PID: 1282 Comm: bash Not tainted 4.14.0+ #215 Hardware name: IBM 2964 N96 702 (z/VM 6.4.0) Call Trace: ([<000000000011eeb6>] show_stack+0x56/0x88) [<0000000000e1ce1a>] dump_stack+0x15a/0x1b0 [<00000000004e2994>] print_address_description+0xf4/0x288 [<00000000004e2cf2>] kasan_report+0x13a/0x230 [<0000000000e38ae6>] number+0x3fe/0x538 [<0000000000e3dfe4>] vsnprintf+0x194/0x948 [<0000000000e3ea42>] sprintf+0xa2/0xb8 [<00000000001198dc>] print_insn+0x374/0x500 [<0000000000119346>] show_code+0x4ee/0x538 [<000000000011f234>] show_registers+0x34c/0x388 [<000000000011f2ae>] show_regs+0x3e/0xa8 [<000000000011f502>] die+0x1ea/0x2e8 [<0000000000138f0e>] do_no_context+0x106/0x168 [<0000000000139a1a>] do_protection_exception+0x4da/0x7d0 [<0000000000e55914>] pgm_check_handler+0x16c/0x1c0 [<000000000090639e>] sysrq_handle_crash+0x46/0x58 ([<0000000000000007>] 0x7) [<00000000009073fa>] __handle_sysrq+0x102/0x218 [<0000000000907c06>] write_sysrq_trigger+0xd6/0x100 [<000000000061d67a>] proc_reg_write+0xb2/0x128 [<0000000000520be6>] __vfs_write+0xee/0x368 [<0000000000521222>] vfs_write+0x21a/0x278 [<000000000052156a>] SyS_write+0xda/0x178 [<0000000000e555cc>] system_call+0xc4/0x270 The buggy address belongs to the page: page:000003d1016929c0 count:0 mapcount:0 mapping: (null) index:0x0 flags: 0x0() raw: 0000000000000000 0000000000000000 0000000000000000 ffffffff00000000 raw: 0000000000000100 0000000000000200 0000000000000000 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: 000000005a4a7480: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 000000005a4a7500: 00 00 00 00 00 00 00 00 f2 f2 f2 f2 00 00 00 00 >000000005a4a7580: 00 00 00 00 f3 f3 f3 f3 00 00 00 00 00 00 00 00 ^ 000000005a4a7600: 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1 f8 f8 000000005a4a7680: f2 f2 f2 f2 f2 f2 f8 f8 f2 f2 f3 f3 f3 f3 00 00 ================================================================== Cc: Signed-off-by: Vasily Gorbik Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/dis.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index b811d3a8417d..3be829721cf9 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -480,7 +480,7 @@ void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; unsigned char code[64]; - char buffer[64], *ptr; + char buffer[128], *ptr; mm_segment_t old_fs; unsigned long addr; int start, end, opsize, hops, i; @@ -543,7 +543,7 @@ void show_code(struct pt_regs *regs) start += opsize; pr_cont("%s", buffer); ptr = buffer; - ptr += sprintf(ptr, "\n "); + ptr += sprintf(ptr, "\n\t "); hops++; } pr_cont("\n"); -- cgit v1.2.3 From ca5955cdeae744edd3dcc65d677e833fc29658c2 Mon Sep 17 00:00:00 2001 From: Pu Hou Date: Fri, 11 Nov 2016 03:08:49 +0100 Subject: s390/cpumf: introduce AUX buffer for dump diagnostic sample data Current implementation uses a private buffer for cpumf to dump samples. Samples first go to this buffer. Then copy to ring buffer allocated by perf core. With AUX buffer, this copy is not needed. AUX buffer is shared and zero-copy mapped to user space. The trailer information at the end of each SDB(sample data block) is also exported to user space. AUX buffer is used when diagnostic sampling mode is enabled. This patch contains functions to setup/free AUX buffer or to begin/end sampling per-cpu. Also include function called in interrupt to collect samples. Signed-off-by: Pu Hou Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_sf.c | 446 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 446 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index bd4bbf61aaf3..51dbd8d90dbe 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -77,6 +77,15 @@ struct sf_buffer { unsigned long *tail; /* last sample-data-block-table */ }; +struct aux_buffer { + struct sf_buffer sfb; + unsigned long head; /* index of SDB of buffer head */ + unsigned long alert_mark; /* index of SDB of alert request position */ + unsigned long empty_mark; /* mark of SDB not marked full */ + unsigned long *sdb_index; /* SDB address for fast lookup */ + unsigned long *sdbt_index; /* SDBT address for fast lookup */ +}; + struct cpu_hw_sf { /* CPU-measurement sampling information block */ struct hws_qsi_info_block qsi; @@ -85,6 +94,7 @@ struct cpu_hw_sf { struct sf_buffer sfb; /* Sampling buffer */ unsigned int flags; /* Status flags */ struct perf_event *event; /* Scheduled perf event */ + struct perf_output_handle handle; /* AUX buffer output handle */ }; static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); @@ -1291,6 +1301,439 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) sampl_overflow, event_overflow); } +#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) +#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0) +#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) +#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) + +/* + * Get trailer entry by index of SDB. + */ +static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, + unsigned long index) +{ + unsigned long sdb; + + index = AUX_SDB_INDEX(aux, index); + sdb = aux->sdb_index[index]; + return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); +} + +/* + * Finish sampling on the cpu. Called by cpumsf_pmu_del() with pmu + * disabled. Collect the full SDBs in AUX buffer which have not reached + * the point of alert indicator. And ignore the SDBs which are not + * full. + * + * 1. Scan SDBs to see how much data is there and consume them. + * 2. Remove alert indicator in the buffer. + */ +static void aux_output_end(struct perf_output_handle *handle) +{ + unsigned long i, range_scan, idx; + struct aux_buffer *aux; + struct hws_trailer_entry *te; + + aux = perf_get_aux(handle); + if (!aux) + return; + + range_scan = AUX_SDB_NUM_ALERT(aux); + for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + if (!(te->flags & SDB_TE_BUFFER_FULL_MASK)) + break; + } + /* i is num of SDBs which are full */ + perf_aux_output_end(handle, i << PAGE_SHIFT); + + /* Remove alert indicators in the buffer */ + te = aux_sdb_trailer(aux, aux->alert_mark); + te->flags &= ~SDB_TE_ALERT_REQ_MASK; + + debug_sprintf_event(sfdbg, 6, "aux_output_end: collect %lx SDBs\n", i); +} + +/* + * Start sampling on the CPU. Called by cpumsf_pmu_add() when an event + * is first added to the CPU or rescheduled again to the CPU. It is called + * with pmu disabled. + * + * 1. Reset the trailer of SDBs to get ready for new data. + * 2. Tell the hardware where to put the data by reset the SDBs buffer + * head(tear/dear). + */ +static int aux_output_begin(struct perf_output_handle *handle, + struct aux_buffer *aux, + struct cpu_hw_sf *cpuhw) +{ + unsigned long range; + unsigned long i, range_scan, idx; + unsigned long head, base, offset; + struct hws_trailer_entry *te; + + if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + return -EINVAL; + + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range <= 1) + return -ENOMEM; + + /* + * SDBs between aux->head and aux->empty_mark are already ready + * for new data. range_scan is num of SDBs not within them. + */ + if (range > AUX_SDB_NUM_EMPTY(aux)) { + range_scan = range - AUX_SDB_NUM_EMPTY(aux); + idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; + te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK; + te->overflow = 0; + } + /* Save the position of empty SDBs */ + aux->empty_mark = aux->head + range - 1; + } + + /* Set alert indicator */ + aux->alert_mark = aux->head + range/2 - 1; + te = aux_sdb_trailer(aux, aux->alert_mark); + te->flags = te->flags | SDB_TE_ALERT_REQ_MASK; + + /* Reset hardware buffer head */ + head = AUX_SDB_INDEX(aux, aux->head); + base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; + offset = head % CPUM_SF_SDB_PER_TABLE; + cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = aux->sdb_index[head]; + + debug_sprintf_event(sfdbg, 6, "aux_output_begin: " + "head->alert_mark->empty_mark (num_alert, range)" + "[%lx -> %lx -> %lx] (%lx, %lx) " + "tear index %lx, tear %lx dear %lx\n", + aux->head, aux->alert_mark, aux->empty_mark, + AUX_SDB_NUM_ALERT(aux), range, + head / CPUM_SF_SDB_PER_TABLE, + cpuhw->lsctl.tear, + cpuhw->lsctl.dear); + + return 0; +} + +/* + * Set alert indicator on SDB at index @alert_index while sampler is running. + * + * Return true if successfully. + * Return false if full indicator is already set by hardware sampler. + */ +static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, + unsigned long long *overflow) +{ + unsigned long long orig_overflow, orig_flags, new_flags; + struct hws_trailer_entry *te; + + te = aux_sdb_trailer(aux, alert_index); + do { + orig_flags = te->flags; + orig_overflow = te->overflow; + *overflow = orig_overflow; + if (orig_flags & SDB_TE_BUFFER_FULL_MASK) { + /* + * SDB is already set by hardware. + * Abort and try to set somewhere + * behind. + */ + return false; + } + new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK; + } while (!cmpxchg_double(&te->flags, &te->overflow, + orig_flags, orig_overflow, + new_flags, 0ULL)); + return true; +} + +/* + * aux_reset_buffer() - Scan and setup SDBs for new samples + * @aux: The AUX buffer to set + * @range: The range of SDBs to scan started from aux->head + * @overflow: Set to overflow count + * + * Set alert indicator on the SDB at index of aux->alert_mark. If this SDB is + * marked as empty, check if it is already set full by the hardware sampler. + * If yes, that means new data is already there before we can set an alert + * indicator. Caller should try to set alert indicator to some position behind. + * + * Scan the SDBs in AUX buffer from behind aux->empty_mark. They are used + * previously and have already been consumed by user space. Reset these SDBs + * (clear full indicator and alert indicator) for new data. + * If aux->alert_mark fall in this area, just set it. Overflow count is + * recorded while scanning. + * + * SDBs between aux->head and aux->empty_mark are already reset at last time. + * and ready for new samples. So scanning on this area could be skipped. + * + * Return true if alert indicator is set successfully and false if not. + */ +static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, + unsigned long long *overflow) +{ + unsigned long long orig_overflow, orig_flags, new_flags; + unsigned long i, range_scan, idx; + struct hws_trailer_entry *te; + + if (range <= AUX_SDB_NUM_EMPTY(aux)) + /* + * No need to scan. All SDBs in range are marked as empty. + * Just set alert indicator. Should check race with hardware + * sampler. + */ + return aux_set_alert(aux, aux->alert_mark, overflow); + + if (aux->alert_mark <= aux->empty_mark) + /* + * Set alert indicator on empty SDB. Should check race + * with hardware sampler. + */ + if (!aux_set_alert(aux, aux->alert_mark, overflow)) + return false; + + /* + * Scan the SDBs to clear full and alert indicator used previously. + * Start scanning from one SDB behind empty_mark. If the new alert + * indicator fall into this range, set it. + */ + range_scan = range - AUX_SDB_NUM_EMPTY(aux); + idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + do { + orig_flags = te->flags; + orig_overflow = te->overflow; + new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK; + if (idx == aux->alert_mark) + new_flags |= SDB_TE_ALERT_REQ_MASK; + else + new_flags &= ~SDB_TE_ALERT_REQ_MASK; + } while (!cmpxchg_double(&te->flags, &te->overflow, + orig_flags, orig_overflow, + new_flags, 0ULL)); + *overflow += orig_overflow; + } + + /* Update empty_mark to new position */ + aux->empty_mark = aux->head + range - 1; + + return true; +} + +/* + * Measurement alert handler for diagnostic mode sampling. + */ +static void hw_collect_aux(struct cpu_hw_sf *cpuhw) +{ + struct aux_buffer *aux; + int done = 0; + unsigned long range = 0, size; + unsigned long long overflow = 0; + struct perf_output_handle *handle = &cpuhw->handle; + unsigned long num_sdb; + + aux = perf_get_aux(handle); + if (WARN_ON_ONCE(!aux)) + return; + + /* Inform user space new data arrived */ + size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + perf_aux_output_end(handle, size); + num_sdb = aux->sfb.num_sdb; + + while (!done) { + /* Get an output handle */ + aux = perf_aux_output_begin(handle, cpuhw->event); + if (handle->size == 0) { + pr_err("The AUX buffer with %lu pages for the " + "diagnostic-sampling mode is full\n", + num_sdb); + debug_sprintf_event(sfdbg, 1, "AUX buffer used up\n"); + break; + } + if (WARN_ON_ONCE(!aux)) + return; + + /* Update head and alert_mark to new position */ + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range == 1) + aux->alert_mark = aux->head; + else + aux->alert_mark = aux->head + range/2 - 1; + + if (aux_reset_buffer(aux, range, &overflow)) { + if (!overflow) { + done = 1; + break; + } + size = range << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + pr_err("Sample data caused the AUX buffer with %lu " + "pages to overflow\n", num_sdb); + debug_sprintf_event(sfdbg, 1, "head %lx range %lx " + "overflow %llx\n", + aux->head, range, overflow); + } else { + size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + debug_sprintf_event(sfdbg, 6, "head %lx alert %lx " + "already full, try another\n", + aux->head, aux->alert_mark); + } + } + + if (done) + debug_sprintf_event(sfdbg, 6, "aux_reset_buffer: " + "[%lx -> %lx -> %lx] (%lx, %lx)\n", + aux->head, aux->alert_mark, aux->empty_mark, + AUX_SDB_NUM_ALERT(aux), range); +} + +/* + * Callback when freeing AUX buffers. + */ +static void aux_buffer_free(void *data) +{ + struct aux_buffer *aux = data; + unsigned long i, num_sdbt; + + if (!aux) + return; + + /* Free SDBT. SDB is freed by the caller */ + num_sdbt = aux->sfb.num_sdbt; + for (i = 0; i < num_sdbt; i++) + free_page(aux->sdbt_index[i]); + + kfree(aux->sdbt_index); + kfree(aux->sdb_index); + kfree(aux); + + debug_sprintf_event(sfdbg, 4, "aux_buffer_free: free " + "%lu SDBTs\n", num_sdbt); +} + +/* + * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling + * @cpu: On which to allocate, -1 means current + * @pages: Array of pointers to buffer pages passed from perf core + * @nr_pages: Total pages + * @snapshot: Flag for snapshot mode + * + * This is the callback when setup an event using AUX buffer. Perf tool can + * trigger this by an additional mmap() call on the event. Unlike the buffer + * for basic samples, AUX buffer belongs to the event. It is scheduled with + * the task among online cpus when it is a per-thread event. + * + * Return the private AUX buffer structure if success or NULL if fails. + */ +static void *aux_buffer_setup(int cpu, void **pages, int nr_pages, + bool snapshot) +{ + struct sf_buffer *sfb; + struct aux_buffer *aux; + unsigned long *new, *tail; + int i, n_sdbt; + + if (!nr_pages || !pages) + return NULL; + + if (nr_pages > CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is larger than the " + "maximum sampling buffer limit\n", + nr_pages); + return NULL; + } else if (nr_pages < CPUM_SF_MIN_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is less than the " + "minimum sampling buffer limit\n", + nr_pages); + return NULL; + } + + /* Allocate aux_buffer struct for the event */ + aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL); + if (!aux) + goto no_aux; + sfb = &aux->sfb; + + /* Allocate sdbt_index for fast reference */ + n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE; + aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL); + if (!aux->sdbt_index) + goto no_sdbt_index; + + /* Allocate sdb_index for fast reference */ + aux->sdb_index = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + if (!aux->sdb_index) + goto no_sdb_index; + + /* Allocate the first SDBT */ + sfb->num_sdbt = 0; + sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!sfb->sdbt) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; + tail = sfb->tail = sfb->sdbt; + + /* + * Link the provided pages of AUX buffer to SDBT. + * Allocate SDBT if needed. + */ + for (i = 0; i < nr_pages; i++, tail++) { + if (require_table_link(tail)) { + new = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!new) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; + /* Link current page to tail of chain */ + *tail = (unsigned long)(void *) new + 1; + tail = new; + } + /* Tail is the entry in a SDBT */ + *tail = (unsigned long)pages[i]; + aux->sdb_index[i] = (unsigned long)pages[i]; + } + sfb->num_sdb = nr_pages; + + /* Link the last entry in the SDBT to the first SDBT */ + *tail = (unsigned long) sfb->sdbt + 1; + sfb->tail = tail; + + /* + * Initial all SDBs are zeroed. Mark it as empty. + * So there is no need to clear the full indicator + * when this event is first added. + */ + aux->empty_mark = sfb->num_sdb - 1; + + debug_sprintf_event(sfdbg, 4, "aux_buffer_setup: setup %lu SDBTs" + " and %lu SDBs\n", + sfb->num_sdbt, sfb->num_sdb); + + return aux; + +no_sdbt: + /* SDBs (AUX buffer pages) are freed by caller */ + for (i = 0; i < sfb->num_sdbt; i++) + free_page(aux->sdbt_index[i]); + kfree(aux->sdb_index); +no_sdb_index: + kfree(aux->sdbt_index); +no_sdbt_index: + kfree(aux); +no_aux: + return NULL; +} + static void cpumsf_pmu_read(struct perf_event *event) { /* Nothing to do ... updates are interrupt-driven */ @@ -1448,6 +1891,9 @@ static struct pmu cpumf_sampling = { .read = cpumsf_pmu_read, .attr_groups = cpumsf_pmu_attr_groups, + + .setup_aux = aux_buffer_setup, + .free_aux = aux_buffer_free, }; static void cpumf_measurement_alert(struct ext_code ext_code, -- cgit v1.2.3 From cbf6948f36afcbeca61ed5c3d5e7d930567a200f Mon Sep 17 00:00:00 2001 From: Pu Hou Date: Fri, 11 Nov 2016 04:23:15 +0100 Subject: s390/cpumf: enable using AUX buffer Modify PMU callback to use AUX buffer for diagnostic mode sampling. Basic-mode sampling still use orignal way. Signed-off-by: Pu Hou Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_sf.c | 55 ++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 11 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 51dbd8d90dbe..b9248a70b232 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -780,6 +780,10 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->extra_reg.reg = REG_OVERFLOW; OVERFLOW_REG(hwc) = 0; + /* Use AUX buffer. No need to allocate it by ourself */ + if (attr->config == PERF_EVENT_CPUM_SF_DIAG) + return 0; + /* Allocate the per-CPU sampling buffer using the CPU information * from the event. If the event is not pinned to a particular * CPU (event->cpu == -1; or cpuhw == NULL), allocate sampling @@ -876,10 +880,15 @@ static void cpumsf_pmu_enable(struct pmu *pmu) */ if (cpuhw->event) { hwc = &cpuhw->event->hw; - /* Account number of overflow-designated buffer extents */ - sfb_account_overflows(cpuhw, hwc); - if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) - extend_sampling_buffer(&cpuhw->sfb, hwc); + if (!(SAMPL_DIAG_MODE(hwc))) { + /* + * Account number of overflow-designated + * buffer extents + */ + sfb_account_overflows(cpuhw, hwc); + if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) + extend_sampling_buffer(&cpuhw->sfb, hwc); + } } /* (Re)enable the PMU and sampling facility */ @@ -1225,6 +1234,13 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags; int done; + /* + * AUX buffer is used when in diagnostic sampling mode. + * No perf events/samples are created. + */ + if (SAMPL_DIAG_MODE(&event->hw)) + return; + if (flush_all && SDB_FULL_BLOCKS(hwc)) flush_all = 0; @@ -1785,12 +1801,13 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags) static int cpumsf_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct aux_buffer *aux; int err; if (cpuhw->flags & PMU_F_IN_USE) return -EAGAIN; - if (!cpuhw->sfb.sdbt) + if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) return -EINVAL; err = 0; @@ -1805,10 +1822,12 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) */ cpuhw->lsctl.s = 0; cpuhw->lsctl.h = 1; - cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; - cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); - hw_reset_registers(&event->hw, cpuhw->sfb.sdbt); + if (!SAMPL_DIAG_MODE(&event->hw)) { + cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; + hw_reset_registers(&event->hw, cpuhw->sfb.sdbt); + } /* Ensure sampling functions are in the disabled state. If disabled, * switch on sampling enable control. */ @@ -1816,9 +1835,18 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) err = -EAGAIN; goto out; } - cpuhw->lsctl.es = 1; - if (SAMPL_DIAG_MODE(&event->hw)) + if (SAMPL_DIAG_MODE(&event->hw)) { + aux = perf_aux_output_begin(&cpuhw->handle, event); + if (!aux) { + err = -EINVAL; + goto out; + } + err = aux_output_begin(&cpuhw->handle, aux, cpuhw); + if (err) + goto out; cpuhw->lsctl.ed = 1; + } + cpuhw->lsctl.es = 1; /* Set in_use flag and store event */ cpuhw->event = event; @@ -1844,6 +1872,8 @@ static void cpumsf_pmu_del(struct perf_event *event, int flags) cpuhw->flags &= ~PMU_F_IN_USE; cpuhw->event = NULL; + if (SAMPL_DIAG_MODE(&event->hw)) + aux_output_end(&cpuhw->handle); perf_event_update_userpage(event); perf_pmu_enable(event->pmu); } @@ -1917,7 +1947,10 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Program alert request */ if (alert & CPU_MF_INT_SF_PRA) { if (cpuhw->flags & PMU_F_IN_USE) - hw_perf_event_update(cpuhw->event, 0); + if (SAMPL_DIAG_MODE(&cpuhw->event->hw)) + hw_collect_aux(cpuhw); + else + hw_perf_event_update(cpuhw->event, 0); else WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE)); } -- cgit v1.2.3 From 3d43b981eb841a9493717e6d509f59553dbe8c7a Mon Sep 17 00:00:00 2001 From: Pu Hou Date: Fri, 19 May 2017 11:16:55 +0200 Subject: s390/cpumf: remove raw event support in basic-only sampling mode Raw sample was implemented to export the diagnostic samples. With having this achieved with AUX buffers, there is no requirement for basic samples to export raw data. In particular, most basic sampling information are consumed for creating the perf event sample. Signed-off-by: Pu Hou Reviewed-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/perf_event.h | 17 ---- arch/s390/kernel/perf_cpum_sf.c | 185 ++++++------------------------------- 2 files changed, 27 insertions(+), 175 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h index 79aa6421fedb..d6c9d1e0dc2d 100644 --- a/arch/s390/include/asm/perf_event.h +++ b/arch/s390/include/asm/perf_event.h @@ -64,27 +64,10 @@ struct perf_sf_sde_regs { #define REG_OVERFLOW 1 #define OVERFLOW_REG(hwc) ((hwc)->extra_reg.config) #define SFB_ALLOC_REG(hwc) ((hwc)->extra_reg.alloc) -#define RAWSAMPLE_REG(hwc) ((hwc)->config) #define TEAR_REG(hwc) ((hwc)->last_tag) #define SAMPL_RATE(hwc) ((hwc)->event_base) #define SAMPL_FLAGS(hwc) ((hwc)->config_base) #define SAMPL_DIAG_MODE(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_DIAG_MODE) #define SDB_FULL_BLOCKS(hwc) (SAMPL_FLAGS(hwc) & PERF_CPUM_SF_FULL_BLOCKS) -/* Structure for sampling data entries to be passed as perf raw sample data - * to user space. Note that raw sample data must be aligned and, thus, might - * be padded with zeros. - */ -struct sf_raw_sample { -#define SF_RAW_SAMPLE_BASIC PERF_CPUM_SF_BASIC_MODE -#define SF_RAW_SAMPLE_DIAG PERF_CPUM_SF_DIAG_MODE - u64 format; - u32 size; /* Size of sf_raw_sample */ - u16 bsdes; /* Basic-sampling data entry size */ - u16 dsdes; /* Diagnostic-sampling data entry size */ - struct hws_basic_entry basic; /* Basic-sampling data entry */ - struct hws_diag_entry diag; /* Diagnostic-sampling data entry */ - u8 padding[]; /* Padding to next multiple of 8 */ -} __packed; - #endif /* _ASM_S390_PERF_EVENT_H */ diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index b9248a70b232..4d8ddd8bd9be 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -351,22 +351,6 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc) sfb_account_allocs(num, hwc); } -static size_t event_sample_size(struct hw_perf_event *hwc) -{ - struct sf_raw_sample *sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(hwc); - size_t sample_size; - - /* The sample size depends on the sampling function: The basic-sampling - * function must be always enabled, diagnostic-sampling function is - * optional. - */ - sample_size = sfr->bsdes; - if (SAMPL_DIAG_MODE(hwc)) - sample_size += sfr->dsdes; - - return sample_size; -} - static void deallocate_buffers(struct cpu_hw_sf *cpuhw) { if (cpuhw->sfb.sdbt) @@ -376,35 +360,7 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw) static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { unsigned long n_sdb, freq, factor; - size_t sfr_size, sample_size; - struct sf_raw_sample *sfr; - - /* Allocate raw sample buffer - * - * The raw sample buffer is used to temporarily store sampling data - * entries for perf raw sample processing. The buffer size mainly - * depends on the size of diagnostic-sampling data entries which is - * machine-specific. The exact size calculation includes: - * 1. The first 4 bytes of diagnostic-sampling data entries are - * already reflected in the sf_raw_sample structure. Subtract - * these bytes. - * 2. The perf raw sample data must be 8-byte aligned (u64) and - * perf's internal data size must be considered too. So add - * an additional u32 for correct alignment and subtract before - * allocating the buffer. - * 3. Store the raw sample buffer pointer in the perf event - * hardware structure. - */ - sfr_size = ALIGN((sizeof(*sfr) - sizeof(sfr->diag) + cpuhw->qsi.dsdes) + - sizeof(u32), sizeof(u64)); - sfr_size -= sizeof(u32); - sfr = kzalloc(sfr_size, GFP_KERNEL); - if (!sfr) - return -ENOMEM; - sfr->size = sfr_size; - sfr->bsdes = cpuhw->qsi.bsdes; - sfr->dsdes = cpuhw->qsi.dsdes; - RAWSAMPLE_REG(hwc) = (unsigned long) sfr; + size_t sample_size; /* Calculate sampling buffers using 4K pages * @@ -430,7 +386,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up * to 511 SDBs). */ - sample_size = event_sample_size(hwc); + sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); factor = 1; n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size)); @@ -629,10 +585,6 @@ static int reserve_pmc_hardware(void) static void hw_perf_event_destroy(struct perf_event *event) { - /* Free raw sample buffer */ - if (RAWSAMPLE_REG(&event->hw)) - kfree((void *) RAWSAMPLE_REG(&event->hw)); - /* Release PMC if this is the last perf event */ if (!atomic_add_unless(&num_events, -1, 1)) { mutex_lock(&pmc_reserve_mutex); @@ -652,15 +604,8 @@ static void hw_init_period(struct hw_perf_event *hwc, u64 period) static void hw_reset_registers(struct hw_perf_event *hwc, unsigned long *sdbt_origin) { - struct sf_raw_sample *sfr; - /* (Re)set to first sample-data-block-table */ TEAR_REG(hwc) = (unsigned long) sdbt_origin; - - /* (Re)set raw sampling buffer register */ - sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(hwc); - memset(&sfr->basic, 0, sizeof(sfr->basic)); - memset(&sfr->diag, 0, sfr->dsdes); } static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si, @@ -986,22 +931,16 @@ static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, * * Return non-zero if an event overflow occurred. */ -static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) +static int perf_push_sample(struct perf_event *event, + struct hws_basic_entry *basic) { int overflow; struct pt_regs regs; struct perf_sf_sde_regs *sde_regs; struct perf_sample_data data; - struct perf_raw_record raw = { - .frag = { - .size = sfr->size, - .data = sfr, - }, - }; /* Setup perf sample */ perf_sample_data_init(&data, 0, event->hw.last_period); - data.raw = &raw; /* Setup pt_regs to look like an CPU-measurement external interrupt * using the Program Request Alert code. The regs.int_parm_long @@ -1013,11 +952,11 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) regs.int_parm = CPU_MF_INT_SF_PRA; sde_regs = (struct perf_sf_sde_regs *) ®s.int_parm_long; - psw_bits(regs.psw).ia = sfr->basic.ia; - psw_bits(regs.psw).dat = sfr->basic.T; - psw_bits(regs.psw).wait = sfr->basic.W; - psw_bits(regs.psw).pstate = sfr->basic.P; - psw_bits(regs.psw).as = sfr->basic.AS; + psw_bits(regs.psw).ia = basic->ia; + psw_bits(regs.psw).dat = basic->T; + psw_bits(regs.psw).wait = basic->W; + psw_bits(regs.psw).pstate = basic->P; + psw_bits(regs.psw).as = basic->AS; /* * Use the hardware provided configuration level to decide if the @@ -1030,7 +969,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) * If the value differs from 0xffff (the host value), we assume to * be a KVM guest. */ - switch (sfr->basic.CL) { + switch (basic->CL) { case 1: /* logical partition */ sde_regs->in_guest = 0; break; @@ -1038,7 +977,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) sde_regs->in_guest = 1; break; default: /* old machine, use heuristics */ - if (sfr->basic.gpp || sfr->basic.prim_asn != 0xffff) + if (basic->gpp || basic->prim_asn != 0xffff) sde_regs->in_guest = 1; break; } @@ -1060,75 +999,12 @@ static void perf_event_count_update(struct perf_event *event, u64 count) local64_add(count, &event->count); } -static int sample_format_is_valid(struct hws_combined_entry *sample, - unsigned int flags) -{ - if (likely(flags & PERF_CPUM_SF_BASIC_MODE)) - /* Only basic-sampling data entries with data-entry-format - * version of 0x0001 can be processed. - */ - if (sample->basic.def != 0x0001) - return 0; - if (flags & PERF_CPUM_SF_DIAG_MODE) - /* The data-entry-format number of diagnostic-sampling data - * entries can vary. Because diagnostic data is just passed - * through, do only a sanity check on the DEF. - */ - if (sample->diag.def < 0x8001) - return 0; - return 1; -} - -static int sample_is_consistent(struct hws_combined_entry *sample, - unsigned long flags) -{ - /* This check applies only to basic-sampling data entries of potentially - * combined-sampling data entries. Invalid entries cannot be processed - * by the PMU and, thus, do not deliver an associated - * diagnostic-sampling data entry. - */ - if (unlikely(!(flags & PERF_CPUM_SF_BASIC_MODE))) - return 0; - /* - * Samples are skipped, if they are invalid or for which the - * instruction address is not predictable, i.e., the wait-state bit is - * set. - */ - if (sample->basic.I || sample->basic.W) - return 0; - return 1; -} - -static void reset_sample_slot(struct hws_combined_entry *sample, - unsigned long flags) -{ - if (likely(flags & PERF_CPUM_SF_BASIC_MODE)) - sample->basic.def = 0; - if (flags & PERF_CPUM_SF_DIAG_MODE) - sample->diag.def = 0; -} - -static void sfr_store_sample(struct sf_raw_sample *sfr, - struct hws_combined_entry *sample) -{ - if (likely(sfr->format & PERF_CPUM_SF_BASIC_MODE)) - sfr->basic = sample->basic; - if (sfr->format & PERF_CPUM_SF_DIAG_MODE) - memcpy(&sfr->diag, &sample->diag, sfr->dsdes); -} - -static void debug_sample_entry(struct hws_combined_entry *sample, - struct hws_trailer_entry *te, - unsigned long flags) +static void debug_sample_entry(struct hws_basic_entry *sample, + struct hws_trailer_entry *te) { debug_sprintf_event(sfdbg, 4, "hw_collect_samples: Found unknown " - "sampling data entry: te->f=%i basic.def=%04x (%p)" - " diag.def=%04x (%p)\n", te->f, - sample->basic.def, &sample->basic, - (flags & PERF_CPUM_SF_DIAG_MODE) - ? sample->diag.def : 0xFFFF, - (flags & PERF_CPUM_SF_DIAG_MODE) - ? &sample->diag : NULL); + "sampling data entry: te->f=%i basic.def=%04x (%p)\n", + te->f, sample->def, sample); } /* hw_collect_samples() - Walk through a sample-data-block and collect samples @@ -1154,44 +1030,37 @@ static void debug_sample_entry(struct hws_combined_entry *sample, static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, unsigned long long *overflow) { - unsigned long flags = SAMPL_FLAGS(&event->hw); - struct hws_combined_entry *sample; struct hws_trailer_entry *te; - struct sf_raw_sample *sfr; - size_t sample_size; - - /* Prepare and initialize raw sample data */ - sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(&event->hw); - sfr->format = flags & PERF_CPUM_SF_MODE_MASK; + struct hws_basic_entry *sample; - sample_size = event_sample_size(&event->hw); te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); - sample = (struct hws_combined_entry *) *sdbt; + sample = (struct hws_basic_entry *) *sdbt; while ((unsigned long *) sample < (unsigned long *) te) { /* Check for an empty sample */ - if (!sample->basic.def) + if (!sample->def) break; /* Update perf event period */ perf_event_count_update(event, SAMPL_RATE(&event->hw)); - /* Check sampling data entry */ - if (sample_format_is_valid(sample, flags)) { + /* Check whether sample is valid */ + if (sample->def == 0x0001) { /* If an event overflow occurred, the PMU is stopped to * throttle event delivery. Remaining sample data is * discarded. */ if (!*overflow) { - if (sample_is_consistent(sample, flags)) { + /* Check whether sample is consistent */ + if (sample->I == 0 && sample->W == 0) { /* Deliver sample data to perf */ - sfr_store_sample(sfr, sample); - *overflow = perf_push_sample(event, sfr); + *overflow = perf_push_sample(event, + sample); } } else /* Count discarded samples */ *overflow += 1; } else { - debug_sample_entry(sample, te, flags); + debug_sample_entry(sample, te); /* Sample slot is not yet written or other record. * * This condition can occur if the buffer was reused @@ -1207,8 +1076,8 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, } /* Reset sample slot and advance to next sample */ - reset_sample_slot(sample, flags); - sample += sample_size; + sample->def = 0; + sample++; } } -- cgit v1.2.3 From 9232c3c741200167e44ae9d0e434092657ab4534 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Wed, 31 May 2017 15:22:19 +0200 Subject: s390/cpum_sf: do not register PMU if no sampling mode is authorized Previously, the cpum_sf PMU was registered even if there is no sampling mode authorized. Add a check and register cpum_sf only at least one sampling mode is authorized. Signed-off-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/perf_cpum_sf.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 4d8ddd8bd9be..7e9b9e6ee821 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1938,6 +1938,9 @@ static int __init init_cpum_sampling_pmu(void) return -ENODEV; } + if (!si.as && !si.ad) + return -ENODEV; + if (si.bsdes != sizeof(struct hws_basic_entry)) { pr_cpumsf_err(RS_INIT_FAILURE_BSDES); return -EINVAL; -- cgit v1.2.3 From c33eff600584ed493adfb42e3f130a6335f97750 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sat, 6 Jun 2015 12:44:25 +0200 Subject: s390/perf: add perf_regs support and user stack dump Add s390 support to dump user stack to user space for DWARF stack unwinding. Signed-off-by: Heiko Carstens Reviewed-by: Hendrik Brueckner Reviewed-and-tested-by: Thomas Richter Signed-off-by: Martin Schwidefsky --- arch/s390/Kconfig | 2 ++ arch/s390/include/uapi/asm/perf_regs.h | 27 +++++++++++++++++++ arch/s390/kernel/Makefile | 2 +- arch/s390/kernel/perf_regs.c | 49 ++++++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 arch/s390/include/uapi/asm/perf_regs.h create mode 100644 arch/s390/kernel/perf_regs.c (limited to 'arch/s390') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 84767046daff..829c67986db7 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -159,6 +159,8 @@ config S390 select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MEMBLOCK_PHYS_MAP diff --git a/arch/s390/include/uapi/asm/perf_regs.h b/arch/s390/include/uapi/asm/perf_regs.h new file mode 100644 index 000000000000..f84ea6a181d3 --- /dev/null +++ b/arch/s390/include/uapi/asm/perf_regs.h @@ -0,0 +1,27 @@ +#ifndef _ASM_S390_PERF_REGS_H +#define _ASM_S390_PERF_REGS_H + +enum perf_event_s390_regs { + PERF_REG_S390_R0, + PERF_REG_S390_R1, + PERF_REG_S390_R2, + PERF_REG_S390_R3, + PERF_REG_S390_R4, + PERF_REG_S390_R5, + PERF_REG_S390_R6, + PERF_REG_S390_R7, + PERF_REG_S390_R8, + PERF_REG_S390_R9, + PERF_REG_S390_R10, + PERF_REG_S390_R11, + PERF_REG_S390_R12, + PERF_REG_S390_R13, + PERF_REG_S390_R14, + PERF_REG_S390_R15, + PERF_REG_S390_MASK, + PERF_REG_S390_PC, + + PERF_REG_S390_MAX +}; + +#endif /* _ASM_S390_PERF_REGS_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 0319f4e81ea4..909bce65cb2b 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -79,7 +79,7 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf.o perf_cpum_sf.o -obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o +obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o obj-$(CONFIG_TRACEPOINTS) += trace.o diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c new file mode 100644 index 000000000000..e883e6a2146a --- /dev/null +++ b/arch/s390/kernel/perf_regs.c @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include +#include + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + if (WARN_ON_ONCE((u32)idx >= PERF_REG_S390_MAX)) + return 0; + + if (idx == PERF_REG_S390_MASK) + return regs->psw.mask; + if (idx == PERF_REG_S390_PC) + return regs->psw.addr; + + return regs->gprs[idx]; +} + +#define REG_RESERVED (~((1UL << PERF_REG_S390_MAX) - 1)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_31BIT)) + return PERF_SAMPLE_REGS_ABI_32; + + return PERF_SAMPLE_REGS_ABI_64; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + /* + * Use the regs from the first interruption and let + * perf_sample_regs_intr() handle interrupts (regs == get_irq_regs()). + */ + regs_user->regs = task_pt_regs(current); + regs_user->abi = perf_reg_abi(current); +} -- cgit v1.2.3 From 0da0017f72554c005c1a04c3adc5da9eb64fa7e5 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Wed, 8 Nov 2017 07:30:15 +0100 Subject: s390/perf: extend perf_regs support to include floating-point registers Extend the perf register support to also export floating-point register contents for user space tasks. Floating-point registers might be used in leaf functions to contain the return address. Hence, they are required for proper DWARF unwinding. Signed-off-by: Hendrik Brueckner Reviewed-and-tested-by: Thomas Richter Signed-off-by: Martin Schwidefsky --- arch/s390/include/uapi/asm/perf_regs.h | 16 ++++++++++++++++ arch/s390/kernel/perf_regs.c | 21 +++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/include/uapi/asm/perf_regs.h b/arch/s390/include/uapi/asm/perf_regs.h index f84ea6a181d3..7c8564f98205 100644 --- a/arch/s390/include/uapi/asm/perf_regs.h +++ b/arch/s390/include/uapi/asm/perf_regs.h @@ -18,6 +18,22 @@ enum perf_event_s390_regs { PERF_REG_S390_R13, PERF_REG_S390_R14, PERF_REG_S390_R15, + PERF_REG_S390_FP0, + PERF_REG_S390_FP1, + PERF_REG_S390_FP2, + PERF_REG_S390_FP3, + PERF_REG_S390_FP4, + PERF_REG_S390_FP5, + PERF_REG_S390_FP6, + PERF_REG_S390_FP7, + PERF_REG_S390_FP8, + PERF_REG_S390_FP9, + PERF_REG_S390_FP10, + PERF_REG_S390_FP11, + PERF_REG_S390_FP12, + PERF_REG_S390_FP13, + PERF_REG_S390_FP14, + PERF_REG_S390_FP15, PERF_REG_S390_MASK, PERF_REG_S390_PC, diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index e883e6a2146a..f8603ebed669 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -4,12 +4,29 @@ #include #include #include +#include +#include u64 perf_reg_value(struct pt_regs *regs, int idx) { + freg_t fp; + if (WARN_ON_ONCE((u32)idx >= PERF_REG_S390_MAX)) return 0; + if (idx >= PERF_REG_S390_R0 && idx <= PERF_REG_S390_R15) + return regs->gprs[idx]; + + if (idx >= PERF_REG_S390_FP0 && idx <= PERF_REG_S390_FP15) { + if (!user_mode(regs)) + return 0; + + idx -= PERF_REG_S390_FP0; + fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) + : current->thread.fpu.fprs[idx]; + return fp.ui; + } + if (idx == PERF_REG_S390_MASK) return regs->psw.mask; if (idx == PERF_REG_S390_PC) @@ -43,7 +60,11 @@ void perf_get_regs_user(struct perf_regs *regs_user, /* * Use the regs from the first interruption and let * perf_sample_regs_intr() handle interrupts (regs == get_irq_regs()). + * + * Also save FPU registers for user-space tasks only. */ regs_user->regs = task_pt_regs(current); + if (user_mode(regs_user->regs)) + save_fpu_regs(); regs_user->abi = perf_reg_abi(current); } -- cgit v1.2.3 From d4c7e649d7bf17792629dbeaf25945e26a32894f Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Fri, 27 Oct 2017 15:45:19 +0200 Subject: s390/cpum_sf: load program parameter at sampler enablement The lpp instruction is used to place the PID of the current task in the program-parameter (PP) register. The register contents is then included in the sampling data entries. The lpp instruction loads the PP register only when at least one sampling function is enabled. Otherwise it is executed as a no-op. Linux calls lpp at context switch. If the context switch happens before the sampler is enabled, the PP register is empty. That means, the PID of the task that is sampled is not stored in sampling data until the next context switch. Hence, always call lpp when enabling the sampler. Signed-off-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cpu_mf.h | 6 ++++++ arch/s390/kernel/perf_cpum_sf.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 7364130a29c8..792cda339af1 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -144,6 +144,12 @@ struct hws_trailer_entry { unsigned long long progusage2; /* */ } __packed; +/* Load program parameter */ +static inline void lpp(void *pp) +{ + asm volatile(".insn s,0xb2800000,0(%0)\n":: "a" (pp) : "memory"); +} + /* Query counter information */ static inline int qctri(struct cpumf_ctr_info *info) { diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 7e9b9e6ee821..dbb62c05805d 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -848,6 +848,9 @@ static void cpumsf_pmu_enable(struct pmu *pmu) return; } + /* Load current program parameter */ + lpp(&S390_lowcore.lpp); + debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i " "tear=%p dear=%p\n", cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, cpuhw->lsctl.cd, -- cgit v1.2.3 From 544e8dd7a8e49d22b4315fc232479bc02b417b46 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Tue, 8 Mar 2016 14:00:23 +0100 Subject: s390/cpum_sf: correctly set the PID and TID in perf samples The hardware sampler creates samples that are processed at a later point in time. The PID and TID values of the perf samples that are created for hardware samples are initialized with values from the current task. Hence, the PID and TID values are not correct and perf samples are associated with wrong processes. The PID and TID values are obtained from the Host Program Parameter (HPP) field in the basic-sampling data entries. These PIDs are valid in the init PID namespace. Ensure that the PIDs in the perf samples are resolved considering the PID namespace in which the perf event was created. To correct the PID and TID values in the created perf samples, a special overflow handler is installed. It replaces the default overflow handler and does not become effective if any other overflow handler is used. With the special overflow handler most of the perf samples are associated with the right processes. For processes, that are no longer exist, the association might still be wrong. Signed-off-by: Hendrik Brueckner Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/setup.h | 2 +- arch/s390/kernel/perf_cpum_sf.c | 76 +++++++++++++++++++++++++++++++++++++++++ arch/s390/mm/fault.c | 2 +- 3 files changed, 78 insertions(+), 2 deletions(-) (limited to 'arch/s390') diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index 8bc87dcb10eb..2eb0c8a7b664 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -36,7 +36,7 @@ #define MACHINE_FLAG_SCC _BITUL(17) #define LPP_MAGIC _BITUL(31) -#define LPP_PFAULT_PID_MASK _AC(0xffffffff, UL) +#define LPP_PID_MASK _AC(0xffffffff, UL) #ifndef __ASSEMBLY__ diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index dbb62c05805d..227b38bd82c9 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -615,6 +616,67 @@ static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si, si->min_sampl_rate, si->max_sampl_rate); } +static u32 cpumsf_pid_type(struct perf_event *event, + u32 pid, enum pid_type type) +{ + struct task_struct *tsk; + + /* Idle process */ + if (!pid) + goto out; + + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + pid = -1; + if (tsk) { + /* + * Only top level events contain the pid namespace in which + * they are created. + */ + if (event->parent) + event = event->parent; + pid = __task_pid_nr_ns(tsk, type, event->ns); + /* + * See also 1d953111b648 + * "perf/core: Don't report zero PIDs for exiting tasks". + */ + if (!pid && !pid_alive(tsk)) + pid = -1; + } +out: + return pid; +} + +static void cpumsf_output_event_pid(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + u32 pid; + struct perf_event_header header; + struct perf_output_handle handle; + + /* + * Obtain the PID from the basic-sampling data entry and + * correct the data->tid_entry.pid value. + */ + pid = data->tid_entry.pid; + + /* Protect callchain buffers, tasks */ + rcu_read_lock(); + + perf_prepare_sample(&header, data, event, regs); + if (perf_output_begin(&handle, event, header.size)) + goto out; + + /* Update the process ID (see also kernel/events/core.c) */ + data->tid_entry.pid = cpumsf_pid_type(event, pid, __PIDTYPE_TGID); + data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID); + + perf_output_sample(&handle, &header, data, event); + perf_output_end(&handle); +out: + rcu_read_unlock(); +} + static int __hw_perf_event_init(struct perf_event *event) { struct cpu_hw_sf *cpuhw; @@ -748,6 +810,14 @@ static int __hw_perf_event_init(struct perf_event *event) break; } } + + /* If PID/TID sampling is active, replace the default overflow + * handler to extract and resolve the PIDs from the basic-sampling + * data entries. + */ + if (event->attr.sample_type & PERF_SAMPLE_TID) + if (is_default_overflow_handler(event)) + event->overflow_handler = cpumsf_output_event_pid; out: return err; } @@ -985,6 +1055,12 @@ static int perf_push_sample(struct perf_event *event, break; } + /* + * Store the PID value from the sample-data-entry to be + * processed and resolved by cpumsf_output_event_pid(). + */ + data.tid_entry.pid = basic->hpp & LPP_PID_MASK; + overflow = 0; if (perf_exclude_event(event, ®s, sde_regs)) goto out; diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 14654007dce4..93faeca52284 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -728,7 +728,7 @@ static void pfault_interrupt(struct ext_code ext_code, return; inc_irq_stat(IRQEXT_PFL); /* Get the token (= pid of the affected task). */ - pid = param64 & LPP_PFAULT_PID_MASK; + pid = param64 & LPP_PID_MASK; rcu_read_lock(); tsk = find_task_by_pid_ns(pid, &init_pid_ns); if (tsk) -- cgit v1.2.3 From ab35727eb879ccc9487c6df0a3796be124ed39d3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 16 Nov 2017 13:49:50 +0100 Subject: s390: remove unused parameter from Makefile Remove unused parameter from the call function, which I accidentally added. Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky --- arch/s390/tools/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390') diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 2ebf2872cc16..2e70e25de07a 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -21,4 +21,4 @@ include/generated/facilities.h: $(obj)/gen_facilities FORCE $(call filechk,facilities.h) include/generated/dis.h: $(obj)/gen_opcode_table FORCE - $(call filechk,dis.h,__FUN) + $(call filechk,dis.h) -- cgit v1.2.3