diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-10 10:46:14 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-07-10 10:46:14 -0700 |
commit | e98e03d075537a14928661ebfbfcde34b0eced1a (patch) | |
tree | bfaa3607c7f3623c7c487eddb9be1e14cd2f322e /arch/s390/kernel | |
parent | 379cf80a9861e4356792185bc3fcdd7d4133f2f7 (diff) | |
parent | 6a942f5780545ebd11aca8b3ac4b163397962322 (diff) | |
download | linux-e98e03d075537a14928661ebfbfcde34b0eced1a.tar.bz2 |
Merge tag 's390-5.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Pull more s390 updates from Vasily Gorbik:
- Fix preempt_count initialization.
- Rework call_on_stack() macro to add proper type handling and avoid
possible register corruption.
- More error prone "register asm" removal and fixes.
- Fix syscall restarting when multiple signals are coming in. This adds
minimalistic trampolines to vdso so we can return from signal without
using the stack which requires pgm check handler hacks when NX is
enabled.
- Remove HAVE_IRQ_EXIT_ON_IRQ_STACK since this is no longer true after
switch to generic entry.
- Fix protected virtualization secure storage access exception
handling.
- Make machine check C handler always enter with DAT enabled and move
register validation to C code.
- Fix tinyconfig boot problem by avoiding MONITOR CALL without
CONFIG_BUG.
- Increase asm symbols alignment to 16 to make it consistent with
compilers.
- Enable concurrent access to the CPU Measurement Counter Facility.
- Add support for dynamic AP bus size limit and rework ap_dqap to deal
with messages greater than recv buffer.
* tag 's390-5.14-2' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (41 commits)
s390: preempt: Fix preempt_count initialization
s390/linkage: increase asm symbols alignment to 16
s390: rename CALL_ON_STACK_NORETURN() to call_on_stack_noreturn()
s390: add type checking to CALL_ON_STACK_NORETURN() macro
s390: remove old CALL_ON_STACK() macro
s390/softirq: use call_on_stack() macro
s390/lib: use call_on_stack() macro
s390/smp: use call_on_stack() macro
s390/kexec: use call_on_stack() macro
s390/irq: use call_on_stack() macro
s390/mm: use call_on_stack() macro
s390: introduce proper type handling call_on_stack() macro
s390/irq: simplify on_async_stack()
s390/irq: inline do_softirq_own_stack()
s390/irq: simplify do_softirq_own_stack()
s390/ap: get rid of register asm in ap_dqap()
s390: rename PIF_SYSCALL_RESTART to PIF_EXECVE_PGSTE_RESTART
s390: move restart of execve() syscall
s390/signal: remove sigreturn on stack
s390/signal: switch to using vdso for sigreturn and syscall restart
...
Diffstat (limited to 'arch/s390/kernel')
31 files changed, 1598 insertions, 1449 deletions
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 68ca1834316f..4a44ba5a2d73 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -71,10 +71,10 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o obj-$(CONFIG_TRACEPOINTS) += trace.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ +obj-$(CONFIG_COMPAT) += vdso32/ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index f53605a3dfcd..77ff2130cb04 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -14,8 +14,6 @@ #include <linux/pgtable.h> #include <asm/idle.h> #include <asm/gmap.h> -#include <asm/nmi.h> -#include <asm/setup.h> #include <asm/stacktrace.h> int main(void) @@ -108,7 +106,6 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); - OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator); OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); @@ -145,9 +142,6 @@ int main(void) OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); - /* extended machine check save area */ - OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area); - BLANK(); /* gmap/sie offsets */ OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 1d0e17ec93eb..cca142fbb516 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -28,6 +28,7 @@ #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> #include "compat_linux.h" #include "compat_ptrace.h" #include "entry.h" @@ -118,7 +119,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } @@ -304,11 +304,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - /* Signal frames without vectors registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, sigreturn); } /* Set up registers for signal handler */ @@ -371,10 +367,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, rt_sigreturn); } /* Create siginfo on the signal stack */ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c2cf79d353cf..fb84e3fc1686 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -33,6 +33,8 @@ #include <asm/switch_to.h> #include "entry.h" +int __bootdata(is_full_image); + static void __init reset_tod_clock(void) { union tod_clock clk; @@ -279,7 +281,7 @@ static void __init setup_boot_command_line(void) static void __init check_image_bootable(void) { - if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING))) + if (is_full_image) return; sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 3e8c6669373a..5a2f70cbd3a9 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -14,7 +14,6 @@ #include <asm/alternative-asm.h> #include <asm/processor.h> #include <asm/cache.h> -#include <asm/ctl_reg.h> #include <asm/dwarf.h> #include <asm/errno.h> #include <asm/ptrace.h> @@ -129,6 +128,24 @@ _LPP_OFFSET = __LC_LPP "jnz .+8; .long 0xb2e8d000", 82 .endm + /* + * The CHKSTG macro jumps to the provided label in case the + * machine check interruption code reports one of unrecoverable + * storage errors: + * - Storage error uncorrected + * - Storage key error uncorrected + * - Storage degradation with Failing-storage-address validity + */ + .macro CHKSTG errlabel + TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR) + jnz \errlabel + TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD + jz oklabel\@ + TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR + jnz \errlabel +oklabel\@: + .endm + #if IS_ENABLED(CONFIG_KVM) /* * The OUTSIDE macro jumps to the provided label in case the value @@ -148,6 +165,13 @@ _LPP_OFFSET = __LC_LPP clgr %r14,%r13 jhe \outside_label .endm + + .macro SIEEXIT + lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + .endm #endif GEN_BR_THUNK %r14 @@ -235,7 +259,6 @@ ENTRY(sie64a) # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. # Other instructions between sie64a and .Lsie_done should not cause program # interrupts. So lets use 3 nops as a landing pad for all possible rewinds. -# See also .Lcleanup_sie .Lrewind_pad6: nopr 7 .Lrewind_pad4: @@ -341,10 +364,7 @@ ENTRY(pgm_check_handler) #if IS_ENABLED(CONFIG_KVM) # cleanup critical section for program checks in sie64a OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f - lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - larl %r9,sie_exit # skip forward to sie_exit + SIEEXIT lghi %r10,_PIF_GUEST_FAULT #endif 1: tmhh %r8,0x4000 # PER bit set in old PSW ? @@ -410,7 +430,8 @@ ENTRY(\name) jnz 1f #if IS_ENABLED(CONFIG_KVM) OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f - brasl %r14,.Lcleanup_sie + BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + SIEEXIT #endif 0: CHECK_STACK __LC_SAVE_AREA_ASYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) @@ -484,8 +505,6 @@ ENTRY(mcck_int_handler) BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer - sckc __LC_CLOCK_COMPARATOR # validate comparator - lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs lg %r12,__LC_CURRENT lmg %r8,%r9,__LC_MCK_OLD_PSW @@ -496,41 +515,7 @@ ENTRY(mcck_int_handler) la %r14,4095 lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs ptlb - lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area - nill %r11,0xfc00 # MCESA_ORIGIN_MASK - TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE - jno 0f - TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID - jno 0f - .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC -0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID - jo 0f - sr %r14,%r14 -0: sfpc %r14 - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jo 0f - lghi %r14,__LC_FPREGS_SAVE_AREA - ld %f0,0(%r14) - ld %f1,8(%r14) - ld %f2,16(%r14) - ld %f3,24(%r14) - ld %f4,32(%r14) - ld %f5,40(%r14) - ld %f6,48(%r14) - ld %f7,56(%r14) - ld %f8,64(%r14) - ld %f9,72(%r14) - ld %f10,80(%r14) - ld %f11,88(%r14) - ld %f12,96(%r14) - ld %f13,104(%r14) - ld %f14,112(%r14) - ld %f15,120(%r14) - j 1f -0: VLM %v0,%v15,0,%r11 - VLM %v16,%v31,256,%r11 -1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA + lghi %r14,__LC_CPU_TIMER_SAVE_AREA mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID jo 3f @@ -546,24 +531,29 @@ ENTRY(mcck_int_handler) 3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? - jnz 4f + jnz 6f TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic -4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off - tmhh %r8,0x0001 # interrupting from user ? - jnz .Lmcck_user #if IS_ENABLED(CONFIG_KVM) - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack - OUTSIDE %r9,.Lsie_entry,.Lsie_skip,5f + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f + OUTSIDE %r9,.Lsie_entry,.Lsie_skip,4f oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST -5: brasl %r14,.Lcleanup_sie -#endif + j 5f +4: CHKSTG .Lmcck_panic +5: larl %r14,.Lstosm_tmp + stosm 0(%r14),0x04 # turn dat on, keep irqs off + BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + SIEEXIT j .Lmcck_stack -.Lmcck_user: +#endif +6: CHKSTG .Lmcck_panic + larl %r14,.Lstosm_tmp + stosm 0(%r14),0x04 # turn dat on, keep irqs off + tmhh %r8,0x0001 # interrupting from user ? + jz .Lmcck_stack BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP .Lmcck_stack: lg %r15,__LC_MCCK_STACK -.Lmcck_skip: la %r11,STACK_FRAME_OVERHEAD(%r15) stctg %c1,%c1,__PT_CR1(%r11) lctlg %c1,%c1,__LC_KERNEL_ASCE @@ -605,8 +595,33 @@ ENTRY(mcck_int_handler) b __LC_RETURN_MCCK_LPSWE .Lmcck_panic: - lg %r15,__LC_NODAT_STACK - j .Lmcck_skip + /* + * Iterate over all possible CPU addresses in the range 0..0xffff + * and stop each CPU using signal processor. Use compare and swap + * to allow just one CPU-stopper and prevent concurrent CPUs from + * stopping each other while leaving the others running. + */ + lhi %r5,0 + lhi %r6,1 + larl %r7,.Lstop_lock + cs %r5,%r6,0(%r7) # single CPU-stopper only + jnz 4f + larl %r7,.Lthis_cpu + stap 0(%r7) # this CPU address + lh %r4,0(%r7) + nilh %r4,0 + lhi %r0,1 + sll %r0,16 # CPU counter + lhi %r3,0 # next CPU address +0: cr %r3,%r4 + je 2f +1: sigp %r1,%r3,SIGP_STOP # stop next CPU + brc SIGP_CC_BUSY,1b +2: ahi %r3,1 + brct %r0,0b +3: sigp %r1,%r4,SIGP_STOP # stop this CPU + brc SIGP_CC_BUSY,3b +4: j 4b ENDPROC(mcck_int_handler) # @@ -657,15 +672,11 @@ ENTRY(stack_overflow) ENDPROC(stack_overflow) #endif -#if IS_ENABLED(CONFIG_KVM) -.Lcleanup_sie: - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE - larl %r9,sie_exit # skip forward to sie_exit - BR_EX %r14,%r13 -#endif + .section .data, "aw" + .align 4 +.Lstop_lock: .long 0 +.Lthis_cpu: .short 0 +.Lstosm_tmp: .byte 0 .section .rodata, "a" #define SYSCALL(esame,emu) .quad __s390x_ ## esame .globl sys_call_table diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index c0df4060d28d..234d085257eb 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -110,15 +110,17 @@ static int on_async_stack(void) { unsigned long frame = current_frame_address(); - return !!!((S390_lowcore.async_stack - frame) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)); + return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; } static void do_irq_async(struct pt_regs *regs, int irq) { - if (on_async_stack()) + if (on_async_stack()) { do_IRQ(regs, irq); - else - CALL_ON_STACK(do_IRQ, S390_lowcore.async_stack, 2, regs, irq); + } else { + call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + struct pt_regs *, regs, int, irq); + } } static int irq_pending(struct pt_regs *regs) @@ -266,24 +268,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) } /* - * Switch to the asynchronous interrupt stack for softirq execution. - */ -void do_softirq_own_stack(void) -{ - unsigned long old, new; - - old = current_stack_pointer(); - /* Check against async. stack address range. */ - new = S390_lowcore.async_stack; - if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { - CALL_ON_STACK(__do_softirq, new, 0); - } else { - /* We are already on the async stack. */ - __do_softirq(); - } -} - -/* * ext_int_hash[index] is the list head for all external interrupts that hash * to this index. */ diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 528bb31815c3..52d056a5f89f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -92,11 +92,6 @@ static void copy_instruction(struct kprobe *p) } NOKPROBE_SYMBOL(copy_instruction); -static inline int is_kernel_addr(void *addr) -{ - return addr < (void *)_end; -} - static int s390_get_insn_slot(struct kprobe *p) { /* @@ -105,7 +100,7 @@ static int s390_get_insn_slot(struct kprobe *p) * field can be patched and executed within the insn slot. */ p->ainsn.insn = NULL; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); @@ -117,7 +112,7 @@ static void s390_free_insn_slot(struct kprobe *p) { if (!p->ainsn.insn) return; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index d91989c7bd6a..1005a6935fbe 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -132,7 +132,8 @@ static bool kdump_csum_valid(struct kimage *image) int rc; preempt_disable(); - rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image); + rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump, + unsigned long, (unsigned long)image); preempt_enable(); return rc == 0; #else diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 11f8c296f60d..20f8e1868853 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -189,12 +189,16 @@ void noinstr s390_handle_mcck(void) * returns 0 if all required registers are available * returns 1 otherwise */ -static int notrace s390_check_registers(union mci mci, int umode) +static int notrace s390_validate_registers(union mci mci, int umode) { + struct mcesa *mcesa; + void *fpt_save_area; union ctlreg2 cr2; int kill_task; + u64 zero; kill_task = 0; + zero = 0; if (!mci.gr) { /* @@ -205,14 +209,6 @@ static int notrace s390_check_registers(union mci mci, int umode) s390_handle_damage(); kill_task = 1; } - /* Check control registers */ - if (!mci.cr) { - /* - * Control registers have unknown contents. - * Can't recover and therefore stopping machine. - */ - s390_handle_damage(); - } if (!mci.fp) { /* * Floating point registers can't be restored. If the @@ -225,35 +221,89 @@ static int notrace s390_check_registers(union mci mci, int umode) if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } + fpt_save_area = &S390_lowcore.floating_pt_save_area; if (!mci.fc) { /* * Floating point control register can't be restored. * If the kernel currently uses the floating pointer * registers and needs the FPC register the system is * stopped. If the process has its floating pointer - * registers loaded it is terminated. + * registers loaded it is terminated. Otherwise the + * FPC is just validated. */ if (S390_lowcore.fpu_flags & KERNEL_FPC) s390_handle_damage(); + asm volatile( + " lfpc %0\n" + : + : "Q" (zero)); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; + } else { + asm volatile( + " lfpc %0\n" + : + : "Q" (S390_lowcore.fpt_creg_save_area)); } - if (MACHINE_HAS_VX) { + mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (!MACHINE_HAS_VX) { + /* Validate floating point registers */ + asm volatile( + " ld 0,0(%0)\n" + " ld 1,8(%0)\n" + " ld 2,16(%0)\n" + " ld 3,24(%0)\n" + " ld 4,32(%0)\n" + " ld 5,40(%0)\n" + " ld 6,48(%0)\n" + " ld 7,56(%0)\n" + " ld 8,64(%0)\n" + " ld 9,72(%0)\n" + " ld 10,80(%0)\n" + " ld 11,88(%0)\n" + " ld 12,96(%0)\n" + " ld 13,104(%0)\n" + " ld 14,112(%0)\n" + " ld 15,120(%0)\n" + : + : "a" (fpt_save_area) + : "memory"); + } else { + /* Validate vector registers */ + union ctlreg0 cr0; + if (!mci.vr) { /* * Vector registers can't be restored. If the kernel * currently uses vector registers the system is * stopped. If the process has its vector registers - * loaded it is terminated. + * loaded it is terminated. Otherwise just validate + * the registers. */ if (S390_lowcore.fpu_flags & KERNEL_VXR) s390_handle_damage(); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } + cr0.val = S390_lowcore.cregs_save_area[0]; + cr0.afp = cr0.vx = 1; + __ctl_load(cr0.val, 0, 0); + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ + : + : "Q" (*(struct vx_array *)mcesa->vector_save_area) + : "1"); + __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); } - /* Check if access registers are valid */ + /* Validate access registers */ + asm volatile( + " lam 0,15,0(%0)\n" + : + : "a" (&S390_lowcore.access_regs_save_area) + : "memory"); if (!mci.ar) { /* * Access registers have unknown contents. @@ -261,7 +311,7 @@ static int notrace s390_check_registers(union mci mci, int umode) */ kill_task = 1; } - /* Check guarded storage registers */ + /* Validate guarded storage registers */ cr2.val = S390_lowcore.cregs_save_area[2]; if (cr2.gse) { if (!mci.gs) { @@ -271,31 +321,26 @@ static int notrace s390_check_registers(union mci mci, int umode) * It has to be terminated. */ kill_task = 1; + } else { + load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); } } - /* Check if old PSW is valid */ - if (!mci.wp) { - /* - * Can't tell if we come from user or kernel mode - * -> stopping machine. - */ - s390_handle_damage(); - } - /* Check for invalid kernel instruction address */ - if (!mci.ia && !umode) { - /* - * The instruction address got lost while running - * in the kernel -> stopping machine. - */ - s390_handle_damage(); - } + /* + * The getcpu vdso syscall reads CPU number from the programmable + * field of the TOD clock. Disregard the TOD programmable register + * validity bit and load the CPU number into the TOD programmable + * field unconditionally. + */ + set_tod_programmable_field(raw_smp_processor_id()); + /* Validate clock comparator register */ + set_clock_comparator(S390_lowcore.clock_comparator); if (!mci.ms || !mci.pm || !mci.ia) kill_task = 1; return kill_task; } -NOKPROBE_SYMBOL(s390_check_registers); +NOKPROBE_SYMBOL(s390_validate_registers); /* * Backup the guest's machine check info to its description block @@ -353,11 +398,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs) mci.val = S390_lowcore.mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); - if (mci.sd) { - /* System damage -> stopping machine */ - s390_handle_damage(); - } - /* * Reinject the instruction processing damages' machine checks * including Delayed Access Exception into the guest @@ -398,7 +438,7 @@ int notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_check_registers(mci, user_mode(regs))) { + if (s390_validate_registers(mci, user_mode(regs))) { /* * Couldn't restore all register contents for the * user space process -> mark task for termination. @@ -428,21 +468,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs) mcck_pending = 1; } - /* - * Reinject storage related machine checks into the guest if they - * happen when the guest is running. - */ - if (!test_cpu_flag(CIF_MCCK_GUEST)) { - if (mci.se) - /* Storage error uncorrected */ - s390_handle_damage(); - if (mci.ke) - /* Storage key-error uncorrected */ - s390_handle_damage(); - if (mci.ds && mci.fa) - /* Storage degradation */ - s390_handle_damage(); - } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 1b7a0525fbed..975a00c8c564 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,9 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2019 + * Copyright IBM Corp. 2012, 2021 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> + * Thomas Richter <tmricht@linux.ibm.com> */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -14,7 +15,223 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/miscdevice.h> + #include <asm/cpu_mcf.h> +#include <asm/hwctrset.h> +#include <asm/debug.h> + +static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ +static debug_info_t *cf_dbg; + +#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ + /* interval in seconds */ + +/* Counter sets are stored as data stream in a page sized memory buffer and + * exported to user space via raw data attached to the event sample data. + * Each counter set starts with an eight byte header consisting of: + * - a two byte eye catcher (0xfeef) + * - a one byte counter set number + * - a two byte counter set size (indicates the number of counters in this set) + * - a three byte reserved value (must be zero) to make the header the same + * size as a counter value. + * All counter values are eight byte in size. + * + * All counter sets are followed by a 64 byte trailer. + * The trailer consists of a: + * - flag field indicating valid fields when corresponding bit set + * - the counter facility first and second version number + * - the CPU speed if nonzero + * - the time stamp the counter sets have been collected + * - the time of day (TOD) base value + * - the machine type. + * + * The counter sets are saved when the process is prepared to be executed on a + * CPU and saved again when the process is going to be removed from a CPU. + * The difference of both counter sets are calculated and stored in the event + * sample data area. + */ +struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int set:16; /* 16-31 Counter set identifier */ + unsigned int ctr:16; /* 32-47 Number of stored counters */ + unsigned int res1:16; /* 48-63 Reserved */ +}; + +struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ + /* 0 - 7 */ + union { + struct { + unsigned int clock_base:1; /* TOD clock base set */ + unsigned int speed:1; /* CPU speed set */ + /* Measurement alerts */ + unsigned int mtda:1; /* Loss of MT ctr. data alert */ + unsigned int caca:1; /* Counter auth. change alert */ + unsigned int lcda:1; /* Loss of counter data alert */ + }; + unsigned long flags; /* 0-63 All indicators */ + }; + /* 8 - 15 */ + unsigned int cfvn:16; /* 64-79 Ctr First Version */ + unsigned int csvn:16; /* 80-95 Ctr Second Version */ + unsigned int cpu_speed:32; /* 96-127 CPU speed */ + /* 16 - 23 */ + unsigned long timestamp; /* 128-191 Timestamp (TOD) */ + /* 24 - 55 */ + union { + struct { + unsigned long progusage1; + unsigned long progusage2; + unsigned long progusage3; + unsigned long tod_base; + }; + unsigned long progusage[4]; + }; + /* 56 - 63 */ + unsigned int mach_type:16; /* Machine type */ + unsigned int res1:16; /* Reserved */ + unsigned int res2:32; /* Reserved */ +}; + +/* Create the trailer data at the end of a page. */ +static void cfdiag_trailer(struct cf_trailer_entry *te) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpuid cpuid; + + te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ + te->csvn = cpuhw->info.csvn; + + get_cpu_id(&cpuid); /* Machine type */ + te->mach_type = cpuid.machine; + te->cpu_speed = cfdiag_cpu_speed; + if (te->cpu_speed) + te->speed = 1; + te->clock_base = 1; /* Save clock base */ + te->tod_base = tod_clock_base.tod; + te->timestamp = get_tod_clock_fast(); +} + +/* Read a counter set. The counter set number determines the counter set and + * the CPUM-CF first and second version number determine the number of + * available counters in each counter set. + * Each counter set starts with header containing the counter set number and + * the number of eight byte counters. + * + * The functions returns the number of bytes occupied by this counter set + * including the header. + * If there is no counter in the counter set, this counter set is useless and + * zero is returned on this case. + * + * Note that the counter sets may not be enabled or active and the stcctm + * instruction might return error 3. Depending on error_ok value this is ok, + * for example when called from cpumf_pmu_start() call back function. + */ +static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, + size_t room, bool error_ok) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + size_t ctrset_size, need = 0; + int rc = 3; /* Assume write failure */ + + ctrdata->def = CF_DIAG_CTRSET_DEF; + ctrdata->set = ctrset; + ctrdata->res1 = 0; + ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); + + if (ctrset_size) { /* Save data */ + need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); + if (need <= room) { + rc = ctr_stcctm(ctrset, ctrset_size, + (u64 *)(ctrdata + 1)); + } + if (rc != 3 || error_ok) + ctrdata->ctr = ctrset_size; + else + need = 0; + } + + debug_sprintf_event(cf_dbg, 3, + "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" + " need %zd rc %d\n", __func__, ctrset, ctrset_size, + cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); + return need; +} + +/* Read out all counter sets and save them in the provided data buffer. + * The last 64 byte host an artificial trailer entry. + */ +static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, + bool error_ok) +{ + struct cf_trailer_entry *trailer; + size_t offset = 0, done; + int i; + + memset(data, 0, sz); + sz -= sizeof(*trailer); /* Always room for trailer */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + struct cf_ctrset_entry *ctrdata = data + offset; + + if (!(auth & cpumf_ctr_ctl[i])) + continue; /* Counter set not authorized */ + + done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); + offset += done; + } + trailer = data + offset; + cfdiag_trailer(trailer); + return offset + sizeof(*trailer); +} + +/* Calculate the difference for each counter in a counter set. */ +static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) +{ + for (; --counters >= 0; ++pstart, ++pstop) + if (*pstop >= *pstart) + *pstop -= *pstart; + else + *pstop = *pstart - *pstop + 1; +} + +/* Scan the counter sets and calculate the difference of each counter + * in each set. The result is the increment of each counter during the + * period the counter set has been activated. + * + * Return true on success. + */ +static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) +{ + struct cf_trailer_entry *trailer_start, *trailer_stop; + struct cf_ctrset_entry *ctrstart, *ctrstop; + size_t offset = 0; + + auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; + do { + ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); + ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { + pr_err_once("cpum_cf_diag counter set compare error " + "in set %i\n", ctrstart->set); + return 0; + } + auth &= ~cpumf_ctr_ctl[ctrstart->set]; + if (ctrstart->def == CF_DIAG_CTRSET_DEF) { + cfdiag_diffctrset((u64 *)(ctrstart + 1), + (u64 *)(ctrstop + 1), ctrstart->ctr); + offset += ctrstart->ctr * sizeof(u64) + + sizeof(*ctrstart); + } + } while (ctrstart->def && auth); + + /* Save time_stamp from start of event in stop's trailer */ + trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); + trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); + trailer_stop->progusage[0] = trailer_start->timestamp; + + return 1; +} static enum cpumf_ctr_set get_counter_set(u64 event) { @@ -34,7 +251,8 @@ static enum cpumf_ctr_set get_counter_set(u64 event) return set; } -static int validate_ctr_version(const struct hw_perf_event *hwc) +static int validate_ctr_version(const struct hw_perf_event *hwc, + enum cpumf_ctr_set set) { struct cpu_cf_events *cpuhw; int err = 0; @@ -43,7 +261,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) cpuhw = &get_cpu_var(cpu_cf_events); /* check required version for counter sets */ - switch (hwc->config_base) { + switch (set) { case CPUMF_CTR_SET_BASIC: case CPUMF_CTR_SET_USER: if (cpuhw->info.cfvn < 1) @@ -86,6 +304,8 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) (cpuhw->info.act_ctl & mtdiag_ctl))) err = -EOPNOTSUPP; break; + case CPUMF_CTR_SET_MAX: + err = -EOPNOTSUPP; } put_cpu_var(cpu_cf_events); @@ -95,7 +315,6 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) static int validate_ctr_auth(const struct hw_perf_event *hwc) { struct cpu_cf_events *cpuhw; - u64 ctrs_state; int err = 0; cpuhw = &get_cpu_var(cpu_cf_events); @@ -105,8 +324,7 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc) * return with -ENOENT in order to fall back to other * PMUs that might suffice the event request. */ - ctrs_state = cpumf_ctr_ctl[hwc->config_base]; - if (!(ctrs_state & cpuhw->info.auth_ctl)) + if (!(hwc->config_base & cpuhw->info.auth_ctl)) err = -ENOENT; put_cpu_var(cpu_cf_events); @@ -126,7 +344,7 @@ static void cpumf_pmu_enable(struct pmu *pmu) if (cpuhw->flags & PMU_F_ENABLED) return; - err = lcctl(cpuhw->state); + err = lcctl(cpuhw->state | cpuhw->dev_state); if (err) { pr_err("Enabling the performance measuring unit " "failed with rc=%x\n", err); @@ -151,6 +369,7 @@ static void cpumf_pmu_disable(struct pmu *pmu) return; inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + inactive |= cpuhw->dev_state; err = lcctl(inactive); if (err) { pr_err("Disabling the performance measuring unit " @@ -199,6 +418,14 @@ static const int cpumf_generic_events_user[] = { [PERF_COUNT_HW_BUS_CYCLES] = -1, }; +static void cpumf_hw_inuse(void) +{ + mutex_lock(&pmc_reserve_mutex); + if (atomic_inc_return(&num_events) == 1) + __kernel_cpumcf_begin(); + mutex_unlock(&pmc_reserve_mutex); +} + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) { struct perf_event_attr *attr = &event->attr; @@ -258,11 +485,11 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) /* * Use the hardware perf event structure to store the * counter number in the 'config' member and the counter - * set number in the 'config_base'. The counter set number - * is then later used to enable/disable the counter(s). + * set number in the 'config_base' as bit mask. + * It is later used to enable/disable the counter(s). */ hwc->config = ev; - hwc->config_base = set; + hwc->config_base = cpumf_ctr_ctl[set]; break; case CPUMF_CTR_SET_MAX: /* The counter could not be associated to a counter set */ @@ -270,22 +497,13 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) } /* Initialize for using the CPU-measurement counter facility */ - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; + cpumf_hw_inuse(); event->destroy = hw_perf_event_destroy; /* Finally, validate version and authorization of the counter set */ err = validate_ctr_auth(hwc); if (!err) - err = validate_ctr_version(hwc); + err = validate_ctr_version(hwc, set); return err; } @@ -361,6 +579,7 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) return; @@ -376,29 +595,92 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) * needs to be synchronized. At this point, the counter set can be in * the inactive or disabled state. */ - hw_perf_event_reset(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + cpuhw->usedss = cfdiag_getctr(cpuhw->start, + sizeof(cpuhw->start), + hwc->config_base, true); + } else { + hw_perf_event_reset(event); + } - /* increment refcount for this counter set */ - atomic_inc(&cpuhw->ctr_set[hwc->config_base]); + /* Increment refcount for counter sets */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if ((hwc->config_base & cpumf_ctr_ctl[i])) + atomic_inc(&cpuhw->ctr_set[i]); +} + +/* Create perf event sample with the counter sets as raw data. The sample + * is then pushed to the event subsystem and the function checks for + * possible event overflows. If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. + */ +static int cfdiag_push_sample(struct perf_event *event, + struct cpu_cf_events *cpuhw) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = event->cpu; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = cpuhw->usedss; + raw.frag.data = cpuhw->stop; + raw.size = raw.frag.size; + data.raw = &raw; + } + + overflow = perf_event_overflow(event, &data, ®s); + debug_sprintf_event(cf_dbg, 3, + "%s event %#llx sample_type %#llx raw %d ov %d\n", + __func__, event->hw.config, + event->attr.sample_type, raw.size, overflow); + if (overflow) + event->pmu->stop(event, 0); + + perf_event_update_userpage(event); + return overflow; } static void cpumf_pmu_stop(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) { /* Decrement reference count for this counter set and if this * is the last used counter in the set, clear activation * control and set the counter set state to inactive. */ - if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) - ctr_set_stop(&cpuhw->state, hwc->config_base); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(hwc->config_base & cpumf_ctr_ctl[i])) + continue; + if (!atomic_dec_return(&cpuhw->ctr_set[i])) + ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); + } hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + local64_inc(&event->count); + cpuhw->usedss = cfdiag_getctr(cpuhw->stop, + sizeof(cpuhw->stop), + event->hw.config_base, + false); + if (cfdiag_diffctr(cpuhw, event->hw.config_base)) + cfdiag_push_sample(event, cpuhw); + } else + hw_perf_event_update(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -419,6 +701,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) static void cpumf_pmu_del(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int i; cpumf_pmu_stop(event, PERF_EF_UPDATE); @@ -430,8 +713,9 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * clear enable control and resets all counters in a set. Therefore, * cpumf_pmu_start() always has to reenable a counter set. */ - if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) - ctr_set_disable(&cpuhw->state, event->hw.config_base); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if (!atomic_read(&cpuhw->ctr_set[i])) + ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); } /* Performance monitoring unit for s390x */ @@ -448,6 +732,7 @@ static struct pmu cpumf_pmu = { .read = cpumf_pmu_read, }; +static int cfset_init(void); static int __init cpumf_pmu_init(void) { int rc; @@ -455,10 +740,689 @@ static int __init cpumf_pmu_init(void) if (!kernel_cpumcf_avail()) return -ENODEV; + /* Setup s390dbf facility */ + cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + if (!cf_dbg) { + pr_err("Registration of s390dbf(cpum_cf) failed\n"); + return -ENOMEM; + }; + debug_register_view(cf_dbg, &debug_sprintf_view); + cpumf_pmu.attr_groups = cpumf_cf_event_group(); rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); - if (rc) + if (rc) { + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + } else if (stccm_avail()) { /* Setup counter set device */ + cfset_init(); + } + return rc; +} + +/* Support for the CPU Measurement Facility counter set extraction using + * device /dev/hwctr. This allows user space programs to extract complete + * counter set via normal file operations. + */ + +static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Excl. access */ +static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ +struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ + unsigned int sets; /* Counter set bit mask */ + atomic_t cpus_ack; /* # CPUs successfully executed func */ +}; + +static struct cfset_request { /* CPUs and counter set bit mask */ + unsigned long ctrset; /* Bit mask of counter set to read */ + cpumask_t mask; /* CPU mask to read from */ +} cfset_request; + +static void cfset_ctrset_clear(void) +{ + cpumask_clear(&cfset_request.mask); + cfset_request.ctrset = 0; +} + +/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access + * path is currently used. + * The cpu_cf_events::dev_state is used to denote counter sets in use by this + * interface. It is always or'ed in. If this interface is not active, its + * value is zero and no additional counter sets will be included. + * + * The cpu_cf_events::state is used by the perf_event_open SVC and remains + * unchanged. + * + * perf_pmu_enable() and perf_pmu_enable() and its call backs + * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the + * performance measurement subsystem to enable per process + * CPU Measurement counter facility. + * The XXX_enable() and XXX_disable functions are used to turn off + * x86 performance monitoring interrupt (PMI) during scheduling. + * s390 uses these calls to temporarily stop and resume the active CPU + * counters sets during scheduling. + * + * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr + * device access. The perf_event_open() SVC interface makes a lot of effort + * to only run the counters while the calling process is actively scheduled + * to run. + * When /dev/hwctr interface is also used at the same time, the counter sets + * will keep running, even when the process is scheduled off a CPU. + * However this is not a problem and does not lead to wrong counter values + * for the perf_event_open() SVC. The current counter value will be recorded + * during schedule-in. At schedule-out time the current counter value is + * extracted again and the delta is calculated and added to the event. + */ +/* Stop all counter sets via ioctl interface */ +static void cfset_ioctl_off(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->dev_state = 0; + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_dec(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + cpuhw->flags &= ~PMU_F_IN_USE; + debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", + __func__, rc, cpuhw->state, cpuhw->dev_state); +} + +/* Start counter sets on particular CPU */ +static void cfset_ioctl_on(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->flags |= PMU_F_IN_USE; + ctr_set_enable(&cpuhw->dev_state, p->sets); + ctr_set_start(&cpuhw->dev_state, p->sets); + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_inc(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ + if (!rc) + atomic_inc(&p->cpus_ack); + else + pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", + cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); + debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", + __func__, rc, cpuhw->state, cpuhw->dev_state); +} + +static void cfset_release_cpu(void *p) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int rc; + + debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", + __func__, cpuhw->state, cpuhw->dev_state); + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + cpuhw->dev_state = 0; +} + +/* Release function is also called when application gets terminated without + * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. + */ +static int cfset_release(struct inode *inode, struct file *file) +{ + on_each_cpu(cfset_release_cpu, NULL, 1); + hw_perf_event_destroy(NULL); + cfset_ctrset_clear(); + atomic_set(&cfset_opencnt, 0); + return 0; +} + +static int cfset_open(struct inode *inode, struct file *file) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Only one user space program can open /dev/hwctr */ + if (atomic_xchg(&cfset_opencnt, 1)) + return -EBUSY; + + cpumf_hw_inuse(); + file->private_data = NULL; + /* nonseekable_open() never fails */ + return nonseekable_open(inode, file); +} + +static int cfset_all_stop(void) +{ + struct cfset_call_on_cpu_parm p = { + .sets = cfset_request.ctrset, + }; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + free_cpumask_var(mask); + return 0; +} + +static int cfset_all_start(void) +{ + struct cfset_call_on_cpu_parm p = { + .sets = cfset_request.ctrset, + .cpus_ack = ATOMIC_INIT(0), + }; + cpumask_var_t mask; + int rc = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); + if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + rc = -EIO; + debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__); + } + free_cpumask_var(mask); + return rc; +} + + +/* Return the maximum required space for all possible CPUs in case one + * CPU will be onlined during the START, READ, STOP cycles. + * To find out the size of the counter sets, any one CPU will do. They + * all have the same counter sets. + */ +static size_t cfset_needspace(unsigned int sets) +{ + struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); + size_t bytes = 0; + int i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(sets & cpumf_ctr_ctl[i])) + continue; + bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + sizeof(((struct s390_ctrset_setdata *)0)->set) + + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); + } + bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * + (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); + put_cpu_ptr(&cpu_cf_events); + return bytes; +} + +static int cfset_all_copy(unsigned long arg, cpumask_t *mask) +{ + struct s390_ctrset_read __user *ctrset_read; + unsigned int cpu, cpus, rc; + void __user *uptr; + + ctrset_read = (struct s390_ctrset_read __user *)arg; + uptr = ctrset_read->data; + for_each_cpu(cpu, mask) { + struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu); + struct s390_ctrset_cpudata __user *ctrset_cpudata; + + ctrset_cpudata = uptr; + rc = put_user(cpu, &ctrset_cpudata->cpu_nr); + rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); + rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, + cpuhw->used); + if (rc) + return -EFAULT; + uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; + cond_resched(); + } + cpus = cpumask_weight(mask); + if (put_user(cpus, &ctrset_read->no_cpus)) + return -EFAULT; + debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__, + uptr - (void __user *)ctrset_read->data); + return 0; +} + +static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, + int ctrset_size, size_t room) +{ + size_t need = 0; + int rc = -1; + + need = sizeof(*p) + sizeof(u64) * ctrset_size; + if (need <= room) { + p->set = cpumf_ctr_ctl[ctrset]; + p->no_cnts = ctrset_size; + rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); + if (rc == 3) /* Nothing stored */ + need = 0; + } + return need; +} + +/* Read all counter sets. */ +static void cfset_cpu_read(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int set, set_size; + size_t space; + + /* No data saved yet */ + cpuhw->used = 0; + cpuhw->sets = 0; + memset(cpuhw->data, 0, sizeof(cpuhw->data)); + + /* Scan the counter sets */ + for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { + struct s390_ctrset_setdata *sp = (void *)cpuhw->data + + cpuhw->used; + + if (!(p->sets & cpumf_ctr_ctl[set])) + continue; /* Counter set not in list */ + set_size = cpum_cf_ctrset_size(set, &cpuhw->info); + space = sizeof(cpuhw->data) - cpuhw->used; + space = cfset_cpuset_read(sp, set, set_size, space); + if (space) { + cpuhw->used += space; + cpuhw->sets += 1; + } + } + debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__, + cpuhw->sets, cpuhw->used); +} + +static int cfset_all_read(unsigned long arg) +{ + struct cfset_call_on_cpu_parm p; + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + p.sets = cfset_request.ctrset; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); + rc = cfset_all_copy(arg, mask); + free_cpumask_var(mask); return rc; } -subsys_initcall(cpumf_pmu_init); + +static long cfset_ioctl_read(unsigned long arg) +{ + struct s390_ctrset_read read; + int ret = 0; + + if (copy_from_user(&read, (char __user *)arg, sizeof(read))) + return -EFAULT; + ret = cfset_all_read(arg); + return ret; +} + +static long cfset_ioctl_stop(void) +{ + int ret = ENXIO; + + if (cfset_request.ctrset) { + ret = cfset_all_stop(); + cfset_ctrset_clear(); + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg) +{ + struct s390_ctrset_start __user *ustart; + struct s390_ctrset_start start; + void __user *umask; + unsigned int len; + int ret = 0; + size_t need; + + if (cfset_request.ctrset) + return -EBUSY; + ustart = (struct s390_ctrset_start __user *)arg; + if (copy_from_user(&start, ustart, sizeof(start))) + return -EFAULT; + if (start.version != S390_HWCTR_START_VERSION) + return -EINVAL; + if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | + cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | + cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | + cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | + cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) + return -EINVAL; /* Invalid counter set */ + if (!start.counter_sets) + return -EINVAL; /* No counter set at all? */ + cpumask_clear(&cfset_request.mask); + len = min_t(u64, start.cpumask_len, cpumask_size()); + umask = (void __user *)start.cpumask; + if (copy_from_user(&cfset_request.mask, umask, len)) + return -EFAULT; + if (cpumask_empty(&cfset_request.mask)) + return -EINVAL; + need = cfset_needspace(start.counter_sets); + if (put_user(need, &ustart->data_bytes)) + ret = -EFAULT; + if (ret) + goto out; + cfset_request.ctrset = start.counter_sets; + ret = cfset_all_start(); +out: + if (ret) + cfset_ctrset_clear(); + debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n", + __func__, cfset_request.ctrset, need, ret); + return ret; +} + +/* Entry point to the /dev/hwctr device interface. + * The ioctl system call supports three subcommands: + * S390_HWCTR_START: Start the specified counter sets on a CPU list. The + * counter set keeps running until explicitly stopped. Returns the number + * of bytes needed to store the counter values. If another S390_HWCTR_START + * ioctl subcommand is called without a previous S390_HWCTR_STOP stop + * command, -EBUSY is returned. + * S390_HWCTR_READ: Read the counter set values from specified CPU list given + * with the S390_HWCTR_START command. + * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the + * previous S390_HWCTR_START subcommand. + */ +static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret; + + get_online_cpus(); + mutex_lock(&cfset_ctrset_mutex); + switch (cmd) { + case S390_HWCTR_START: + ret = cfset_ioctl_start(arg); + break; + case S390_HWCTR_STOP: + ret = cfset_ioctl_stop(); + break; + case S390_HWCTR_READ: + ret = cfset_ioctl_read(arg); + break; + default: + ret = -ENOTTY; + break; + } + mutex_unlock(&cfset_ctrset_mutex); + put_online_cpus(); + return ret; +} + +static const struct file_operations cfset_fops = { + .owner = THIS_MODULE, + .open = cfset_open, + .release = cfset_release, + .unlocked_ioctl = cfset_ioctl, + .compat_ioctl = cfset_ioctl, + .llseek = no_llseek +}; + +static struct miscdevice cfset_dev = { + .name = S390_HWCTR_DEVICE, + .minor = MISC_DYNAMIC_MINOR, + .fops = &cfset_fops, +}; + +int cfset_online_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + + mutex_lock(&cfset_ctrset_mutex); + if (cfset_request.ctrset) { + p.sets = cfset_request.ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &cfset_request.mask); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +int cfset_offline_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + + mutex_lock(&cfset_ctrset_mutex); + if (cfset_request.ctrset) { + p.sets = cfset_request.ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &cfset_request.mask); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +static void cfdiag_read(struct perf_event *event) +{ + debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__, + event->attr.config, local64_read(&event->count)); +} + +static int get_authctrsets(void) +{ + struct cpu_cf_events *cpuhw; + unsigned long auth = 0; + enum cpumf_ctr_set i; + + cpuhw = &get_cpu_var(cpu_cf_events); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) + auth |= cpumf_ctr_ctl[i]; + } + put_cpu_var(cpu_cf_events); + return auth; +} + +/* Setup the event. Test for authorized counter sets and only include counter + * sets which are authorized at the time of the setup. Including unauthorized + * counter sets result in specification exception (and panic). + */ +static int cfdiag_event_init2(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = 0; + + /* Set sample_period to indicate sampling */ + event->hw.config = attr->config; + event->hw.sample_period = attr->sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + local64_set(&event->count, 0); + event->hw.last_period = event->hw.sample_period; + + /* Add all authorized counter sets to config_base. The + * the hardware init function is either called per-cpu or just once + * for all CPUS (event->cpu == -1). This depends on the whether + * counting is started for all CPUs or on a per workload base where + * the perf event moves from one CPU to another CPU. + * Checking the authorization on any CPU is fine as the hardware + * applies the same authorization settings to all CPUs. + */ + event->hw.config_base = get_authctrsets(); + + /* No authorized counter sets, nothing to count/sample */ + if (!event->hw.config_base) + err = -EINVAL; + + debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n", + __func__, err, event->hw.config_base); + return err; +} + +static int cfdiag_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = -ENOENT; + + if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || + event->attr.type != event->pmu->type) + goto out; + + /* Raw events are used to access counters directly, + * hence do not permit excludes. + * This event is useless without PERF_SAMPLE_RAW to return counter set + * values as raw data. + */ + if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || + !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { + err = -EOPNOTSUPP; + goto out; + } + + /* Initialize for using the CPU-measurement counter facility */ + cpumf_hw_inuse(); + event->destroy = hw_perf_event_destroy; + + err = cfdiag_event_init2(event); + if (unlikely(err)) + event->destroy(event); +out: + return err; +} + +/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used + * to collect the complete counter sets for a scheduled process. Target + * are complete counter sets attached as raw data to the artificial event. + * This results in complete counter sets available when a process is + * scheduled. Contains the delta of every counter while the process was + * running. + */ +CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); + +static struct attribute *cfdiag_events_attr[] = { + CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), + NULL, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cfdiag_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cfdiag_events_group = { + .name = "events", + .attrs = cfdiag_events_attr, +}; +static struct attribute_group cfdiag_format_group = { + .name = "format", + .attrs = cfdiag_format_attr, +}; +static const struct attribute_group *cfdiag_attr_groups[] = { + &cfdiag_events_group, + &cfdiag_format_group, + NULL, +}; + +/* Performance monitoring unit for event CF_DIAG. Since this event + * is also started and stopped via the perf_event_open() system call, use + * the same event enable/disable call back functions. They do not + * have a pointer to the perf_event strcture as first parameter. + * + * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. + * Reuse them and distinguish the event (always first parameter) via + * 'config' member. + */ +static struct pmu cf_diag = { + .task_ctx_nr = perf_sw_context, + .event_init = cfdiag_event_init, + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cfdiag_read, + + .attr_groups = cfdiag_attr_groups +}; + +/* Calculate memory needed to store all counter sets together with header and + * trailer data. This is independent of the counter set authorization which + * can vary depending on the configuration. + */ +static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) +{ + size_t max_size = sizeof(struct cf_trailer_entry); + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + size_t size = cpum_cf_ctrset_size(i, info); + + if (size) + max_size += size * sizeof(u64) + + sizeof(struct cf_ctrset_entry); + } + return max_size; +} + +/* Get the CPU speed, try sampling facility first and CPU attributes second. */ +static void cfdiag_get_cpu_speed(void) +{ + if (cpum_sf_avail()) { /* Sampling facility first */ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) { + cfdiag_cpu_speed = si.cpu_speed; + return; + } + } + + /* Fallback: CPU speed extract static part. Used in case + * CPU Measurement Sampling Facility is turned off. + */ + if (test_facility(34)) { + unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; + } +} + +static int cfset_init(void) +{ + struct cpumf_ctr_info info; + size_t need; + int rc; + + if (qctri(&info)) + return -ENODEV; + + cfdiag_get_cpu_speed(); + /* Make sure the counter set data fits into predefined buffer. */ + need = cfdiag_maxsize(&info); + if (need > sizeof(((struct cpu_cf_events *)0)->start)) { + pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", + need); + return -ENOMEM; + } + + rc = misc_register(&cfset_dev); + if (rc) { + pr_err("Registration of /dev/%s failed rc=%i\n", + cfset_dev.name, rc); + goto out; + } + + rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); + if (rc) { + misc_deregister(&cfset_dev); + pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", + rc); + } +out: + return rc; +} + +device_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 2300fbaac556..30f0242de4a5 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -29,7 +29,11 @@ DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { }, .alert = ATOMIC64_INIT(0), .state = 0, + .dev_state = 0, .flags = 0, + .used = 0, + .usedss = 0, + .sets = 0 }; /* Indicator whether the CPU-Measurement Counter Facility Support is ready */ static bool cpum_cf_initalized; @@ -96,25 +100,10 @@ bool kernel_cpumcf_avail(void) } EXPORT_SYMBOL(kernel_cpumcf_avail); - -/* Reserve/release functions for sharing perf hardware */ -static DEFINE_SPINLOCK(cpumcf_owner_lock); -static void *cpumcf_owner; - /* Initialize the CPU-measurement counter facility */ int __kernel_cpumcf_begin(void) { int flags = PMC_INIT; - int err = 0; - - spin_lock(&cpumcf_owner_lock); - if (cpumcf_owner) - err = -EBUSY; - else - cpumcf_owner = __builtin_return_address(0); - spin_unlock(&cpumcf_owner_lock); - if (err) - return err; on_each_cpu(cpum_cf_setup_cpu, &flags, 1); irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); @@ -144,10 +133,6 @@ void __kernel_cpumcf_end(void) on_each_cpu(cpum_cf_setup_cpu, &flags, 1); irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - spin_lock(&cpumcf_owner_lock); - cpumcf_owner = NULL; - spin_unlock(&cpumcf_owner_lock); } EXPORT_SYMBOL(__kernel_cpumcf_end); @@ -161,11 +146,13 @@ static int cpum_cf_setup(unsigned int cpu, int flags) static int cpum_cf_online_cpu(unsigned int cpu) { - return cpum_cf_setup(cpu, PMC_INIT); + cpum_cf_setup(cpu, PMC_INIT); + return cfset_online_cpu(cpu); } static int cpum_cf_offline_cpu(unsigned int cpu) { + cfset_offline_cpu(cpu); return cpum_cf_setup(cpu, PMC_RELEASE); } diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c deleted file mode 100644 index 08c985c1097c..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ /dev/null @@ -1,1148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support for s390x - CPU-measurement Counter Sets - * - * Copyright IBM Corp. 2019, 2021 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - * Thomas Richer <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_diag" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/processor.h> -#include <linux/miscdevice.h> -#include <linux/mutex.h> - -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> -#include <asm/timex.h> -#include <asm/debug.h> - -#include <asm/hwctrset.h> - -#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ - /* interval in seconds */ -static unsigned int cf_diag_cpu_speed; -static debug_info_t *cf_diag_dbg; - -struct cf_diag_csd { /* Counter set data per CPU */ - size_t used; /* Bytes used in data/start */ - unsigned char start[PAGE_SIZE]; /* Counter set at event start */ - unsigned char data[PAGE_SIZE]; /* Counter set at event delete */ - unsigned int sets; /* # Counter set saved in data */ -}; -static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd); - -/* Counter sets are stored as data stream in a page sized memory buffer and - * exported to user space via raw data attached to the event sample data. - * Each counter set starts with an eight byte header consisting of: - * - a two byte eye catcher (0xfeef) - * - a one byte counter set number - * - a two byte counter set size (indicates the number of counters in this set) - * - a three byte reserved value (must be zero) to make the header the same - * size as a counter value. - * All counter values are eight byte in size. - * - * All counter sets are followed by a 64 byte trailer. - * The trailer consists of a: - * - flag field indicating valid fields when corresponding bit set - * - the counter facility first and second version number - * - the CPU speed if nonzero - * - the time stamp the counter sets have been collected - * - the time of day (TOD) base value - * - the machine type. - * - * The counter sets are saved when the process is prepared to be executed on a - * CPU and saved again when the process is going to be removed from a CPU. - * The difference of both counter sets are calculated and stored in the event - * sample data area. - */ - -struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ - unsigned int def:16; /* 0-15 Data Entry Format */ - unsigned int set:16; /* 16-31 Counter set identifier */ - unsigned int ctr:16; /* 32-47 Number of stored counters */ - unsigned int res1:16; /* 48-63 Reserved */ -}; - -struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ - /* 0 - 7 */ - union { - struct { - unsigned int clock_base:1; /* TOD clock base set */ - unsigned int speed:1; /* CPU speed set */ - /* Measurement alerts */ - unsigned int mtda:1; /* Loss of MT ctr. data alert */ - unsigned int caca:1; /* Counter auth. change alert */ - unsigned int lcda:1; /* Loss of counter data alert */ - }; - unsigned long flags; /* 0-63 All indicators */ - }; - /* 8 - 15 */ - unsigned int cfvn:16; /* 64-79 Ctr First Version */ - unsigned int csvn:16; /* 80-95 Ctr Second Version */ - unsigned int cpu_speed:32; /* 96-127 CPU speed */ - /* 16 - 23 */ - unsigned long timestamp; /* 128-191 Timestamp (TOD) */ - /* 24 - 55 */ - union { - struct { - unsigned long progusage1; - unsigned long progusage2; - unsigned long progusage3; - unsigned long tod_base; - }; - unsigned long progusage[4]; - }; - /* 56 - 63 */ - unsigned int mach_type:16; /* Machine type */ - unsigned int res1:16; /* Reserved */ - unsigned int res2:32; /* Reserved */ -}; - -/* Create the trailer data at the end of a page. */ -static void cf_diag_trailer(struct cf_trailer_entry *te) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cpuid cpuid; - - te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ - te->csvn = cpuhw->info.csvn; - - get_cpu_id(&cpuid); /* Machine type */ - te->mach_type = cpuid.machine; - te->cpu_speed = cf_diag_cpu_speed; - if (te->cpu_speed) - te->speed = 1; - te->clock_base = 1; /* Save clock base */ - te->tod_base = tod_clock_base.tod; - te->timestamp = get_tod_clock_fast(); -} - -/* - * Change the CPUMF state to active. - * Enable and activate the CPU-counter sets according - * to the per-cpu control state. - */ -static void cf_diag_enable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (cpuhw->flags & PMU_F_ENABLED) - return; - - err = lcctl(cpuhw->state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags |= PMU_F_ENABLED; -} - -/* - * Change the CPUMF state to inactive. - * Disable and enable (inactive) the CPU-counter sets according - * to the per-cpu control state. - */ -static void cf_diag_disable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - u64 inactive; - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (!(cpuhw->flags & PMU_F_ENABLED)) - return; - - inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); - err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags &= ~PMU_F_ENABLED; -} - -/* Number of perf events counting hardware events */ -static atomic_t cf_diag_events = ATOMIC_INIT(0); -/* Used to avoid races in calling reserve/release_cpumf_hardware */ -static DEFINE_MUTEX(cf_diag_reserve_mutex); - -/* Release the PMU if event is the last perf event */ -static void cf_diag_perf_event_destroy(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d cf_diag_events %d\n", - __func__, event, smp_processor_id(), - atomic_read(&cf_diag_events)); - if (atomic_dec_return(&cf_diag_events) == 0) - __kernel_cpumcf_end(); -} - -static int get_authctrsets(void) -{ - struct cpu_cf_events *cpuhw; - unsigned long auth = 0; - enum cpumf_ctr_set i; - - cpuhw = &get_cpu_var(cpu_cf_events); - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) - auth |= cpumf_ctr_ctl[i]; - } - put_cpu_var(cpu_cf_events); - return auth; -} - -/* Setup the event. Test for authorized counter sets and only include counter - * sets which are authorized at the time of the setup. Including unauthorized - * counter sets result in specification exception (and panic). - */ -static int __hw_perf_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__, - event, event->cpu); - - event->hw.config = attr->config; - - /* Add all authorized counter sets to config_base. The - * the hardware init function is either called per-cpu or just once - * for all CPUS (event->cpu == -1). This depends on the whether - * counting is started for all CPUs or on a per workload base where - * the perf event moves from one CPU to another CPU. - * Checking the authorization on any CPU is fine as the hardware - * applies the same authorization settings to all CPUs. - */ - event->hw.config_base = get_authctrsets(); - - /* No authorized counter sets, nothing to count/sample */ - if (!event->hw.config_base) { - err = -EINVAL; - goto out; - } - - /* Set sample_period to indicate sampling */ - event->hw.sample_period = attr->sample_period; - local64_set(&event->hw.period_left, event->hw.sample_period); - event->hw.last_period = event->hw.sample_period; -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n", - __func__, err, event->hw.config_base); - return err; -} - -/* Return 0 if the CPU-measurement counter facility is currently free - * and an error otherwise. - */ -static int cf_diag_perf_event_inuse(void) -{ - int err = 0; - - if (!atomic_inc_not_zero(&cf_diag_events)) { - mutex_lock(&cf_diag_reserve_mutex); - if (atomic_read(&cf_diag_events) == 0 && - __kernel_cpumcf_begin()) - err = -EBUSY; - else - err = atomic_inc_return(&cf_diag_events); - mutex_unlock(&cf_diag_reserve_mutex); - } - return err; -} - -static int cf_diag_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = -ENOENT; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d config %#llx type:%u " - "sample_type %#llx cf_diag_events %d\n", __func__, - event, event->cpu, attr->config, event->pmu->type, - attr->sample_type, atomic_read(&cf_diag_events)); - - if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || - event->attr.type != event->pmu->type) - goto out; - - /* Raw events are used to access counters directly, - * hence do not permit excludes. - * This event is usesless without PERF_SAMPLE_RAW to return counter set - * values as raw data. - */ - if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || - !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { - err = -EOPNOTSUPP; - goto out; - } - - /* Initialize for using the CPU-measurement counter facility */ - err = cf_diag_perf_event_inuse(); - if (err < 0) - goto out; - event->destroy = cf_diag_perf_event_destroy; - - err = __hw_perf_event_init(event); - if (unlikely(err)) - event->destroy(event); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_read(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event); -} - -/* Calculate memory needed to store all counter sets together with header and - * trailer data. This is independend of the counter set authorization which - * can vary depending on the configuration. - */ -static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info) -{ - size_t max_size = sizeof(struct cf_trailer_entry); - enum cpumf_ctr_set i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cpum_cf_ctrset_size(i, info); - - if (size) - max_size += size * sizeof(u64) + - sizeof(struct cf_ctrset_entry); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__, - max_size); - - return max_size; -} - -/* Read a counter set. The counter set number determines which counter set and - * the CPUM-CF first and second version number determine the number of - * available counters in this counter set. - * Each counter set starts with header containing the counter set number and - * the number of 8 byte counters. - * - * The functions returns the number of bytes occupied by this counter set - * including the header. - * If there is no counter in the counter set, this counter set is useless and - * zero is returned on this case. - */ -static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, - size_t room) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - size_t ctrset_size, need = 0; - int rc = 3; /* Assume write failure */ - - ctrdata->def = CF_DIAG_CTRSET_DEF; - ctrdata->set = ctrset; - ctrdata->res1 = 0; - ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); - - if (ctrset_size) { /* Save data */ - need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); - if (need <= room) - rc = ctr_stcctm(ctrset, ctrset_size, - (u64 *)(ctrdata + 1)); - if (rc != 3) - ctrdata->ctr = ctrset_size; - else - need = 0; - } - - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc %d\n", - __func__, ctrset, ctrset_size, cpuhw->info.cfvn, - cpuhw->info.csvn, need, rc); - return need; -} - -/* Read out all counter sets and save them in the provided data buffer. - * The last 64 byte host an artificial trailer entry. - */ -static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth) -{ - struct cf_trailer_entry *trailer; - size_t offset = 0, done; - int i; - - memset(data, 0, sz); - sz -= sizeof(*trailer); /* Always room for trailer */ - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - struct cf_ctrset_entry *ctrdata = data + offset; - - if (!(auth & cpumf_ctr_ctl[i])) - continue; /* Counter set not authorized */ - - done = cf_diag_getctrset(ctrdata, i, sz - offset); - offset += done; - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d offset %zu done %zu\n", - __func__, i, offset, done); - } - trailer = data + offset; - cf_diag_trailer(trailer); - return offset + sizeof(*trailer); -} - -/* Calculate the difference for each counter in a counter set. */ -static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters) -{ - for (; --counters >= 0; ++pstart, ++pstop) - if (*pstop >= *pstart) - *pstop -= *pstart; - else - *pstop = *pstart - *pstop; -} - -/* Scan the counter sets and calculate the difference of each counter - * in each set. The result is the increment of each counter during the - * period the counter set has been activated. - * - * Return true on success. - */ -static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth) -{ - struct cf_trailer_entry *trailer_start, *trailer_stop; - struct cf_ctrset_entry *ctrstart, *ctrstop; - size_t offset = 0; - - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { - ctrstart = (struct cf_ctrset_entry *)(csd->start + offset); - ctrstop = (struct cf_ctrset_entry *)(csd->data + offset); - - if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { - pr_err("cpum_cf_diag counter set compare error " - "in set %i\n", ctrstart->set); - return 0; - } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; - if (ctrstart->def == CF_DIAG_CTRSET_DEF) { - cf_diag_diffctrset((u64 *)(ctrstart + 1), - (u64 *)(ctrstop + 1), ctrstart->ctr); - offset += ctrstart->ctr * sizeof(u64) + - sizeof(*ctrstart); - } - debug_sprintf_event(cf_diag_dbg, 6, - "%s set %d ctr %d offset %zu auth %lx\n", - __func__, ctrstart->set, ctrstart->ctr, - offset, auth); - } while (ctrstart->def && auth); - - /* Save time_stamp from start of event in stop's trailer */ - trailer_start = (struct cf_trailer_entry *)(csd->start + offset); - trailer_stop = (struct cf_trailer_entry *)(csd->data + offset); - trailer_stop->progusage[0] = trailer_start->timestamp; - - return 1; -} - -/* Create perf event sample with the counter sets as raw data. The sample - * is then pushed to the event subsystem and the function checks for - * possible event overflows. If an event overflow occurs, the PMU is - * stopped. - * - * Return non-zero if an event overflow occurred. - */ -static int cf_diag_push_sample(struct perf_event *event, - struct cf_diag_csd *csd) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - perf_sample_data_init(&data, 0, event->hw.last_period); - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - - if (event->attr.sample_type & PERF_SAMPLE_CPU) - data.cpu_entry.cpu = event->cpu; - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = csd->used; - raw.frag.data = csd->data; - raw.size = csd->used; - data.raw = &raw; - } - - overflow = perf_event_overflow(event, &data, ®s); - debug_sprintf_event(cf_diag_dbg, 6, - "%s event %p cpu %d sample_type %#llx raw %d " - "ov %d\n", __func__, event, event->cpu, - event->attr.sample_type, raw.size, overflow); - if (overflow) - event->pmu->stop(event, 0); - - perf_event_update_userpage(event); - return overflow; -} - -static void cf_diag_start(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - /* (Re-)enable and activate all counter sets */ - lcctl(0); /* Reset counter sets */ - hwc->state = 0; - ctr_set_multiple_enable(&cpuhw->state, hwc->config_base); - lcctl(cpuhw->state); /* Enable counter sets */ - csd->used = cf_diag_getctr(csd->start, sizeof(csd->start), - event->hw.config_base); - ctr_set_multiple_start(&cpuhw->state, hwc->config_base); - /* Function cf_diag_enable() starts the counter sets. */ -} - -static void cf_diag_stop(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - - /* Deactivate all counter sets */ - ctr_set_multiple_stop(&cpuhw->state, hwc->config_base); - local64_inc(&event->count); - csd->used = cf_diag_getctr(csd->data, sizeof(csd->data), - event->hw.config_base); - if (cf_diag_diffctr(csd, event->hw.config_base)) - cf_diag_push_sample(event, csd); - hwc->state |= PERF_HES_STOPPED; -} - -static int cf_diag_add(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x cpuhw %p\n", - __func__, event, event->cpu, flags, cpuhw); - - if (cpuhw->flags & PMU_F_IN_USE) { - err = -EAGAIN; - goto out; - } - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - cpuhw->flags |= PMU_F_IN_USE; - if (flags & PERF_EF_START) - cf_diag_start(event, PERF_EF_RELOAD); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_del(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x\n", - __func__, event, event->cpu, flags); - - cf_diag_stop(event, PERF_EF_UPDATE); - ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base); - ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base); - cpuhw->flags &= ~PMU_F_IN_USE; -} - -/* Default counter set events and format attribute groups */ - -CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); - -static struct attribute *cf_diag_events_attr[] = { - CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), - NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *cf_diag_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group cf_diag_events_group = { - .name = "events", - .attrs = cf_diag_events_attr, -}; -static struct attribute_group cf_diag_format_group = { - .name = "format", - .attrs = cf_diag_format_attr, -}; -static const struct attribute_group *cf_diag_attr_groups[] = { - &cf_diag_events_group, - &cf_diag_format_group, - NULL, -}; - -/* Performance monitoring unit for s390x */ -static struct pmu cf_diag = { - .task_ctx_nr = perf_sw_context, - .pmu_enable = cf_diag_enable, - .pmu_disable = cf_diag_disable, - .event_init = cf_diag_event_init, - .add = cf_diag_add, - .del = cf_diag_del, - .start = cf_diag_start, - .stop = cf_diag_stop, - .read = cf_diag_read, - - .attr_groups = cf_diag_attr_groups -}; - -/* Get the CPU speed, try sampling facility first and CPU attributes second. */ -static void cf_diag_get_cpu_speed(void) -{ - if (cpum_sf_avail()) { /* Sampling facility first */ - struct hws_qsi_info_block si; - - memset(&si, 0, sizeof(si)); - if (!qsi(&si)) { - cf_diag_cpu_speed = si.cpu_speed; - return; - } - } - - if (test_facility(34)) { /* CPU speed extract static part */ - unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); - - if (mhz != -1UL) - cf_diag_cpu_speed = mhz & 0xffffffff; - } -} - -/* Code to create device and file I/O operations */ -static atomic_t ctrset_opencnt = ATOMIC_INIT(0); /* Excl. access */ - -static int cf_diag_open(struct inode *inode, struct file *file) -{ - int err = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (atomic_xchg(&ctrset_opencnt, 1)) - return -EBUSY; - - /* Avoid concurrent access with perf_event_open() system call */ - mutex_lock(&cf_diag_reserve_mutex); - if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin()) - err = -EBUSY; - mutex_unlock(&cf_diag_reserve_mutex); - if (err) { - atomic_set(&ctrset_opencnt, 0); - return err; - } - file->private_data = NULL; - debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); - /* nonseekable_open() never fails */ - return nonseekable_open(inode, file); -} - -/* Variables for ioctl() interface support */ -static DEFINE_MUTEX(cf_diag_ctrset_mutex); -static struct cf_diag_ctrset { - unsigned long ctrset; /* Bit mask of counter set to read */ - cpumask_t mask; /* CPU mask to read from */ -} cf_diag_ctrset; - -static void cf_diag_ctrset_clear(void) -{ - cpumask_clear(&cf_diag_ctrset.mask); - cf_diag_ctrset.ctrset = 0; -} - -static void cf_diag_release_cpu(void *p) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__, - smp_processor_id()); - lcctl(0); /* Reset counter sets */ - cpuhw->state = 0; /* Save state in CPU hardware state */ -} - -/* Release function is also called when application gets terminated without - * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. - * Since only one application is allowed to open the device, simple stop all - * CPU counter sets. - */ -static int cf_diag_release(struct inode *inode, struct file *file) -{ - on_each_cpu(cf_diag_release_cpu, NULL, 1); - cf_diag_ctrset_clear(); - atomic_set(&ctrset_opencnt, 0); - __kernel_cpumcf_end(); - debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); - return 0; -} - -struct cf_diag_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ - unsigned int sets; /* Counter set bit mask */ - atomic_t cpus_ack; /* # CPUs successfully executed func */ -}; - -static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask) -{ - struct s390_ctrset_read __user *ctrset_read; - unsigned int cpu, cpus, rc; - void __user *uptr; - - ctrset_read = (struct s390_ctrset_read __user *)arg; - uptr = ctrset_read->data; - for_each_cpu(cpu, mask) { - struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu); - struct s390_ctrset_cpudata __user *ctrset_cpudata; - - ctrset_cpudata = uptr; - debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n", - __func__, cpu, csd->used); - rc = put_user(cpu, &ctrset_cpudata->cpu_nr); - rc |= put_user(csd->sets, &ctrset_cpudata->no_sets); - rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used); - if (rc) - return -EFAULT; - uptr += sizeof(struct s390_ctrset_cpudata) + csd->used; - cond_resched(); - } - cpus = cpumask_weight(mask); - if (put_user(cpus, &ctrset_read->no_cpus)) - return -EFAULT; - debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n", - __func__, uptr - (void __user *)ctrset_read->data); - return 0; -} - -static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, - int ctrset_size, size_t room) -{ - size_t need = 0; - int rc = -1; - - need = sizeof(*p) + sizeof(u64) * ctrset_size; - debug_sprintf_event(cf_diag_dbg, 5, - "%s room %zd need %zd set %#x set_size %d\n", - __func__, room, need, ctrset, ctrset_size); - if (need <= room) { - p->set = cpumf_ctr_ctl[ctrset]; - p->no_cnts = ctrset_size; - rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); - if (rc == 3) /* Nothing stored */ - need = 0; - } - debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__, - need, rc); - return need; -} - -/* Read all counter sets. Since the perf_event_open() system call with - * event cpum_cf_diag/.../ is blocked when this interface is active, reuse - * the perf_event_open() data buffer to store the counter sets. - */ -static void cf_diag_cpu_read(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct cf_diag_call_on_cpu_parm *p = parm; - int set, set_size; - size_t space; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - /* No data saved yet */ - csd->used = 0; - csd->sets = 0; - memset(csd->data, 0, sizeof(csd->data)); - - /* Scan the counter sets */ - for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { - struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used; - - if (!(p->sets & cpumf_ctr_ctl[set])) - continue; /* Counter set not in list */ - set_size = cpum_cf_ctrset_size(set, &cpuhw->info); - space = sizeof(csd->data) - csd->used; - space = cf_diag_cpuset_read(sp, set, set_size, space); - if (space) { - csd->used += space; - csd->sets += 1; - } - debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n", - __func__, sp, space); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__, - csd->sets, csd->used); -} - -static int cf_diag_all_read(unsigned long arg) -{ - struct cf_diag_call_on_cpu_parm p; - cpumask_var_t mask; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - p.sets = cf_diag_ctrset.ctrset; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1); - rc = cf_diag_all_copy(arg, mask); - free_cpumask_var(mask); - debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc); - return rc; -} - -/* Stop all counter sets via ioctl interface */ -static void cf_diag_ioctl_off(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_call_on_cpu_parm *p = parm; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - - ctr_set_multiple_disable(&cpuhw->state, p->sets); - ctr_set_multiple_stop(&cpuhw->state, p->sets); - rc = lcctl(cpuhw->state); /* Stop counter sets */ - if (!cpuhw->state) - cpuhw->flags &= ~PMU_F_IN_USE; - debug_sprintf_event(cf_diag_dbg, 5, - "%s rc %d flags %#x state %#llx\n", __func__, - rc, cpuhw->flags, cpuhw->state); -} - -/* Start counter sets on particular CPU */ -static void cf_diag_ioctl_on(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_call_on_cpu_parm *p = parm; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - - if (!(cpuhw->flags & PMU_F_IN_USE)) - cpuhw->state = 0; - cpuhw->flags |= PMU_F_IN_USE; - rc = lcctl(cpuhw->state); /* Reset unused counter sets */ - ctr_set_multiple_enable(&cpuhw->state, p->sets); - ctr_set_multiple_start(&cpuhw->state, p->sets); - rc |= lcctl(cpuhw->state); /* Start counter sets */ - if (!rc) - atomic_inc(&p->cpus_ack); - debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n", - __func__, rc, cpuhw->state); -} - -static int cf_diag_all_stop(void) -{ - struct cf_diag_call_on_cpu_parm p = { - .sets = cf_diag_ctrset.ctrset, - }; - cpumask_var_t mask; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); - free_cpumask_var(mask); - return 0; -} - -static int cf_diag_all_start(void) -{ - struct cf_diag_call_on_cpu_parm p = { - .sets = cf_diag_ctrset.ctrset, - .cpus_ack = ATOMIC_INIT(0), - }; - cpumask_var_t mask; - int rc = 0; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1); - if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { - on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); - rc = -EIO; - } - free_cpumask_var(mask); - return rc; -} - -/* Return the maximum required space for all possible CPUs in case one - * CPU will be onlined during the START, READ, STOP cycles. - * To find out the size of the counter sets, any one CPU will do. They - * all have the same counter sets. - */ -static size_t cf_diag_needspace(unsigned int sets) -{ - struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); - size_t bytes = 0; - int i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (!(sets & cpumf_ctr_ctl[i])) - continue; - bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + - sizeof(((struct s390_ctrset_setdata *)0)->set) + - sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); - } - bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * - (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + - sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); - debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__, - bytes); - put_cpu_ptr(&cpu_cf_events); - return bytes; -} - -static long cf_diag_ioctl_read(unsigned long arg) -{ - struct s390_ctrset_read read; - int ret = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - if (copy_from_user(&read, (char __user *)arg, sizeof(read))) - return -EFAULT; - ret = cf_diag_all_read(arg); - debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); - return ret; -} - -static long cf_diag_ioctl_stop(void) -{ - int ret; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - ret = cf_diag_all_stop(); - cf_diag_ctrset_clear(); - debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); - return ret; -} - -static long cf_diag_ioctl_start(unsigned long arg) -{ - struct s390_ctrset_start __user *ustart; - struct s390_ctrset_start start; - void __user *umask; - unsigned int len; - int ret = 0; - size_t need; - - if (cf_diag_ctrset.ctrset) - return -EBUSY; - ustart = (struct s390_ctrset_start __user *)arg; - if (copy_from_user(&start, ustart, sizeof(start))) - return -EFAULT; - if (start.version != S390_HWCTR_START_VERSION) - return -EINVAL; - if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | - cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | - cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | - cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | - cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) - return -EINVAL; /* Invalid counter set */ - if (!start.counter_sets) - return -EINVAL; /* No counter set at all? */ - cpumask_clear(&cf_diag_ctrset.mask); - len = min_t(u64, start.cpumask_len, cpumask_size()); - umask = (void __user *)start.cpumask; - if (copy_from_user(&cf_diag_ctrset.mask, umask, len)) - return -EFAULT; - if (cpumask_empty(&cf_diag_ctrset.mask)) - return -EINVAL; - need = cf_diag_needspace(start.counter_sets); - if (put_user(need, &ustart->data_bytes)) - ret = -EFAULT; - if (ret) - goto out; - cf_diag_ctrset.ctrset = start.counter_sets; - ret = cf_diag_all_start(); -out: - if (ret) - cf_diag_ctrset_clear(); - debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n", - __func__, cf_diag_ctrset.ctrset, need, ret); - return ret; -} - -static long cf_diag_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret; - - debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__, - cmd, arg); - get_online_cpus(); - mutex_lock(&cf_diag_ctrset_mutex); - switch (cmd) { - case S390_HWCTR_START: - ret = cf_diag_ioctl_start(arg); - break; - case S390_HWCTR_STOP: - ret = cf_diag_ioctl_stop(); - break; - case S390_HWCTR_READ: - ret = cf_diag_ioctl_read(arg); - break; - default: - ret = -ENOTTY; - break; - } - mutex_unlock(&cf_diag_ctrset_mutex); - put_online_cpus(); - debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret); - return ret; -} - -static const struct file_operations cf_diag_fops = { - .owner = THIS_MODULE, - .open = cf_diag_open, - .release = cf_diag_release, - .unlocked_ioctl = cf_diag_ioctl, - .compat_ioctl = cf_diag_ioctl, - .llseek = no_llseek -}; - -static struct miscdevice cf_diag_dev = { - .name = S390_HWCTR_DEVICE, - .minor = MISC_DYNAMIC_MINOR, - .fops = &cf_diag_fops, -}; - -static int cf_diag_online_cpu(unsigned int cpu) -{ - struct cf_diag_call_on_cpu_parm p; - - mutex_lock(&cf_diag_ctrset_mutex); - if (!cf_diag_ctrset.ctrset) - goto out; - p.sets = cf_diag_ctrset.ctrset; - cf_diag_ioctl_on(&p); -out: - mutex_unlock(&cf_diag_ctrset_mutex); - return 0; -} - -static int cf_diag_offline_cpu(unsigned int cpu) -{ - struct cf_diag_call_on_cpu_parm p; - - mutex_lock(&cf_diag_ctrset_mutex); - if (!cf_diag_ctrset.ctrset) - goto out; - p.sets = cf_diag_ctrset.ctrset; - cf_diag_ioctl_off(&p); -out: - mutex_unlock(&cf_diag_ctrset_mutex); - return 0; -} - -/* Initialize the counter set PMU to generate complete counter set data as - * event raw data. This relies on the CPU Measurement Counter Facility device - * already being loaded and initialized. - */ -static int __init cf_diag_init(void) -{ - struct cpumf_ctr_info info; - size_t need; - int rc; - - if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info)) - return -ENODEV; - cf_diag_get_cpu_speed(); - - /* Make sure the counter set data fits into predefined buffer. */ - need = cf_diag_ctrset_maxsize(&info); - if (need > sizeof(((struct cf_diag_csd *)0)->start)) { - pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", - need); - return -ENOMEM; - } - - rc = misc_register(&cf_diag_dev); - if (rc) { - pr_err("Registration of /dev/" S390_HWCTR_DEVICE - "failed rc=%d\n", rc); - goto out; - } - - /* Setup s390dbf facility */ - cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); - if (!cf_diag_dbg) { - pr_err("Registration of s390dbf(cpum_cf_diag) failed\n"); - rc = -ENOMEM; - goto out_dbf; - } - debug_register_view(cf_diag_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); - if (rc) { - pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", - rc); - goto out_perf; - } - rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE, - "perf/s390/cfd:online", - cf_diag_online_cpu, cf_diag_offline_cpu); - if (!rc) - goto out; - - pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n", - rc); - perf_pmu_unregister(&cf_diag); -out_perf: - debug_unregister_view(cf_diag_dbg, &debug_sprintf_view); - debug_unregister(cf_diag_dbg); -out_dbf: - misc_deregister(&cf_diag_dev); -out: - return rc; -} -device_initcall(cf_diag_init); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 7ae5dde9c54d..350e94d0cac2 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -166,6 +166,12 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, p->thread.acrs[1] = (unsigned int)tls; } } + /* + * s390 stores the svc return address in arch_data when calling + * sigreturn()/restart_syscall() via vdso. 1 means no valid address + * stored. + */ + p->restart_block.arch_data = 1; return 0; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 5486d82470b5..ff0f9e838916 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -354,7 +354,7 @@ void __init arch_call_rest_init(void) set_task_stack_end_magic(current); stack += STACK_INIT_OFFSET; S390_lowcore.kernel_stack = stack; - CALL_ON_STACK_NORETURN(rest_init, stack); + call_on_stack_noreturn(rest_init, stack); } static void __init setup_lowcore_dat_off(void) @@ -442,6 +442,7 @@ static void __init setup_lowcore_dat_off(void) lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; set_prefix((u32)(unsigned long) lc); lowcore_ptr[0] = lc; diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 080e7aed181f..78ef53b29958 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -32,6 +32,7 @@ #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> #include "entry.h" /* @@ -171,7 +172,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } @@ -334,15 +334,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { + if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; - } else { - /* Signal frame without vector registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -397,14 +392,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { + if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; - } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -501,7 +492,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) } /* No longer in a system call */ clear_pt_regs_flag(regs, PIF_SYSCALL); - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); + rseq_signal_deliver(&ksig, regs); if (is_compat_task()) handle_signal32(&ksig, oldset, regs); @@ -517,14 +508,20 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ - regs->int_code = __NR_restart_syscall; - fallthrough; + regs->gprs[2] = regs->orig_gpr2; + current->restart_block.arch_data = regs->psw.addr; + if (is_compat_task()) + regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); + else + regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_thread_flag(TIF_PER_TRAP); + break; case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - /* Restart system call with magic TIF bit. */ regs->gprs[2] = regs->orig_gpr2; - set_pt_regs_flag(regs, PIF_SYSCALL_RESTART); + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); if (test_thread_flag(TIF_SINGLE_STEP)) clear_thread_flag(TIF_PER_TRAP); break; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index ff42d3aa0f00..8984711f72ed 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -210,6 +210,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; if (nmi_alloc_per_cpu(lc)) goto out; lowcore_ptr[cpu] = lc; @@ -300,24 +301,28 @@ static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); } +typedef void (pcpu_delegate_fn)(void *); + /* * Call function via PSW restart on pcpu and stop the current cpu. */ -static void __pcpu_delegate(void (*func)(void*), void *data) +static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) { func(data); /* should not return */ } static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu, - void (*func)(void *), + pcpu_delegate_fn *func, void *data, unsigned long stack) { struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; unsigned long source_cpu = stap(); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - if (pcpu->address == source_cpu) - CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data); + if (pcpu->address == source_cpu) { + call_on_stack(2, stack, void, __pcpu_delegate, + pcpu_delegate_fn *, func, void *, data); + } /* Stop target cpu (if func returns this stops the current cpu). */ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); /* Restart func on the target cpu and stop the current cpu. */ @@ -898,7 +903,7 @@ static void __no_sanitize_address smp_start_secondary(void *cpuvoid) S390_lowcore.restart_source = -1UL; __ctl_load(S390_lowcore.cregs_save_area, 0, 15); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack); + call_on_stack_noreturn(smp_init_secondary, S390_lowcore.kernel_stack); } /* Upping and downing of CPUs */ diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 76f7916cc30f..8fe2d23b64f4 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -108,7 +108,7 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -void do_syscall(struct pt_regs *regs) +static void do_syscall(struct pt_regs *regs) { unsigned long nr; @@ -121,6 +121,10 @@ void do_syscall(struct pt_regs *regs) regs->gprs[2] = nr; + if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { + regs->psw.addr = current->restart_block.arch_data; + current->restart_block.arch_data = 1; + } nr = syscall_enter_from_user_mode_work(regs, nr); /* @@ -130,13 +134,16 @@ void do_syscall(struct pt_regs *regs) * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here * and if set, the syscall will be skipped. */ - if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) { - regs->gprs[2] = -ENOSYS; - if (likely(nr < NR_syscalls)) - regs->gprs[2] = current->thread.sys_call_table[nr](regs); - } else { - clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); - } + + if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) + goto out; + regs->gprs[2] = -ENOSYS; + if (likely(nr >= NR_syscalls)) + goto out; + do { + regs->gprs[2] = current->thread.sys_call_table[nr](regs); + } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); +out: syscall_exit_to_user_mode_work(regs); } @@ -154,13 +161,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap) if (per_trap) set_thread_flag(TIF_PER_TRAP); - for (;;) { - regs->flags = 0; - set_pt_regs_flag(regs, PIF_SYSCALL); - do_syscall(regs); - if (!test_pt_regs_flag(regs, PIF_SYSCALL_RESTART)) - break; - local_irq_enable(); - } + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); + do_syscall(regs); exit_to_user_mode(); } diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 019c5748b607..76947275fe8b 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -277,6 +277,8 @@ static void __init test_monitor_call(void) { int val = 1; + if (!IS_ENABLED(CONFIG_BUG)) + return; asm volatile( " mc 0,0\n" "0: xgr %0,%0\n" @@ -299,10 +301,9 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { unsigned long last_break = S390_lowcore.breaking_event_addr; - unsigned int trapnr, syscall_redirect = 0; + unsigned int trapnr; irqentry_state_t state; - add_random_kstack_offset(); regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; regs->int_parm_long = S390_lowcore.trans_exc_code; @@ -344,18 +345,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs) trapnr = regs->int_code & PGM_INT_CODE_MASK; if (trapnr) pgm_check_table[trapnr](regs); - syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL); out: local_irq_disable(); irqentry_exit(regs, state); - - if (syscall_redirect) { - enter_from_user_mode(regs); - local_irq_enable(); - regs->orig_gpr2 = regs->gprs[2]; - do_syscall(regs); - exit_to_user_mode(); - } } /* diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 6be2167943bb..aeb0a15bcbb7 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -358,6 +358,15 @@ static ssize_t uv_query_facilities(struct kobject *kobj, static struct kobj_attribute uv_query_facilities_attr = __ATTR(facilities, 0444, uv_query_facilities, NULL); +static ssize_t uv_query_feature_indications(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications); +} + +static struct kobj_attribute uv_query_feature_indications_attr = + __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); + static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -390,6 +399,7 @@ static struct kobj_attribute uv_query_max_guest_addr_attr = static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, + &uv_query_feature_indications_attr.attr, &uv_query_max_guest_cpus_attr.attr, &uv_query_max_guest_vms_attr.attr, &uv_query_max_guest_addr_attr.attr, diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 8c4e07d533c8..99694260cac9 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -20,7 +20,7 @@ #include <asm/vdso.h> extern char vdso64_start[], vdso64_end[]; -static unsigned int vdso_pages; +extern char vdso32_start[], vdso32_end[]; static struct vm_special_mapping vvar_mapping; @@ -37,18 +37,6 @@ enum vvar_pages { VVAR_NR_PAGES, }; -unsigned int __read_mostly vdso_enabled = 1; - -static int __init vdso_setup(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - vdso_enabled = enabled; - return 1; -} -__setup("vdso=", vdso_setup); - #ifdef CONFIG_TIME_NS struct vdso_data *arch_get_vdso_data(void *vvar_page) { @@ -155,7 +143,12 @@ static struct vm_special_mapping vvar_mapping = { .fault = vvar_fault, }; -static struct vm_special_mapping vdso_mapping = { +static struct vm_special_mapping vdso64_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +static struct vm_special_mapping vdso32_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }; @@ -171,16 +164,22 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { unsigned long vdso_text_len, vdso_mapping_len; unsigned long vvar_start, vdso_text_start; + struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); - if (!vdso_enabled || is_compat_task()) - return 0; if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_text_len = vdso_pages << PAGE_SHIFT; + + if (is_compat_task()) { + vdso_text_len = vdso32_end - vdso32_start; + vdso_mapping = &vdso32_mapping; + } else { + vdso_text_len = vdso64_end - vdso64_start; + vdso_mapping = &vdso64_mapping; + } vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); rc = vvar_start; @@ -198,7 +197,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_mapping); + vdso_mapping); if (IS_ERR(vma)) { do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); @@ -211,21 +210,25 @@ out: return rc; } -static int __init vdso_init(void) +static struct page ** __init vdso_setup_pages(void *start, void *end) { - struct page **pages; + int pages = (end - start) >> PAGE_SHIFT; + struct page **pagelist; int i; - vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT; - pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - vdso_enabled = 0; - return -ENOMEM; - } - for (i = 0; i < vdso_pages; i++) - pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE); - pages[vdso_pages] = NULL; - vdso_mapping.pages = pages; + pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VDSO", __func__); + for (i = 0; i < pages; i++) + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + return pagelist; +} + +static int __init vdso_init(void) +{ + vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); + if (IS_ENABLED(CONFIG_COMPAT)) + vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); return 0; } arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..5167384843b9 --- /dev/null +++ b/arch/s390/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso32.lds diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile new file mode 100644 index 000000000000..b2349a3f4fa3 --- /dev/null +++ b/arch/s390/kernel/vdso32/Makefile @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +KCOV_INSTRUMENT := n +ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE +ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT + +include $(srctree)/lib/vdso/Makefile +obj-vdso32 = vdso_user_wrapper-32.o note-32.o + +# Build rules + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 += -m31 -s + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin + +LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \ + --hash-style=both --build-id=sha1 -melf_s390 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) + +obj-y += vdso32_wrapper.o +CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,ld) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj-vdso32): %-32.o: %.S FORCE + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso32cc = VDSO32C $@ + cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso32.so: $(obj)/vdso32.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso32.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh new file mode 100755 index 000000000000..9c4f951e227d --- /dev/null +++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S new file mode 100644 index 000000000000..db19d0680a0a --- /dev/null +++ b/arch/s390/kernel/vdso32/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S new file mode 100644 index 000000000000..bff50b6acd6d --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ + +#include <asm/page.h> +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +OUTPUT_ARCH(s390:31-bit) +ENTRY(_start) + +SECTIONS +{ + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_compat_restart_syscall; + __kernel_compat_rt_sigreturn; + __kernel_compat_sigreturn; + local: *; + }; +} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 000000000000..de2fb930471a --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/s390/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S new file mode 100644 index 000000000000..3f42f27f978c --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/unistd.h> +#include <asm/dwarf.h> + +.macro vdso_syscall func,syscall + .globl __kernel_compat_\func + .type __kernel_compat_\func,@function + .align 8 +__kernel_compat_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_compat_\func,.-__kernel_compat_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index a6e0fb6b91d6..2a2092ce19f1 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -74,3 +74,11 @@ vdso64.so: $(obj)/vdso64.so.dbg $(call cmd,vdso_install) vdso_install: vdso64.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh new file mode 100755 index 000000000000..37f05cb38dad --- /dev/null +++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 518f1ea405f4..d4fb336d747b 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -17,7 +17,7 @@ SECTIONS #ifdef CONFIG_TIME_NS PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); #endif - . = VDSO64_LBASE + SIZEOF_HEADERS; + . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -137,6 +137,9 @@ VERSION __kernel_clock_gettime; __kernel_clock_getres; __kernel_getcpu; + __kernel_restart_syscall; + __kernel_rt_sigreturn; + __kernel_sigreturn; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S index f773505c7e63..97f0c0a669a5 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -37,3 +37,20 @@ vdso_func gettimeofday vdso_func clock_getres vdso_func clock_gettime vdso_func getcpu + +.macro vdso_syscall func,syscall + .globl __kernel_\func + .type __kernel_\func,@function + .align 8 +__kernel_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn |