From 56e62a73702836017564eaacd5212e4d0fa1c01d Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Sat, 21 Nov 2020 11:14:56 +0100
Subject: s390: convert to generic entry

This patch converts s390 to use the generic entry infrastructure from
kernel/entry/*.

There are a few special things on s390:

- PIF_PER_TRAP is moved to TIF_PER_TRAP as the generic code doesn't
  know about our PIF flags in exit_to_user_mode_loop().

- The old code had several ways to restart syscalls:

  a) PIF_SYSCALL_RESTART, which was only set during execve to force a
     restart after upgrading a process (usually qemu-kvm) to pgste page
     table extensions.

  b) PIF_SYSCALL, which is set by do_signal() to indicate that the
     current syscall should be restarted. This is changed so that
     do_signal() now also uses PIF_SYSCALL_RESTART. Continuing to use
     PIF_SYSCALL doesn't work with the generic code, and changing it to
     PIF_SYSCALL_RESTART gives PIF_SYSCALL and PIF_SYSCALL_RESTART
     clearly distinct meanings.

- On s390, calling sys_sigreturn or sys_rt_sigreturn is implemented by
  executing an svc instruction on the process stack, which causes a
  fault. While handling that fault, the fault code sets PIF_SYSCALL to
  hand over processing to the syscall code on exit to usermode.

The patch introduces PIF_SYSCALL_RET_SET, which is set if ptrace sets a
return value for a syscall. The s390x ptrace ABI uses r2 both for the
syscall number and the return value, so ptrace cannot set the syscall
number and the return value at the same time. The flag makes handling
that easier: do_syscall() will just skip executing the syscall if
PIF_SYSCALL_RET_SET is set.

CONFIG_DEBUG_ASCE was removed in favour of the generic
CONFIG_DEBUG_ENTRY. CR1/7/13 are checked on both kernel entry and exit
to ensure they contain the correct ASCEs.

Signed-off-by: Sven Schnelle
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/Makefile        |   2 +-
 arch/s390/kernel/asm-offsets.c   |  19 +-
 arch/s390/kernel/compat_signal.c |   1 +
 arch/s390/kernel/entry.S         | 803 +++++----------------------------------
 arch/s390/kernel/entry.h         |  12 +-
 arch/s390/kernel/fpu.c           |  88 +++++
 arch/s390/kernel/idle.c          |  24 ++
 arch/s390/kernel/irq.c           |  89 ++++-
 arch/s390/kernel/nmi.c           |  19 +-
 arch/s390/kernel/process.c       |  30 +-
 arch/s390/kernel/ptrace.c        | 117 +-----
 arch/s390/kernel/setup.c         |   3 +-
 arch/s390/kernel/signal.c        |  12 +-
 arch/s390/kernel/smp.c           |   2 +-
 arch/s390/kernel/sys_s390.c      | 102 -----
 arch/s390/kernel/syscall.c       | 172 +++++++++
 arch/s390/kernel/traps.c         |  65 ++++
 arch/s390/kernel/uprobes.c       |   6 +-
 18 files changed, 591 insertions(+), 975 deletions(-)
 delete mode 100644 arch/s390/kernel/sys_s390.c
 create mode 100644 arch/s390/kernel/syscall.c

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index dd73b7f07423..c97818a382f3 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -34,7 +34,7 @@ CFLAGS_dumpstack.o += -fno-optimize-sibling-calls
 CFLAGS_unwind_bc.o += -fno-optimize-sibling-calls
 
 obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o
-obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
+obj-y += processor.o syscall.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o
 obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o
 obj-y += sysinfo.o lgr.o os_info.o machine_kexec.o pgm_check.o
 obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 79724d861dc9..d22bb28ef50c 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@
-26,26 +26,14 @@ int main(void) BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); - OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table); - OFFSET(__THREAD_last_break, thread_struct, last_break); - OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc); - OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs); - OFFSET(__THREAD_per_cause, thread_struct, per_event.cause); - OFFSET(__THREAD_per_address, thread_struct, per_event.address); - OFFSET(__THREAD_per_paid, thread_struct, per_event.paid); - OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb); BLANK(); /* thread info offsets */ OFFSET(__TI_flags, task_struct, thread_info.flags); BLANK(); /* pt_regs offsets */ - OFFSET(__PT_ARGS, pt_regs, args); OFFSET(__PT_PSW, pt_regs, psw); OFFSET(__PT_GPRS, pt_regs, gprs); OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2); - OFFSET(__PT_INT_CODE, pt_regs, int_code); - OFFSET(__PT_INT_PARM, pt_regs, int_parm); - OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long); OFFSET(__PT_FLAGS, pt_regs, flags); OFFSET(__PT_CR1, pt_regs, cr1); DEFINE(__PT_SIZE, sizeof(struct pt_regs)); @@ -64,6 +52,7 @@ int main(void) OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit); OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter); OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit); + OFFSET(__MT_CYCLES_ENTER, s390_idle_data, mt_cycles_enter); BLANK(); /* hardware defined lowcore locations 0x000 - 0x1ff */ OFFSET(__LC_EXT_PARAMS, lowcore, ext_params); @@ -115,13 +104,9 @@ int main(void) OFFSET(__LC_CPU_FLAGS, lowcore, cpu_flags); OFFSET(__LC_RETURN_PSW, lowcore, return_psw); OFFSET(__LC_RETURN_MCCK_PSW, lowcore, return_mcck_psw); - OFFSET(__LC_SYNC_ENTER_TIMER, lowcore, sync_enter_timer); - OFFSET(__LC_ASYNC_ENTER_TIMER, lowcore, async_enter_timer); + OFFSET(__LC_SYS_ENTER_TIMER, lowcore, sys_enter_timer); OFFSET(__LC_MCCK_ENTER_TIMER, lowcore, mcck_enter_timer); OFFSET(__LC_EXIT_TIMER, lowcore, exit_timer); - OFFSET(__LC_USER_TIMER, lowcore, user_timer); - OFFSET(__LC_SYSTEM_TIMER, lowcore, system_timer); - OFFSET(__LC_STEAL_TIMER, lowcore, steal_timer); OFFSET(__LC_LAST_UPDATE_TIMER, lowcore, last_update_timer); OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 38d4bdbc34b9..1d0e17ec93eb 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -118,6 +118,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, &current->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ + clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index f1ba197b10c0..785425b59ac1 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -51,38 +51,8 @@ STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER STACK_SIZE = 1 << STACK_SHIFT STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE -_TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ - _TIF_UPROBE | _TIF_GUARDED_STORAGE | _TIF_PATCH_PENDING | \ - _TIF_NOTIFY_SIGNAL) -_TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ - _TIF_SYSCALL_TRACEPOINT) -_CIF_WORK = (_CIF_FPU) -_PIF_WORK = (_PIF_PER_TRAP | _PIF_SYSCALL_RESTART) - _LPP_OFFSET = __LC_LPP - .macro TRACE_IRQS_ON -#ifdef CONFIG_TRACE_IRQFLAGS - basr %r2,%r0 - brasl %r14,trace_hardirqs_on_caller -#endif -
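For reference: the _TIF_WORK/_PIF_WORK dispatch deleted above is what the generic exit loop now does in C. A condensed sketch, assuming the shape of exit_to_user_mode_loop() in kernel/entry/common.c of this kernel generation (details differ between versions); note that it only inspects TIF bits, which is why PIF_PER_TRAP has to become TIF_PER_TRAP:

	static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
						    unsigned long ti_work)
	{
		while (ti_work & EXIT_TO_USER_MODE_WORK) {
			local_irq_enable_exit_to_user(ti_work);

			if (ti_work & _TIF_NEED_RESCHED)
				schedule();
			if (ti_work & _TIF_UPROBE)
				uprobe_notify_resume(regs);
			if (ti_work & _TIF_PATCH_PENDING)
				klp_update_patch_state(current);
			if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL))
				arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING);
			if (ti_work & _TIF_NOTIFY_RESUME)
				tracehook_notify_resume(regs);

			/* architecture specific TIF work, e.g. TIF_PER_TRAP */
			arch_exit_to_user_mode_work(regs, ti_work);

			local_irq_disable_exit_to_user();
			ti_work = READ_ONCE(current_thread_info()->flags);
		}
		return ti_work;
	}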
.endm - - .macro TRACE_IRQS_OFF -#ifdef CONFIG_TRACE_IRQFLAGS - basr %r2,%r0 - brasl %r14,trace_hardirqs_off_caller -#endif - .endm - - .macro LOCKDEP_SYS_EXIT -#ifdef CONFIG_LOCKDEP - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jz .+10 - brasl %r14,lockdep_sys_exit -#endif - .endm - .macro CHECK_STACK savearea #ifdef CONFIG_CHECK_STACK tml %r15,STACK_SIZE - CONFIG_STACK_GUARD @@ -91,12 +61,6 @@ _LPP_OFFSET = __LC_LPP #endif .endm - .macro DEBUG_USER_ASCE -#ifdef CONFIG_DEBUG_USER_ASCE - brasl %r14,debug_user_asce -#endif - .endm - .macro CHECK_VMAP_STACK savearea,oklabel #ifdef CONFIG_VMAP_STACK lgr %r14,%r15 @@ -117,9 +81,9 @@ _LPP_OFFSET = __LC_LPP #endif .endm - .macro SWITCH_ASYNC savearea,timer,clock + .macro SWITCH_KERNEL savearea tmhh %r8,0x0001 # interrupting from user ? - jnz 4f + jnz 1f #if IS_ENABLED(CONFIG_KVM) lgr %r14,%r9 larl %r13,.Lsie_gmap @@ -130,92 +94,16 @@ _LPP_OFFSET = __LC_LPP lghi %r11,\savearea # inside critical section, do cleanup brasl %r14,.Lcleanup_sie #endif -0: larl %r13,.Lpsw_idle_exit - cgr %r13,%r9 - jne 3f - - larl %r1,smp_cpu_mtid - llgf %r1,0(%r1) - ltgr %r1,%r1 - jz 2f # no SMT, skip mt_cycles calculation - .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+80(%r15) - larl %r3,mt_cycles - ag %r3,__LC_PERCPU_OFFSET - la %r4,__SF_EMPTY+16(%r15) -1: lg %r0,0(%r3) - slg %r0,0(%r4) - alg %r0,64(%r4) - stg %r0,0(%r3) - la %r3,8(%r3) - la %r4,8(%r4) - brct %r1,1b - -2: mvc __CLOCK_IDLE_EXIT(8,%r2), \clock - mvc __TIMER_IDLE_EXIT(8,%r2), \timer - # account system time going idle - ni __LC_CPU_FLAGS+7,255-_CIF_ENABLED_WAIT - - lg %r13,__LC_STEAL_TIMER - alg %r13,__CLOCK_IDLE_ENTER(%r2) - slg %r13,__LC_LAST_UPDATE_CLOCK - stg %r13,__LC_STEAL_TIMER - - mvc __LC_LAST_UPDATE_CLOCK(8),__CLOCK_IDLE_EXIT(%r2) - - lg %r13,__LC_SYSTEM_TIMER - alg %r13,__LC_LAST_UPDATE_TIMER - slg %r13,__TIMER_IDLE_ENTER(%r2) - stg %r13,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),__TIMER_IDLE_EXIT(%r2) - - nihh %r8,0xfcfd # clear wait state and irq bits -3: lg %r14,__LC_ASYNC_STACK # are we already on the target stack? 
- slgr %r14,%r15 - srag %r14,%r14,STACK_SHIFT - jnz 5f - CHECK_STACK \savearea +0: CHECK_STACK \savearea + lgr %r11,%r15 aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 6f -4: UPDATE_VTIME %r14,%r15,\timer - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP -5: lg %r15,__LC_ASYNC_STACK # load async stack -6: la %r11,STACK_FRAME_OVERHEAD(%r15) - .endm - - .macro UPDATE_VTIME w1,w2,enter_timer - lg \w1,__LC_EXIT_TIMER - lg \w2,__LC_LAST_UPDATE_TIMER - slg \w1,\enter_timer - slg \w2,__LC_EXIT_TIMER - alg \w1,__LC_USER_TIMER - alg \w2,__LC_SYSTEM_TIMER - stg \w1,__LC_USER_TIMER - stg \w2,__LC_SYSTEM_TIMER - mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer - .endm - - .macro RESTORE_SM_CLEAR_PER - stg %r8,__LC_RETURN_PSW - ni __LC_RETURN_PSW,0xbf - ssm __LC_RETURN_PSW - .endm - - .macro ENABLE_INTS - stosm __SF_EMPTY(%r15),3 - .endm - - .macro ENABLE_INTS_TRACE - TRACE_IRQS_ON - ENABLE_INTS - .endm - - .macro DISABLE_INTS - stnsm __SF_EMPTY(%r15),0xfc - .endm - - .macro DISABLE_INTS_TRACE - DISABLE_INTS - TRACE_IRQS_OFF + stg %r11,__SF_BACKCHAIN(%r15) + j 2f +1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP + lctlg %c1,%c1,__LC_KERNEL_ASCE + lg %r15,__LC_KERNEL_STACK + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) +2: la %r11,STACK_FRAME_OVERHEAD(%r15) .endm .macro STCK savearea @@ -267,18 +155,17 @@ _LPP_OFFSET = __LC_LPP "jnz .+8; .long 0xb2e8d000", 82 .endm - GEN_BR_THUNK %r9 GEN_BR_THUNK %r14 GEN_BR_THUNK %r14,%r11 .section .kprobes.text, "ax" .Ldummy: /* - * This nop exists only in order to avoid that __switch_to starts at + * This nop exists only in order to avoid that __bpon starts at * the beginning of the kprobes text section. In that case we would * have several symbols at the same address. E.g. objdump would take * an arbitrary symbol name when disassembling this code. - * With the added nop in between the __switch_to symbol is unique + * With the added nop in between the __bpon symbol is unique * again. */ nop 0 @@ -327,10 +214,6 @@ ENTRY(sie64a) stg %r3,__SF_SIE_SAVEAREA(%r15) # save guest register save area xc __SF_SIE_REASON(8,%r15),__SF_SIE_REASON(%r15) # reason code = 0 mvc __SF_SIE_FLAGS(8,%r15),__TI_flags(%r12) # copy thread flags - TSTMSK __LC_CPU_FLAGS,_CIF_FPU # load guest fp/vx registers ? 
- jno .Lsie_load_guest_gprs - brasl %r14,load_fpu_regs # load guest fp/vx regs -.Lsie_load_guest_gprs: lmg %r0,%r13,0(%r3) # load guest gprs 0-13 lg %r14,__LC_GMAP # get gmap pointer ltgr %r14,%r14 @@ -370,7 +253,6 @@ sie_exit: stmg %r0,%r13,0(%r14) # save guest gprs 0-13 xgr %r0,%r0 # clear guest registers to xgr %r1,%r1 # prevent speculative use - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 @@ -397,249 +279,68 @@ EXPORT_SYMBOL(sie_exit) */ ENTRY(system_call) - stpt __LC_SYNC_ENTER_TIMER + stpt __LC_SYS_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_SYNC BPOFF - lg %r12,__LC_CURRENT - lghi %r14,_PIF_SYSCALL + lghi %r14,0 .Lsysc_per: lctlg %c1,%c1,__LC_KERNEL_ASCE - lghi %r13,__TASK_thread + lg %r12,__LC_CURRENT lg %r15,__LC_KERNEL_STACK - la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs - UPDATE_VTIME %r8,%r9,__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - stmg %r0,%r7,__PT_R0(%r11) - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW - mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC - stg %r14,__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - ENABLE_INTS -.Lsysc_do_svc: + stmg %r0,%r7,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP # clear user controlled register to prevent speculative use xgr %r0,%r0 - # load address of system call table - lg %r10,__THREAD_sysc_table(%r13,%r12) - llgh %r8,__PT_INT_CODE+2(%r11) - slag %r8,%r8,3 # shift and test for svc 0 - jnz .Lsysc_nr_ok - # svc 0: system call number in %r1 - llgfr %r1,%r1 # clear high word in r1 - sth %r1,__PT_INT_CODE+2(%r11) - cghi %r1,NR_syscalls - jnl .Lsysc_nr_ok - slag %r8,%r1,3 -.Lsysc_nr_ok: - stg %r2,__PT_ORIG_GPR2(%r11) - stg %r7,STACK_FRAME_OVERHEAD(%r15) - lg %r9,0(%r8,%r10) # get system call add. - TSTMSK __TI_flags(%r12),_TIF_TRACE - jnz .Lsysc_tracesys - BASR_EX %r14,%r9 # call sys_xxxx - stg %r2,__PT_R2(%r11) # store return value - -.Lsysc_return: -#ifdef CONFIG_DEBUG_RSEQ - lgr %r2,%r11 - brasl %r14,rseq_syscall -#endif - LOCKDEP_SYS_EXIT -.Lsysc_tif: - DISABLE_INTS - TSTMSK __PT_FLAGS(%r11),_PIF_WORK - jnz .Lsysc_work - TSTMSK __TI_flags(%r12),_TIF_WORK - jnz .Lsysc_work # check for work - DEBUG_USER_ASCE + xgr %r1,%r1 + xgr %r4,%r4 + xgr %r5,%r5 + xgr %r6,%r6 + xgr %r7,%r7 + xgr %r8,%r8 + xgr %r9,%r9 + xgr %r10,%r10 + xgr %r11,%r11 + la %r2,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs + lgr %r3,%r14 + brasl %r14,__do_syscall lctlg %c1,%c1,__LC_USER_ASCE - BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP - TSTMSK __LC_CPU_FLAGS, _CIF_FPU - jz .Lsysc_skip_fpu - brasl %r14,load_fpu_regs -.Lsysc_skip_fpu: - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) stpt __LC_EXIT_TIMER - lmg %r0,%r15,__PT_R0(%r11) b __LC_RETURN_LPSWE - -# -# One of the work bits is on. Find out which one. 
-# -.Lsysc_work: - ENABLE_INTS - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jo .Lsysc_reschedule - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART - jo .Lsysc_syscall_restart -#ifdef CONFIG_UPROBES - TSTMSK __TI_flags(%r12),_TIF_UPROBE - jo .Lsysc_uprobe_notify -#endif - TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE - jo .Lsysc_guarded_storage - TSTMSK __PT_FLAGS(%r11),_PIF_PER_TRAP - jo .Lsysc_singlestep -#ifdef CONFIG_LIVEPATCH - TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING - jo .Lsysc_patch_pending # handle live patching just before - # signals and possible syscall restart -#endif - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL_RESTART - jo .Lsysc_syscall_restart - TSTMSK __TI_flags(%r12),(_TIF_SIGPENDING|_TIF_NOTIFY_SIGNAL) - jnz .Lsysc_sigpending - TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME - jo .Lsysc_notify_resume - j .Lsysc_return - -# -# _TIF_NEED_RESCHED is set, call schedule -# -.Lsysc_reschedule: - larl %r14,.Lsysc_return - jg schedule - -# -# _TIF_SIGPENDING is set, call do_signal -# -.Lsysc_sigpending: - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_signal - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL - jno .Lsysc_return -.Lsysc_do_syscall: - lghi %r13,__TASK_thread - lmg %r2,%r7,__PT_R2(%r11) # load svc arguments - lghi %r1,0 # svc 0 returns -ENOSYS - j .Lsysc_do_svc - -# -# _TIF_NOTIFY_RESUME is set, call do_notify_resume -# -.Lsysc_notify_resume: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_notify_resume - -# -# _TIF_UPROBE is set, call uprobe_notify_resume -# -#ifdef CONFIG_UPROBES -.Lsysc_uprobe_notify: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg uprobe_notify_resume -#endif - -# -# _TIF_GUARDED_STORAGE is set, call guarded_storage_load -# -.Lsysc_guarded_storage: - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg gs_load_bc_cb -# -# _TIF_PATCH_PENDING is set, call klp_update_patch_state -# -#ifdef CONFIG_LIVEPATCH -.Lsysc_patch_pending: - lg %r2,__LC_CURRENT # pass pointer to task struct - larl %r14,.Lsysc_return - jg klp_update_patch_state -#endif - -# -# _PIF_PER_TRAP is set, call do_per_trap -# -.Lsysc_singlestep: - ni __PT_FLAGS+7(%r11),255-_PIF_PER_TRAP - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_per_trap - -# -# _PIF_SYSCALL_RESTART is set, repeat the current system call -# -.Lsysc_syscall_restart: - ni __PT_FLAGS+7(%r11),255-_PIF_SYSCALL_RESTART - lmg %r1,%r7,__PT_R1(%r11) # load svc arguments - lg %r2,__PT_ORIG_GPR2(%r11) - j .Lsysc_do_svc - -# -# call tracehook_report_syscall_entry/tracehook_report_syscall_exit before -# and after the system call -# -.Lsysc_tracesys: - lgr %r2,%r11 # pass pointer to pt_regs - la %r3,0 - llgh %r0,__PT_INT_CODE+2(%r11) - stg %r0,__PT_R2(%r11) - brasl %r14,do_syscall_trace_enter - lghi %r0,NR_syscalls - clgr %r0,%r2 - jnh .Lsysc_tracenogo - sllg %r8,%r2,3 - lg %r9,0(%r8,%r10) - lmg %r3,%r7,__PT_R3(%r11) - stg %r7,STACK_FRAME_OVERHEAD(%r15) - lg %r2,__PT_ORIG_GPR2(%r11) - BASR_EX %r14,%r9 # call sys_xxx - stg %r2,__PT_R2(%r11) # store return value -.Lsysc_tracenogo: - TSTMSK __TI_flags(%r12),_TIF_TRACE - jz .Lsysc_return - lgr %r2,%r11 # pass pointer to pt_regs - larl %r14,.Lsysc_return - jg do_syscall_trace_exit ENDPROC(system_call) # # a new process exits the kernel with ret_from_fork # ENTRY(ret_from_fork) - la %r11,STACK_FRAME_OVERHEAD(%r15) - lg %r12,__LC_CURRENT - brasl %r14,schedule_tail - tm __PT_PSW+1(%r11),0x01 # forking a kernel thread ? 
- jne .Lsysc_tracenogo - # it's a kernel thread - lmg %r9,%r10,__PT_R9(%r11) # load gprs - la %r2,0(%r10) - BASR_EX %r14,%r9 - j .Lsysc_tracenogo + lgr %r3,%r11 + brasl %r14,__ret_from_fork + lctlg %c1,%c1,__LC_USER_ASCE + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) + stpt __LC_EXIT_TIMER + b __LC_RETURN_LPSWE ENDPROC(ret_from_fork) -ENTRY(kernel_thread_starter) - la %r2,0(%r10) - BASR_EX %r14,%r9 - j .Lsysc_tracenogo -ENDPROC(kernel_thread_starter) - /* * Program check handler routine */ ENTRY(pgm_check_handler) - stpt __LC_SYNC_ENTER_TIMER + stpt __LC_SYS_ENTER_TIMER BPOFF stmg %r8,%r15,__LC_SAVE_AREA_SYNC - lg %r10,__LC_LAST_BREAK - srag %r11,%r10,12 - jnz 0f - /* if __LC_LAST_BREAK is < 4096, it contains one of - * the lpswe addresses in lowcore. Set it to 1 (initial state) - * to prevent leaking that address to userspace. - */ - lghi %r10,1 -0: lg %r12,__LC_CURRENT - lghi %r11,0 + lg %r12,__LC_CURRENT + lghi %r10,0 lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # coming from user space? jno .Lpgm_skip_asce lctlg %c1,%c1,__LC_KERNEL_ASCE - j 3f + j 3f # -> fault in user space .Lpgm_skip_asce: #if IS_ENABLED(CONFIG_KVM) # cleanup critical section for program checks in sie64a @@ -653,7 +354,7 @@ ENTRY(pgm_check_handler) ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit - lghi %r11,_PIF_GUEST_FAULT + lghi %r10,_PIF_GUEST_FAULT #endif 1: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 2f # -> enabled, can't be a double fault @@ -661,82 +362,37 @@ ENTRY(pgm_check_handler) jnz .Lpgm_svcper # -> single stepped svc 2: CHECK_STACK __LC_SAVE_AREA_SYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - # CHECK_VMAP_STACK branches to stack_overflow or 5f - CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,5f -3: UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER - BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP + # CHECK_VMAP_STACK branches to stack_overflow or 4f + CHECK_VMAP_STACK __LC_SAVE_AREA_SYNC,4f +3: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP lg %r15,__LC_KERNEL_STACK - lgr %r14,%r12 - aghi %r14,__TASK_thread # pointer to thread_struct - lghi %r13,__LC_PGM_TDB - tm __LC_PGM_ILC+2,0x02 # check for transaction abort - jz 4f - mvc __THREAD_trap_tdb(256,%r14),0(%r13) -4: stg %r10,__THREAD_last_break(%r14) -5: lgr %r13,%r11 - la %r11,STACK_FRAME_OVERHEAD(%r15) +4: la %r11,STACK_FRAME_OVERHEAD(%r15) + stg %r10,__PT_FLAGS(%r11) + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) stmg %r0,%r7,__PT_R0(%r11) + mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC + stmg %r8,%r9,__PT_PSW(%r11) + # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 xgr %r6,%r6 xgr %r7,%r7 - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC - stmg %r8,%r9,__PT_PSW(%r11) - mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC - mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE - stg %r13,__PT_FLAGS(%r11) - stg %r10,__PT_ARGS(%r11) - tm __LC_PGM_ILC+3,0x80 # check for per exception - jz 6f - tmhh %r8,0x0001 # kernel per event ? 
- jz .Lpgm_kprobe - oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP - mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS - mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE - mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -6: xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - RESTORE_SM_CLEAR_PER - larl %r1,pgm_check_table - llgh %r10,__PT_INT_CODE+2(%r11) - nill %r10,0x007f - sll %r10,3 - je .Lpgm_return - lg %r9,0(%r10,%r1) # load address of handler routine - lgr %r2,%r11 # pass pointer to pt_regs - BASR_EX %r14,%r9 # branch to interrupt-handler -.Lpgm_return: - LOCKDEP_SYS_EXIT - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lpgm_restore - TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL - jo .Lsysc_do_syscall - j .Lsysc_tif -.Lpgm_restore: - DISABLE_INTS - TSTMSK __LC_CPU_FLAGS, _CIF_FPU - jz .Lpgm_skip_fpu - brasl %r14,load_fpu_regs -.Lpgm_skip_fpu: - mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + lgr %r2,%r11 + brasl %r14,__do_pgm_check + tmhh %r8,0x0001 # returning to user space? + jno .Lpgm_exit_kernel + lctlg %c1,%c1,__LC_USER_ASCE + BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER - lmg %r0,%r15,__PT_R0(%r11) +.Lpgm_exit_kernel: + mvc __LC_RETURN_PSW(16),STACK_FRAME_OVERHEAD+__PT_PSW(%r15) + lmg %r0,%r15,STACK_FRAME_OVERHEAD+__PT_R0(%r15) b __LC_RETURN_LPSWE -# -# PER event in supervisor state, must be kprobes -# -.Lpgm_kprobe: - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - RESTORE_SM_CLEAR_PER - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_per_trap - j .Lpgm_return - # # single stepped system call # @@ -744,26 +400,26 @@ ENTRY(pgm_check_handler) mvc __LC_RETURN_PSW(8),__LC_SVC_NEW_PSW larl %r14,.Lsysc_per stg %r14,__LC_RETURN_PSW+8 - lghi %r14,_PIF_SYSCALL | _PIF_PER_TRAP + lghi %r14,1 lpswe __LC_RETURN_PSW # branch to .Lsysc_per ENDPROC(pgm_check_handler) /* - * IO interrupt handler routine + * Interrupt handler macro used for external and IO interrupts. */ -ENTRY(io_int_handler) +.macro INT_HANDLER name,lc_old_psw,handler +ENTRY(\name) STCK __LC_INT_CLOCK - stpt __LC_ASYNC_ENTER_TIMER + stpt __LC_SYS_ENTER_TIMER BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT - lmg %r8,%r9,__LC_IO_OLD_PSW - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER,__LC_INT_CLOCK + lmg %r8,%r9,\lc_old_psw + SWITCH_KERNEL __LC_SAVE_AREA_ASYNC stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 @@ -772,322 +428,48 @@ ENTRY(io_int_handler) xgr %r10,%r10 mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC stmg %r8,%r9,__PT_PSW(%r11) - tm __PT_PSW+1(%r11),0x01 # coming from user space? - jno .Lio_skip_asce + tm %r8,0x0001 # coming from user space? + jno 1f lctlg %c1,%c1,__LC_KERNEL_ASCE -.Lio_skip_asce: - mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - TRACE_IRQS_OFF -.Lio_loop: - lgr %r2,%r11 # pass pointer to pt_regs - lghi %r3,IO_INTERRUPT - tm __PT_INT_CODE+8(%r11),0x80 # adapter interrupt ? - jz .Lio_call - lghi %r3,THIN_INTERRUPT -.Lio_call: - brasl %r14,do_IRQ - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR - jz .Lio_return - tpi 0 - jz .Lio_return - mvc __PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID - j .Lio_loop -.Lio_return: - LOCKDEP_SYS_EXIT - TSTMSK __TI_flags(%r12),_TIF_WORK - jnz .Lio_work # there is work to do (signals etc.) 
- TSTMSK __LC_CPU_FLAGS,_CIF_WORK - jnz .Lio_work -.Lio_restore: - TRACE_IRQS_ON +1: lgr %r2,%r11 # pass pointer to pt_regs + brasl %r14,\handler mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lio_exit_kernel - DEBUG_USER_ASCE + tmhh %r8,0x0001 # returning to user ? + jno 2f lctlg %c1,%c1,__LC_USER_ASCE BPEXIT __TI_flags(%r12),_TIF_ISOLATE_BP stpt __LC_EXIT_TIMER -.Lio_exit_kernel: - lmg %r0,%r15,__PT_R0(%r11) +2: lmg %r0,%r15,__PT_R0(%r11) b __LC_RETURN_LPSWE -.Lio_done: - -# -# There is work todo, find out in which context we have been interrupted: -# 1) if we return to user space we can do all _TIF_WORK work -# 2) if we return to kernel code and kvm is enabled check if we need to -# modify the psw to leave SIE -# 3) if we return to kernel code and preemptive scheduling is enabled check -# the preemption counter and if it is zero call preempt_schedule_irq -# Before any work can be done, a switch to the kernel stack is required. -# -.Lio_work: - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jo .Lio_work_user # yes -> do resched & signal -#ifdef CONFIG_PREEMPTION - # check for preemptive scheduling - icm %r0,15,__LC_PREEMPT_COUNT - jnz .Lio_restore # preemption is disabled - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jno .Lio_restore - # switch to kernel stack - lg %r1,__PT_R15(%r11) - aghi %r1,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - brasl %r14,preempt_schedule_irq - j .Lio_return -#else - j .Lio_restore -#endif - -# -# Need to do work before returning to userspace, switch to kernel stack -# -.Lio_work_user: - lg %r1,__LC_KERNEL_STACK - mvc STACK_FRAME_OVERHEAD(__PT_SIZE,%r1),0(%r11) - xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) - la %r11,STACK_FRAME_OVERHEAD(%r1) - lgr %r15,%r1 - -# -# One of the work bits is on. Find out which one. -# - TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED - jo .Lio_reschedule -#ifdef CONFIG_LIVEPATCH - TSTMSK __TI_flags(%r12),_TIF_PATCH_PENDING - jo .Lio_patch_pending -#endif - TSTMSK __TI_flags(%r12),(_TIF_SIGPENDING|_TIF_NOTIFY_SIGNAL) - jnz .Lio_sigpending - TSTMSK __TI_flags(%r12),_TIF_NOTIFY_RESUME - jo .Lio_notify_resume - TSTMSK __TI_flags(%r12),_TIF_GUARDED_STORAGE - jo .Lio_guarded_storage - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lio_vxrs - j .Lio_return - -# -# CIF_FPU is set, restore floating-point controls and floating-point registers. 
-# -.Lio_vxrs: - larl %r14,.Lio_return - jg load_fpu_regs - -# -# _TIF_GUARDED_STORAGE is set, call guarded_storage_load -# -.Lio_guarded_storage: - ENABLE_INTS_TRACE - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,gs_load_bc_cb - DISABLE_INTS_TRACE - j .Lio_return +ENDPROC(\name) +.endm -# -# _TIF_NEED_RESCHED is set, call schedule -# -.Lio_reschedule: - ENABLE_INTS_TRACE - brasl %r14,schedule # call scheduler - DISABLE_INTS_TRACE - j .Lio_return - -# -# _TIF_PATCH_PENDING is set, call klp_update_patch_state -# -#ifdef CONFIG_LIVEPATCH -.Lio_patch_pending: - lg %r2,__LC_CURRENT # pass pointer to task struct - larl %r14,.Lio_return - jg klp_update_patch_state -#endif - -# -# _TIF_SIGPENDING or is set, call do_signal -# -.Lio_sigpending: - ENABLE_INTS_TRACE - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_signal - DISABLE_INTS_TRACE - j .Lio_return - -# -# _TIF_NOTIFY_RESUME or is set, call do_notify_resume -# -.Lio_notify_resume: - ENABLE_INTS_TRACE - lgr %r2,%r11 # pass pointer to pt_regs - brasl %r14,do_notify_resume - DISABLE_INTS_TRACE - j .Lio_return -ENDPROC(io_int_handler) - -/* - * External interrupt handler routine - */ -ENTRY(ext_int_handler) - STCK __LC_INT_CLOCK - stpt __LC_ASYNC_ENTER_TIMER - BPOFF - stmg %r8,%r15,__LC_SAVE_AREA_ASYNC - lg %r12,__LC_CURRENT - lmg %r8,%r9,__LC_EXT_OLD_PSW - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER,__LC_INT_CLOCK - stmg %r0,%r7,__PT_R0(%r11) - # clear user controlled registers to prevent speculative use - xgr %r0,%r0 - xgr %r1,%r1 - xgr %r2,%r2 - xgr %r3,%r3 - xgr %r4,%r4 - xgr %r5,%r5 - xgr %r6,%r6 - xgr %r7,%r7 - xgr %r10,%r10 - mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC - stmg %r8,%r9,__PT_PSW(%r11) - tm __PT_PSW+1(%r11),0x01 # coming from user space? - jno .Lext_skip_asce - lctlg %c1,%c1,__LC_KERNEL_ASCE -.Lext_skip_asce: - lghi %r1,__LC_EXT_PARAMS2 - mvc __PT_INT_CODE(4,%r11),__LC_EXT_CPU_ADDR - mvc __PT_INT_PARM(4,%r11),__LC_EXT_PARAMS - mvc __PT_INT_PARM_LONG(8,%r11),0(%r1) - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) - TRACE_IRQS_OFF - lgr %r2,%r11 # pass pointer to pt_regs - lghi %r3,EXT_INTERRUPT - brasl %r14,do_IRQ - j .Lio_return -ENDPROC(ext_int_handler) +INT_HANDLER ext_int_handler,__LC_EXT_OLD_PSW,do_ext_irq +INT_HANDLER io_int_handler,__LC_IO_OLD_PSW,do_io_irq /* * Load idle PSW. */ ENTRY(psw_idle) stg %r3,__SF_EMPTY(%r15) - larl %r1,.Lpsw_idle_exit + larl %r1,psw_idle_exit stg %r1,__SF_EMPTY+8(%r15) larl %r1,smp_cpu_mtid llgf %r1,0(%r1) ltgr %r1,%r1 jz .Lpsw_idle_stcctm - .insn rsy,0xeb0000000017,%r1,5,__SF_EMPTY+16(%r15) + .insn rsy,0xeb0000000017,%r1,5,__MT_CYCLES_ENTER(%r2) .Lpsw_idle_stcctm: oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT BPON STCK __CLOCK_IDLE_ENTER(%r2) stpt __TIMER_IDLE_ENTER(%r2) lpswe __SF_EMPTY(%r15) -.Lpsw_idle_exit: +.globl psw_idle_exit +psw_idle_exit: BR_EX %r14 ENDPROC(psw_idle) -/* - * Store floating-point controls and floating-point or vector register - * depending whether the vector facility is available. A critical section - * cleanup assures that the registers are stored even if interrupted for - * some other work. The CIF_FPU flag is set to trigger a lazy restore - * of the register contents at return from io or a system call. 
- */ -ENTRY(save_fpu_regs) - stnsm __SF_EMPTY(%r15),0xfc - lg %r2,__LC_CURRENT - aghi %r2,__TASK_thread - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jo .Lsave_fpu_regs_exit - stfpc __THREAD_FPU_fpc(%r2) - lg %r3,__THREAD_FPU_regs(%r2) - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jz .Lsave_fpu_regs_fp # no -> store FP regs - VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) - VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) - j .Lsave_fpu_regs_done # -> set CIF_FPU flag -.Lsave_fpu_regs_fp: - std 0,0(%r3) - std 1,8(%r3) - std 2,16(%r3) - std 3,24(%r3) - std 4,32(%r3) - std 5,40(%r3) - std 6,48(%r3) - std 7,56(%r3) - std 8,64(%r3) - std 9,72(%r3) - std 10,80(%r3) - std 11,88(%r3) - std 12,96(%r3) - std 13,104(%r3) - std 14,112(%r3) - std 15,120(%r3) -.Lsave_fpu_regs_done: - oi __LC_CPU_FLAGS+7,_CIF_FPU -.Lsave_fpu_regs_exit: - ssm __SF_EMPTY(%r15) - BR_EX %r14 -.Lsave_fpu_regs_end: -ENDPROC(save_fpu_regs) -EXPORT_SYMBOL(save_fpu_regs) - -/* - * Load floating-point controls and floating-point or vector registers. - * A critical section cleanup assures that the register contents are - * loaded even if interrupted for some other work. - * - * There are special calling conventions to fit into sysc and io return work: - * %r15: - * The function requires: - * %r4 - */ -load_fpu_regs: - stnsm __SF_EMPTY(%r15),0xfc - lg %r4,__LC_CURRENT - aghi %r4,__TASK_thread - TSTMSK __LC_CPU_FLAGS,_CIF_FPU - jno .Lload_fpu_regs_exit - lfpc __THREAD_FPU_fpc(%r4) - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area - jz .Lload_fpu_regs_fp # -> no VX, load FP regs - VLM %v0,%v15,0,%r4 - VLM %v16,%v31,256,%r4 - j .Lload_fpu_regs_done -.Lload_fpu_regs_fp: - ld 0,0(%r4) - ld 1,8(%r4) - ld 2,16(%r4) - ld 3,24(%r4) - ld 4,32(%r4) - ld 5,40(%r4) - ld 6,48(%r4) - ld 7,56(%r4) - ld 8,64(%r4) - ld 9,72(%r4) - ld 10,80(%r4) - ld 11,88(%r4) - ld 12,96(%r4) - ld 13,104(%r4) - ld 14,112(%r4) - ld 15,120(%r4) -.Lload_fpu_regs_done: - ni __LC_CPU_FLAGS+7,255-_CIF_FPU -.Lload_fpu_regs_exit: - ssm __SF_EMPTY(%r15) - BR_EX %r14 -.Lload_fpu_regs_end: -ENDPROC(load_fpu_regs) - /* * Machine check handler routines */ @@ -1146,11 +528,8 @@ ENTRY(mcck_int_handler) mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID jo 3f - la %r14,__LC_SYNC_ENTER_TIMER - clc 0(8,%r14),__LC_ASYNC_ENTER_TIMER - jl 0f - la %r14,__LC_ASYNC_ENTER_TIMER -0: clc 0(8,%r14),__LC_EXIT_TIMER + la %r14,__LC_SYS_ENTER_TIMER + clc 0(8,%r14),__LC_EXIT_TIMER jl 1f la %r14,__LC_EXIT_TIMER 1: clc 0(8,%r14),__LC_LAST_UPDATE_TIMER @@ -1165,14 +544,13 @@ ENTRY(mcck_int_handler) TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic 4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off - SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER,__LC_MCCK_CLOCK + SWITCH_KERNEL __LC_GPREGS_SAVE_AREA+64 .Lmcck_skip: lghi %r14,__LC_GPREGS_SAVE_AREA+64 stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 xgr %r1,%r1 - xgr %r2,%r2 xgr %r3,%r3 xgr %r4,%r4 xgr %r5,%r5 @@ -1183,7 +561,6 @@ ENTRY(mcck_int_handler) stmg %r8,%r9,__PT_PSW(%r11) la %r14,4095 mvc __PT_CR1(8,%r11),__LC_CREGS_SAVE_AREA-4095+8(%r14) - lctlg %c1,%c1,__LC_KERNEL_ASCE xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs @@ -1195,9 +572,7 @@ ENTRY(mcck_int_handler) xc __SF_BACKCHAIN(8,%r1),__SF_BACKCHAIN(%r1) la %r11,STACK_FRAME_OVERHEAD(%r1) lgr %r15,%r1 - TRACE_IRQS_OFF brasl %r14,s390_handle_mcck - TRACE_IRQS_ON .Lmcck_return: lctlg 
%c1,%c1,__PT_CR1(%r11) lmg %r0,%r10,__PT_R0(%r11) diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index a16c33b32ab0..3d0c0ac5c20e 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -17,8 +17,9 @@ void io_int_handler(void); void mcck_int_handler(void); void restart_int_handler(void); -asmlinkage long do_syscall_trace_enter(struct pt_regs *regs); -asmlinkage void do_syscall_trace_exit(struct pt_regs *regs); +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs); +void __do_pgm_check(struct pt_regs *regs); +void __do_syscall(struct pt_regs *regs, int per_trap); void do_protection_exception(struct pt_regs *regs); void do_dat_exception(struct pt_regs *regs); @@ -48,9 +49,7 @@ void translation_exception(struct pt_regs *regs); void vector_exception(struct pt_regs *regs); void monitor_event_exception(struct pt_regs *regs); -void do_per_trap(struct pt_regs *regs); void do_report_trap(struct pt_regs *regs, int si_signo, int si_code, char *str); -void syscall_trace(struct pt_regs *regs, int entryexit); void kernel_stack_overflow(struct pt_regs * regs); void do_signal(struct pt_regs *regs); void handle_signal32(struct ksignal *ksig, sigset_t *oldset, @@ -58,7 +57,8 @@ void handle_signal32(struct ksignal *ksig, sigset_t *oldset, void do_notify_resume(struct pt_regs *regs); void __init init_IRQ(void); -void do_IRQ(struct pt_regs *regs, int irq); +void do_io_irq(struct pt_regs *regs); +void do_ext_irq(struct pt_regs *regs); void do_restart(void); void __init startup_init(void); void die(struct pt_regs *regs, const char *str); @@ -82,8 +82,6 @@ long sys_s390_sthyi(unsigned long function_code, void __user *buffer, u64 __user DECLARE_PER_CPU(u64, mt_cycles[8]); -void gs_load_bc_cb(struct pt_regs *regs); - unsigned long stack_alloc(void); void stack_free(unsigned long stack); diff --git a/arch/s390/kernel/fpu.c b/arch/s390/kernel/fpu.c index 0da378e2eb25..d864c9a325e2 100644 --- a/arch/s390/kernel/fpu.c +++ b/arch/s390/kernel/fpu.c @@ -175,3 +175,91 @@ void __kernel_fpu_end(struct kernel_fpu *state, u32 flags) : "1", "cc"); } EXPORT_SYMBOL(__kernel_fpu_end); + +void __load_fpu_regs(void) +{ + struct fpu *state = &current->thread.fpu; + unsigned long *regs = current->thread.fpu.regs; + + asm volatile("lfpc %0" : : "Q" (state->fpc)); + if (likely(MACHINE_HAS_VX)) { + asm volatile("lgr 1,%0\n" + "VLM 0,15,0,1\n" + "VLM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("ld 0,%0" : : "Q" (regs[0])); + asm volatile("ld 1,%0" : : "Q" (regs[1])); + asm volatile("ld 2,%0" : : "Q" (regs[2])); + asm volatile("ld 3,%0" : : "Q" (regs[3])); + asm volatile("ld 4,%0" : : "Q" (regs[4])); + asm volatile("ld 5,%0" : : "Q" (regs[5])); + asm volatile("ld 6,%0" : : "Q" (regs[6])); + asm volatile("ld 7,%0" : : "Q" (regs[7])); + asm volatile("ld 8,%0" : : "Q" (regs[8])); + asm volatile("ld 9,%0" : : "Q" (regs[9])); + asm volatile("ld 10,%0" : : "Q" (regs[10])); + asm volatile("ld 11,%0" : : "Q" (regs[11])); + asm volatile("ld 12,%0" : : "Q" (regs[12])); + asm volatile("ld 13,%0" : : "Q" (regs[13])); + asm volatile("ld 14,%0" : : "Q" (regs[14])); + asm volatile("ld 15,%0" : : "Q" (regs[15])); + } + clear_cpu_flag(CIF_FPU); +} +EXPORT_SYMBOL(__load_fpu_regs); + +void load_fpu_regs(void) +{ + raw_local_irq_disable(); + __load_fpu_regs(); + raw_local_irq_enable(); +} +EXPORT_SYMBOL(load_fpu_regs); + +void save_fpu_regs(void) +{ + unsigned long flags, *regs; + struct fpu *state; + + local_irq_save(flags); + + if (test_cpu_flag(CIF_FPU)) + goto out; + +
state = &current->thread.fpu; + regs = current->thread.fpu.regs; + + asm volatile("stfpc %0" : "=Q" (state->fpc)); + if (likely(MACHINE_HAS_VX)) { + asm volatile("lgr 1,%0\n" + "VSTM 0,15,0,1\n" + "VSTM 16,31,256,1\n" + : + : "d" (regs) + : "1", "cc", "memory"); + } else { + asm volatile("std 0,%0" : "=Q" (regs[0])); + asm volatile("std 1,%0" : "=Q" (regs[1])); + asm volatile("std 2,%0" : "=Q" (regs[2])); + asm volatile("std 3,%0" : "=Q" (regs[3])); + asm volatile("std 4,%0" : "=Q" (regs[4])); + asm volatile("std 5,%0" : "=Q" (regs[5])); + asm volatile("std 6,%0" : "=Q" (regs[6])); + asm volatile("std 7,%0" : "=Q" (regs[7])); + asm volatile("std 8,%0" : "=Q" (regs[8])); + asm volatile("std 9,%0" : "=Q" (regs[9])); + asm volatile("std 10,%0" : "=Q" (regs[10])); + asm volatile("std 11,%0" : "=Q" (regs[11])); + asm volatile("std 12,%0" : "=Q" (regs[12])); + asm volatile("std 13,%0" : "=Q" (regs[13])); + asm volatile("std 14,%0" : "=Q" (regs[14])); + asm volatile("std 15,%0" : "=Q" (regs[15])); + } + set_cpu_flag(CIF_FPU); +out: + local_irq_restore(flags); +} +EXPORT_SYMBOL(save_fpu_regs); diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index a5d4d80d6ede..812073ea073e 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -14,12 +14,36 @@ #include #include #include +#include #include #include #include "entry.h" static DEFINE_PER_CPU(struct s390_idle_data, s390_idle); +void account_idle_time_irq(void) +{ + struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); + u64 cycles_new[8]; + int i; + + clear_cpu_flag(CIF_ENABLED_WAIT); + if (smp_cpu_mtid) { + stcctm(MT_DIAG, smp_cpu_mtid, cycles_new); + for (i = 0; i < smp_cpu_mtid; i++) + this_cpu_add(mt_cycles[i], cycles_new[i] - idle->mt_cycles_enter[i]); + } + + idle->clock_idle_exit = S390_lowcore.int_clock; + idle->timer_idle_exit = S390_lowcore.sys_enter_timer; + + S390_lowcore.steal_timer += idle->clock_idle_enter - S390_lowcore.last_update_clock; + S390_lowcore.last_update_clock = idle->clock_idle_exit; + + S390_lowcore.system_timer += S390_lowcore.last_update_timer - idle->timer_idle_enter; + S390_lowcore.last_update_timer = idle->timer_idle_exit; +} + void arch_cpu_idle(void) { struct s390_idle_data *idle = this_cpu_ptr(&s390_idle); diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index f8a8b9428ae2..c6d40bcf4a68 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -95,19 +96,97 @@ static const struct irq_class irqclass_sub_desc[] = { {.irq = CPU_RST, .name = "RST", .desc = "[CPU] CPU Restart"}, }; -void do_IRQ(struct pt_regs *regs, int irq) +static void do_IRQ(struct pt_regs *regs, int irq) { - struct pt_regs *old_regs; - - old_regs = set_irq_regs(regs); - irq_enter(); if (tod_after_eq(S390_lowcore.int_clock, S390_lowcore.clock_comparator)) /* Serve timer interrupts first.
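The restore side of this lazy FPU scheme is not in this hunk: the generic entry code invokes an arch hook on the way back to user space, and in this patch that hook (arch_exit_to_user_mode() in asm/entry-common.h, a file outside the arch/s390/kernel subset shown here) does roughly the following; treat the exact body as an assumption:

	static __always_inline void arch_exit_to_user_mode(void)
	{
		/* lazily reload what save_fpu_regs() put aside */
		if (test_cpu_flag(CIF_FPU))
			__load_fpu_regs();

		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
			debug_user_asce(1);
	}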
*/ clock_comparator_work(); generic_handle_irq(irq); +} + +static int on_async_stack(void) +{ + unsigned long frame = current_frame_address(); + + return !!!((S390_lowcore.async_stack - frame) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)); +} + +static void do_irq_async(struct pt_regs *regs, int irq) +{ + if (on_async_stack()) + do_IRQ(regs, irq); + else + CALL_ON_STACK(do_IRQ, S390_lowcore.async_stack, 2, regs, irq); +} + +static int irq_pending(struct pt_regs *regs) +{ + int cc; + + asm volatile("tpi 0\n" + "ipm %0" : "=d" (cc) : : "cc"); + return cc >> 28; +} + +void noinstr do_io_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + int from_idle; + + irq_enter(); + + if (user_mode(regs)) + update_timer_sys(); + + from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + if (from_idle) + account_idle_time_irq(); + + do { + memcpy(&regs->int_code, &S390_lowcore.subchannel_id, 12); + if (S390_lowcore.io_int_word & BIT(31)) + do_irq_async(regs, THIN_INTERRUPT); + else + do_irq_async(regs, IO_INTERRUPT); + } while (MACHINE_IS_LPAR && irq_pending(regs)); + + irq_exit(); + set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); +} + +void noinstr do_ext_irq(struct pt_regs *regs) +{ + irqentry_state_t state = irqentry_enter(regs); + struct pt_regs *old_regs = set_irq_regs(regs); + int from_idle; + + irq_enter(); + + if (user_mode(regs)) + update_timer_sys(); + + memcpy(&regs->int_code, &S390_lowcore.ext_cpu_addr, 4); + regs->int_parm = S390_lowcore.ext_params; + regs->int_parm_long = *(unsigned long *)S390_lowcore.ext_params2; + + from_idle = !user_mode(regs) && regs->psw.addr == (unsigned long)psw_idle_exit; + if (from_idle) + account_idle_time_irq(); + + do_irq_async(regs, EXT_INTERRUPT); + irq_exit(); set_irq_regs(old_regs); + irqentry_exit(regs, state); + + if (from_idle) + regs->psw.mask &= ~(PSW_MASK_EXT | PSW_MASK_IO | PSW_MASK_WAIT); } static void show_msi_interrupt(struct seq_file *p, int irq) diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 86c8d5370e7f..11f8c296f60d 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -131,12 +131,11 @@ static notrace void s390_handle_damage(void) NOKPROBE_SYMBOL(s390_handle_damage); /* - * Main machine check handler function. Will be called with interrupts enabled - * or disabled and machine checks enabled or disabled. + * Main machine check handler function. Will be called with interrupts disabled + * and machine checks enabled. */ -void s390_handle_mcck(void) +void __s390_handle_mcck(void) { - unsigned long flags; struct mcck_struct mcck; /* @@ -144,12 +143,10 @@ void s390_handle_mcck(void) * machine checks. Afterwards delete the old state and enable machine * checks again.
*/ - local_irq_save(flags); local_mcck_disable(); mcck = *this_cpu_ptr(&cpu_mcck); memset(this_cpu_ptr(&cpu_mcck), 0, sizeof(mcck)); local_mcck_enable(); - local_irq_restore(flags); if (mcck.channel_report) crw_handle_channel_report(); @@ -181,8 +178,13 @@ void s390_handle_mcck(void) do_exit(SIGSEGV); } } -EXPORT_SYMBOL_GPL(s390_handle_mcck); +void noinstr s390_handle_mcck(void) +{ + trace_hardirqs_off(); + __s390_handle_mcck(); + trace_hardirqs_on(); +} /* * returns 0 if all required registers are available * returns 1 otherwise @@ -344,6 +346,9 @@ int notrace s390_do_machine_check(struct pt_regs *regs) int mcck_pending = 0; nmi_enter(); + + if (user_mode(regs)) + update_timer_mcck(); inc_irq_stat(NMI_NMI); mci.val = S390_lowcore.mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index bc3ca54edfb4..367bd000f6d1 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -43,9 +44,22 @@ #include #include "entry.h" -asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); +void ret_from_fork(void) asm("ret_from_fork"); -extern void kernel_thread_starter(void); +void __ret_from_fork(struct task_struct *prev, struct pt_regs *regs) +{ + void (*func)(void *arg); + + schedule_tail(prev); + + if (!user_mode(regs)) { + /* Kernel thread */ + func = (void *)regs->gprs[9]; + func((void *)regs->gprs[10]); + } + clear_pt_regs_flag(regs, PIF_SYSCALL); + syscall_exit_to_user_mode(regs); +} void flush_thread(void) { @@ -108,10 +122,12 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, p->thread.last_break = 1; frame->sf.back_chain = 0; + frame->sf.gprs[5] = (unsigned long)frame + sizeof(struct stack_frame); + frame->sf.gprs[6] = (unsigned long)p; /* new return point is ret_from_fork */ - frame->sf.gprs[8] = (unsigned long) ret_from_fork; + frame->sf.gprs[8] = (unsigned long)ret_from_fork; /* fake return stack for resume(), don't go back to schedule */ - frame->sf.gprs[9] = (unsigned long) frame; + frame->sf.gprs[9] = (unsigned long)frame; /* Store access registers to kernel stack of new process. 
*/ if (unlikely(p->flags & PF_KTHREAD)) { @@ -120,10 +136,10 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; frame->childregs.psw.addr = - (unsigned long) kernel_thread_starter; + (unsigned long)__ret_from_fork; frame->childregs.gprs[9] = new_stackp; /* function */ frame->childregs.gprs[10] = arg; - frame->childregs.gprs[11] = (unsigned long) do_exit; + frame->childregs.gprs[11] = (unsigned long)do_exit; frame->childregs.orig_gpr2 = -1; return 0; @@ -153,7 +169,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, return 0; } -asmlinkage void execve_tail(void) +void execve_tail(void) { current->thread.fpu.fpc = 0; asm volatile("sfpc %0" : : "d" (0)); diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index a76dd27fb2e8..18b3416fd663 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -7,6 +7,7 @@ * Martin Schwidefsky (schwidefsky@de.ibm.com) */ +#include "asm/ptrace.h" #include #include #include @@ -37,9 +38,6 @@ #include "compat_ptrace.h" #endif -#define CREATE_TRACE_POINTS -#include - void update_cr_regs(struct task_struct *task) { struct pt_regs *regs = task_pt_regs(task); @@ -140,7 +138,7 @@ void ptrace_disable(struct task_struct *task) memset(&task->thread.per_user, 0, sizeof(task->thread.per_user)); memset(&task->thread.per_event, 0, sizeof(task->thread.per_event)); clear_tsk_thread_flag(task, TIF_SINGLE_STEP); - clear_pt_regs_flag(task_pt_regs(task), PIF_PER_TRAP); + clear_tsk_thread_flag(task, TIF_PER_TRAP); task->thread.per_flags = 0; } @@ -322,25 +320,6 @@ static inline void __poke_user_per(struct task_struct *child, child->thread.per_user.end = data; } -static void fixup_int_code(struct task_struct *child, addr_t data) -{ - struct pt_regs *regs = task_pt_regs(child); - int ilc = regs->int_code >> 16; - u16 insn; - - if (ilc > 6) - return; - - if (ptrace_access_vm(child, regs->psw.addr - (regs->int_code >> 16), - &insn, sizeof(insn), FOLL_FORCE) != sizeof(insn)) - return; - - /* double check that tracee stopped on svc instruction */ - if ((insn >> 8) != 0xa) - return; - - regs->int_code = 0x20000 | (data & 0xffff); -} /* * Write a word to the user area of a process at location addr. This * operation does have an additional problem compared to peek_user. 
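For illustration only (not part of the patch): with the register layout prepared by copy_thread() above, a kernel thread ends up in __ret_from_fork(), which sees a kernel-mode PSW and calls the thread function directly. A hypothetical creation site:

	static int my_thread_fn(void *arg)	/* hypothetical example function */
	{
		pr_info("kernel thread started, arg=%p\n", arg);
		return 0;
	}

	/*
	 * copy_thread() stores my_thread_fn in childregs.gprs[9] and the
	 * argument in gprs[10]; when the child runs, __ret_from_fork()
	 * finds !user_mode(regs) and invokes my_thread_fn(arg) instead of
	 * returning to user space.
	 */
	pid_t pid = kernel_thread(my_thread_fn, NULL, CLONE_FS | CLONE_FILES);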
@@ -374,10 +353,12 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) } if (test_pt_regs_flag(regs, PIF_SYSCALL) && - addr == offsetof(struct user, regs.gprs[2])) - fixup_int_code(child, data); - *(addr_t *)((addr_t) &regs->psw + addr) = data; + addr == offsetof(struct user, regs.gprs[2])) { + struct pt_regs *regs = task_pt_regs(child); + regs->int_code = 0x20000 | (data & 0xffff); + } + *(addr_t *)((addr_t) &regs->psw + addr) = data; } else if (addr < (addr_t) (&dummy->regs.orig_gpr2)) { /* * access registers are stored in the thread structure @@ -742,10 +723,12 @@ static int __poke_user_compat(struct task_struct *child, regs->psw.mask = (regs->psw.mask & ~PSW_MASK_BA) | (__u64)(tmp & PSW32_ADDR_AMODE); } else { - if (test_pt_regs_flag(regs, PIF_SYSCALL) && - addr == offsetof(struct compat_user, regs.gprs[2])) - fixup_int_code(child, data); + addr == offsetof(struct compat_user, regs.gprs[2])) { + struct pt_regs *regs = task_pt_regs(child); + + regs->int_code = 0x20000 | (data & 0xffff); + } /* gpr 0-15 */ *(__u32*)((addr_t) &regs->psw + addr*2 + 4) = tmp; } @@ -862,82 +845,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } #endif -asmlinkage long do_syscall_trace_enter(struct pt_regs *regs) -{ - unsigned long mask = -1UL; - long ret = -1; - - if (is_compat_task()) - mask = 0xffffffff; - - /* - * The sysc_tracesys code in entry.S stored the system - * call number to gprs[2]. - */ - if (test_thread_flag(TIF_SYSCALL_TRACE) && - tracehook_report_syscall_entry(regs)) { - /* - * Tracing decided this syscall should not happen. Skip - * the system call and the system call restart handling. - */ - goto skip; - } - -#ifdef CONFIG_SECCOMP - /* Do the secure computing check after ptrace. */ - if (unlikely(test_thread_flag(TIF_SECCOMP))) { - struct seccomp_data sd; - - if (is_compat_task()) { - sd.instruction_pointer = regs->psw.addr & 0x7fffffff; - sd.arch = AUDIT_ARCH_S390; - } else { - sd.instruction_pointer = regs->psw.addr; - sd.arch = AUDIT_ARCH_S390X; - } - - sd.nr = regs->int_code & 0xffff; - sd.args[0] = regs->orig_gpr2 & mask; - sd.args[1] = regs->gprs[3] & mask; - sd.args[2] = regs->gprs[4] & mask; - sd.args[3] = regs->gprs[5] & mask; - sd.args[4] = regs->gprs[6] & mask; - sd.args[5] = regs->gprs[7] & mask; - - if (__secure_computing(&sd) == -1) - goto skip; - } -#endif /* CONFIG_SECCOMP */ - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->int_code & 0xffff); - - - audit_syscall_entry(regs->int_code & 0xffff, regs->orig_gpr2 & mask, - regs->gprs[3] &mask, regs->gprs[4] &mask, - regs->gprs[5] &mask); - - if ((signed long)regs->gprs[2] >= NR_syscalls) { - regs->gprs[2] = -ENOSYS; - ret = -ENOSYS; - } - return regs->gprs[2]; -skip: - clear_pt_regs_flag(regs, PIF_SYSCALL); - return ret; -} - -asmlinkage void do_syscall_trace_exit(struct pt_regs *regs) -{ - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->gprs[2]); - - if (test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, 0); -} - /* * user_regset definitions.
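The do_syscall_trace_enter()/do_syscall_trace_exit() pair removed above is subsumed by the generic entry code. A condensed sketch of the replacement, assuming the shape of syscall_trace_enter() in kernel/entry/common.c of this era (flag names changed in later kernels):

	static long syscall_trace_enter(struct pt_regs *regs, long syscall,
					unsigned long ti_work)
	{
		long ret = 0;

		/* ptrace first, exactly as the removed s390 code did */
		if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) {
			ret = arch_syscall_enter_tracehook(regs);
			if (ret || (ti_work & _TIF_SYSCALL_EMU))
				return -1L;	/* skip syscall and restart handling */
		}

		/* do the secure computing check after ptrace */
		if (ti_work & _TIF_SECCOMP) {
			ret = __secure_computing(NULL);
			if (ret == -1L)
				return ret;
		}

		/* either of the above may have changed the syscall number */
		syscall = syscall_get_nr(current, regs);

		if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
			trace_sys_enter(regs, syscall);

		syscall_enter_audit(regs, syscall);

		return ret ? : syscall;
	}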
*/ diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 1fbed91c73bc..c7feda84edbb 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -411,8 +411,7 @@ static void __init setup_lowcore_dat_off(void) memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, sizeof(lc->alt_stfle_fac_list)); nmi_alloc_boot_cpu(lc); - lc->sync_enter_timer = S390_lowcore.sync_enter_timer; - lc->async_enter_timer = S390_lowcore.async_enter_timer; + lc->sys_enter_timer = S390_lowcore.sys_enter_timer; lc->exit_timer = S390_lowcore.exit_timer; lc->user_timer = S390_lowcore.user_timer; lc->system_timer = S390_lowcore.system_timer; diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index b27b6c1f058d..fce1b2a28a40 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -170,6 +170,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) fpregs_load(&user_sregs.fpregs, &current->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ + clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } @@ -459,7 +460,8 @@ static void handle_signal(struct ksignal *ksig, sigset_t *oldset, * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -void do_signal(struct pt_regs *regs) + +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { struct ksignal ksig; sigset_t *oldset = sigmask_to_save(); @@ -472,7 +474,7 @@ void do_signal(struct pt_regs *regs) current->thread.system_call = test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; - if (test_thread_flag(TIF_SIGPENDING) && get_signal(&ksig)) { + if (has_signal && get_signal(&ksig)) { /* Whee! Actually deliver the signal. */ if (current->thread.system_call) { regs->int_code = current->thread.system_call; @@ -498,6 +500,7 @@ void do_signal(struct pt_regs *regs) } /* No longer in a system call */ clear_pt_regs_flag(regs, PIF_SYSCALL); + clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); rseq_signal_deliver(&ksig, regs); if (is_compat_task()) handle_signal32(&ksig, oldset, regs); @@ -508,6 +511,7 @@ void do_signal(struct pt_regs *regs) /* No handlers present - check for system call restart */ clear_pt_regs_flag(regs, PIF_SYSCALL); + clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); if (current->thread.system_call) { regs->int_code = current->thread.system_call; switch (regs->gprs[2]) { @@ -520,9 +524,9 @@ void do_signal(struct pt_regs *regs) case -ERESTARTNOINTR: /* Restart system call with magic TIF bit. */ regs->gprs[2] = regs->orig_gpr2; - set_pt_regs_flag(regs, PIF_SYSCALL); + set_pt_regs_flag(regs, PIF_SYSCALL_RESTART); if (test_thread_flag(TIF_SINGLE_STEP)) - clear_pt_regs_flag(regs, PIF_PER_TRAP); + clear_thread_flag(TIF_PER_TRAP); break; } } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 27c763014114..c5abbb94ac6e 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -499,7 +499,7 @@ static void smp_handle_ext_call(void) if (test_bit(ec_call_function_single, &bits)) generic_smp_call_function_single_interrupt(); if (test_bit(ec_mcck_pending, &bits)) - s390_handle_mcck(); + __s390_handle_mcck(); } static void do_ext_call_interrupt(struct ext_code ext_code, diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c deleted file mode 100644 index 202fa73ac167..000000000000 --- a/arch/s390/kernel/sys_s390.c +++ /dev/null @@ -1,102 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * S390 version - * Copyright IBM Corp.
1999, 2000 - * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), - * Thomas Spatzier (tspat@de.ibm.com) - * - * Derived from "arch/i386/kernel/sys_i386.c" - * - * This file contains various random system calls that - * have a non-standard calling sequence on the Linux/s390 - * platform. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "entry.h" - -/* - * Perform the mmap() system call. Linux for S/390 isn't able to handle more - * than 5 system call parameters, so this system call uses a memory block - * for parameter passing. - */ - -struct s390_mmap_arg_struct { - unsigned long addr; - unsigned long len; - unsigned long prot; - unsigned long flags; - unsigned long fd; - unsigned long offset; -}; - -SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) -{ - struct s390_mmap_arg_struct a; - int error = -EFAULT; - - if (copy_from_user(&a, arg, sizeof(a))) - goto out; - error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); -out: - return error; -} - -#ifdef CONFIG_SYSVIPC -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls. - */ -SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, - unsigned long, third, void __user *, ptr) -{ - if (call >> 16) - return -EINVAL; - /* The s390 sys_ipc variant has only five parameters instead of six - * like the generic variant. The only difference is the handling of - * the SEMTIMEDOP subcall where on s390 the third parameter is used - * as a pointer to a struct timespec where the generic variant uses - * the fifth parameter. - * Therefore we can call the generic variant by simply passing the - * third parameter also as fifth parameter. - */ - return ksys_ipc(call, first, second, third, ptr, third); -} -#endif /* CONFIG_SYSVIPC */ - -SYSCALL_DEFINE1(s390_personality, unsigned int, personality) -{ - unsigned int ret = current->personality; - - if (personality(current->personality) == PER_LINUX32 && - personality(personality) == PER_LINUX) - personality |= PER_LINUX32; - - if (personality != 0xffffffff) - set_personality(personality); - - if (personality(ret) == PER_LINUX32) - ret &= ~PER_LINUX32; - - return ret; -} - -SYSCALL_DEFINE0(ni_syscall) -{ - return -ENOSYS; -} diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c new file mode 100644 index 000000000000..25c0fb19b0a5 --- /dev/null +++ b/arch/s390/kernel/syscall.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * S390 version + * Copyright IBM Corp. 1999, 2000 + * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com), + * Thomas Spatzier (tspat@de.ibm.com) + * + * Derived from "arch/i386/kernel/sys_i386.c" + * + * This file contains various random system calls that + * have a non-standard calling sequence on the Linux/s390 + * platform. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "entry.h" + +/* + * Perform the mmap() system call. Linux for S/390 isn't able to handle more + * than 5 system call parameters, so this system call uses a memory block + * for parameter passing. 
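A hypothetical userspace caller (illustration only, assuming syscall 90, mmap, is wired to this handler on s390x) has to marshal the six mmap arguments into the block itself:

	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	/* same layout as the kernel's s390_mmap_arg_struct */
	struct s390_mmap_arg {
		unsigned long addr, len, prot, flags, fd, offset;
	};

	int main(void)
	{
		struct s390_mmap_arg arg = {
			.len	= 4096,
			.prot	= PROT_READ | PROT_WRITE,
			.flags	= MAP_PRIVATE | MAP_ANONYMOUS,
			.fd	= (unsigned long)-1,
		};
		/* the syscall takes a single pointer to the block */
		void *p = (void *)syscall(SYS_mmap, &arg);

		if (p == MAP_FAILED)
			return 1;
		strcpy(p, "mapped");
		puts(p);
		return 0;
	}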
+ */ + +struct s390_mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(mmap2, struct s390_mmap_arg_struct __user *, arg) +{ + struct s390_mmap_arg_struct a; + int error = -EFAULT; + + if (copy_from_user(&a, arg, sizeof(a))) + goto out; + error = ksys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.offset); +out: + return error; +} + +#ifdef CONFIG_SYSVIPC +/* + * sys_ipc() is the de-multiplexer for the SysV IPC calls. + */ +SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, + unsigned long, third, void __user *, ptr) +{ + if (call >> 16) + return -EINVAL; + /* The s390 sys_ipc variant has only five parameters instead of six + * like the generic variant. The only difference is the handling of + * the SEMTIMEDOP subcall where on s390 the third parameter is used + * as a pointer to a struct timespec where the generic variant uses + * the fifth parameter. + * Therefore we can call the generic variant by simply passing the + * third parameter also as fifth parameter. + */ + return ksys_ipc(call, first, second, third, ptr, third); +} +#endif /* CONFIG_SYSVIPC */ + +SYSCALL_DEFINE1(s390_personality, unsigned int, personality) +{ + unsigned int ret = current->personality; + + if (personality(current->personality) == PER_LINUX32 && + personality(personality) == PER_LINUX) + personality |= PER_LINUX32; + + if (personality != 0xffffffff) + set_personality(personality); + + if (personality(ret) == PER_LINUX32) + ret &= ~PER_LINUX32; + + return ret; +} + +SYSCALL_DEFINE0(ni_syscall) +{ + return -ENOSYS; +} + +void do_syscall(struct pt_regs *regs) +{ + unsigned long nr; + + nr = regs->int_code & 0xffff; + if (!nr) { + nr = regs->gprs[1] & 0xffff; + regs->int_code &= ~0xffffUL; + regs->int_code |= nr; + } + + regs->gprs[2] = nr; + + nr = syscall_enter_from_user_mode_work(regs, nr); + + /* + * In the s390 ptrace ABI, both the syscall number and the return value + * use gpr2. However, userspace puts the syscall number either in the + * svc instruction itself, or uses gpr1. To make at least skipping syscalls + * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here + * and if set, the syscall will be skipped. 
+ */ + if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) { + regs->gprs[2] = -ENOSYS; + if (likely(nr < NR_syscalls)) { + regs->gprs[2] = current->thread.sys_call_table[nr]( + regs->orig_gpr2, regs->gprs[3], + regs->gprs[4], regs->gprs[5], + regs->gprs[6], regs->gprs[7]); + } + } else { + clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); + } + syscall_exit_to_user_mode_work(regs); +} + +void noinstr __do_syscall(struct pt_regs *regs, int per_trap) +{ + enter_from_user_mode(regs); + + memcpy(®s->gprs[8], S390_lowcore.save_area_sync, 8 * sizeof(unsigned long)); + memcpy(®s->int_code, &S390_lowcore.svc_ilc, sizeof(regs->int_code)); + regs->psw = S390_lowcore.svc_old_psw; + + update_timer_sys(); + + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + + if (per_trap) + set_thread_flag(TIF_PER_TRAP); + + for (;;) { + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); + do_syscall(regs); + if (!test_pt_regs_flag(regs, PIF_SYSCALL_RESTART)) + break; + local_irq_enable(); + } + exit_to_user_mode(); +} diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 8d1e8a1a97df..db7dd59b570c 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -13,6 +13,8 @@ * 'Traps.c' handles hardware traps and faults after we have saved some * state in 'asm.s'. */ +#include "asm/irqflags.h" +#include "asm/ptrace.h" #include #include #include @@ -23,7 +25,9 @@ #include #include #include +#include #include +#include #include "entry.h" static inline void __user *get_trap_ip(struct pt_regs *regs) @@ -288,3 +292,64 @@ void __init trap_init(void) local_mcck_enable(); test_monitor_call(); } + +void noinstr __do_pgm_check(struct pt_regs *regs) +{ + unsigned long last_break = S390_lowcore.breaking_event_addr; + unsigned int trapnr, syscall_redirect = 0; + irqentry_state_t state; + + regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; + regs->int_parm_long = S390_lowcore.trans_exc_code; + + state = irqentry_enter(regs); + + if (user_mode(regs)) { + update_timer_sys(); + if (last_break < 4096) + last_break = 1; + current->thread.last_break = last_break; + regs->args[0] = last_break; + } + + if (S390_lowcore.pgm_code & 0x0200) { + /* transaction abort */ + memcpy(¤t->thread.trap_tdb, &S390_lowcore.pgm_tdb, 256); + } + + if (S390_lowcore.pgm_code & PGM_INT_CODE_PER) { + if (user_mode(regs)) { + struct per_event *ev = ¤t->thread.per_event; + + set_thread_flag(TIF_PER_TRAP); + ev->address = S390_lowcore.per_address; + ev->cause = *(u16 *)&S390_lowcore.per_code; + ev->paid = S390_lowcore.per_access_id; + } else { + /* PER event in kernel is kprobes */ + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + do_per_trap(regs); + goto out; + } + } + + if (!irqs_disabled_flags(regs->psw.mask)) + trace_hardirqs_on(); + __arch_local_irq_ssm(regs->psw.mask & ~PSW_MASK_PER); + + trapnr = regs->int_code & PGM_INT_CODE_MASK; + if (trapnr) + pgm_check_table[trapnr](regs); + syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL); +out: + local_irq_disable(); + irqentry_exit(regs, state); + + if (syscall_redirect) { + enter_from_user_mode(regs); + local_irq_enable(); + regs->orig_gpr2 = regs->gprs[2]; + do_syscall(regs); + exit_to_user_mode(); + } +} diff --git a/arch/s390/kernel/uprobes.c b/arch/s390/kernel/uprobes.c index 5007fac01bb5..bbf8622bbf5d 100644 --- a/arch/s390/kernel/uprobes.c +++ b/arch/s390/kernel/uprobes.c @@ -32,7 +32,7 @@ int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) return -EINVAL; if (!is_compat_task() && psw_bits(regs->psw).eaba == 
PSW_BITS_AMODE_31BIT) return -EINVAL; - clear_pt_regs_flag(regs, PIF_PER_TRAP); + clear_thread_flag(TIF_PER_TRAP); auprobe->saved_per = psw_bits(regs->psw).per; auprobe->saved_int_code = regs->int_code; regs->int_code = UPROBE_TRAP_NR; @@ -103,7 +103,7 @@ int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) /* fix per address */ current->thread.per_event.address = utask->vaddr; /* trigger per event */ - set_pt_regs_flag(regs, PIF_PER_TRAP); + set_thread_flag(TIF_PER_TRAP); } return 0; } @@ -259,7 +259,7 @@ static void sim_stor_event(struct pt_regs *regs, void *addr, int len) return; current->thread.per_event.address = regs->psw.addr; current->thread.per_event.cause = PER_EVENT_STORE >> 16; - set_pt_regs_flag(regs, PIF_PER_TRAP); + set_thread_flag(TIF_PER_TRAP); } /* -- cgit v1.2.3 From 3a790cc1c9ef1b7b613cf648e6fb756a842caa16 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Mon, 18 Jan 2021 09:35:38 +0100 Subject: s390: pass struct pt_regs instead of registers to syscalls Instead of fetching all registers from struct pt_regs and passing them to the syscall wrappers, let the system call wrappers only fetch the values really required. Signed-off-by: Sven Schnelle Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/processor.h | 4 +- arch/s390/include/asm/syscall_wrapper.h | 114 ++++++++++++++++++++------------ arch/s390/kernel/syscall.c | 8 +-- 3 files changed, 75 insertions(+), 51 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index fa67b66bf144..023a15dc25a3 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -40,9 +40,7 @@ #include #include -typedef long (*sys_call_ptr_t)(unsigned long, unsigned long, - unsigned long, unsigned long, - unsigned long, unsigned long); +typedef long (*sys_call_ptr_t)(struct pt_regs *regs); static inline void set_cpu_flag(int flag) { diff --git a/arch/s390/include/asm/syscall_wrapper.h b/arch/s390/include/asm/syscall_wrapper.h index 5364bfc866e0..ad2c996e7e93 100644 --- a/arch/s390/include/asm/syscall_wrapper.h +++ b/arch/s390/include/asm/syscall_wrapper.h @@ -7,6 +7,33 @@ #ifndef _ASM_S390_SYSCALL_WRAPPER_H #define _ASM_S390_SYSCALL_WRAPPER_H +#define __SC_TYPE(t, a) t + +#define SYSCALL_PT_ARG6(regs, m, t1, t2, t3, t4, t5, t6)\ + SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5), \ + m(t6, (regs->gprs[7])) + +#define SYSCALL_PT_ARG5(regs, m, t1, t2, t3, t4, t5) \ + SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4), \ + m(t5, (regs->gprs[6])) + +#define SYSCALL_PT_ARG4(regs, m, t1, t2, t3, t4) \ + SYSCALL_PT_ARG3(regs, m, t1, t2, t3), \ + m(t4, (regs->gprs[5])) + +#define SYSCALL_PT_ARG3(regs, m, t1, t2, t3) \ + SYSCALL_PT_ARG2(regs, m, t1, t2), \ + m(t3, (regs->gprs[4])) + +#define SYSCALL_PT_ARG2(regs, m, t1, t2) \ + SYSCALL_PT_ARG1(regs, m, t1), \ + m(t2, (regs->gprs[3])) + +#define SYSCALL_PT_ARG1(regs, m, t1) \ + m(t1, (regs->orig_gpr2)) + +#define SYSCALL_PT_ARGS(x, ...) SYSCALL_PT_ARG##x(__VA_ARGS__) + #ifdef CONFIG_COMPAT #define __SC_COMPAT_TYPE(t, a) \ __typeof(__builtin_choose_expr(sizeof(t) > 4, 0L, (t)0)) a @@ -29,14 +56,15 @@ (t)__ReS; \ }) -#define __S390_SYS_STUBx(x, name, ...) 
\ - long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ - long __s390_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ - { \ - long ret = __s390x_sys##name(__MAP(x,__SC_COMPAT_CAST,__VA_ARGS__));\ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ +#define __S390_SYS_STUBx(x, name, ...) \ + long __s390_sys##name(struct pt_regs *regs); \ + ALLOW_ERROR_INJECTION(__s390_sys##name, ERRNO); \ + long __s390_sys##name(struct pt_regs *regs) \ + { \ + long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \ + __SC_COMPAT_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + return ret; \ } /* @@ -65,23 +93,24 @@ SYSCALL_ALIAS(__s390x_sys_##name, sys_ni_posix_timers); \ SYSCALL_ALIAS(__s390_sys_##name, sys_ni_posix_timers) -#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments");\ - long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - long __s390_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ - __attribute__((alias(__stringify(__se_compat_sys##name)))); \ - ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ - static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ - long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - long __se_compat_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ - { \ - long ret = __do_compat_sys##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__));\ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } \ - __diag_pop(); \ +#define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments"); \ + long __s390_compat_sys##name(struct pt_regs *regs); \ + long __s390_compat_sys##name(struct pt_regs *regs) \ + __attribute__((alias(__stringify(__se_compat_sys##name)))); \ + ALLOW_ERROR_INJECTION(__s390_compat_sys##name, ERRNO); \ + static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + long __se_compat_sys##name(struct pt_regs *regs); \ + long __se_compat_sys##name(struct pt_regs *regs) \ + { \ + long ret = __do_compat_sys##name(SYSCALL_PT_ARGS(x, regs, __SC_DELOUSE, \ + __MAP(x, __SC_TYPE, __VA_ARGS__))); \ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + return ret; \ + } \ + __diag_pop(); \ static inline long __do_compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) /* @@ -113,23 +142,24 @@ #endif /* CONFIG_COMPAT */ -#define __SYSCALL_DEFINEx(x, name, ...) \ - __diag_push(); \ - __diag_ignore(GCC, 8, "-Wattribute-alias", \ - "Type aliasing is used to sanitize syscall arguments");\ - long __s390x_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \ - __attribute__((alias(__stringify(__se_sys##name)))); \ - ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ - long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \ - static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ - __S390_SYS_STUBx(x, name, __VA_ARGS__) \ - long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ - { \ - long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ - __MAP(x,__SC_TEST,__VA_ARGS__); \ - return ret; \ - } \ - __diag_pop(); \ +#define __SYSCALL_DEFINEx(x, name, ...) 
\ + __diag_push(); \ + __diag_ignore(GCC, 8, "-Wattribute-alias", \ + "Type aliasing is used to sanitize syscall arguments"); \ + long __s390x_sys##name(struct pt_regs *regs) \ + __attribute__((alias(__stringify(__se_sys##name)))); \ + ALLOW_ERROR_INJECTION(__s390x_sys##name, ERRNO); \ + static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + long __se_sys##name(struct pt_regs *regs); \ + __S390_SYS_STUBx(x, name, __VA_ARGS__) \ + long __se_sys##name(struct pt_regs *regs) \ + { \ + long ret = __do_sys##name(SYSCALL_PT_ARGS(x, regs, \ + __SC_CAST, __MAP(x, __SC_TYPE, __VA_ARGS__))); \ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + return ret; \ + } \ + __diag_pop(); \ static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* _ASM_X86_SYSCALL_WRAPPER_H */ diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 25c0fb19b0a5..bc8e650e377d 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -132,12 +132,8 @@ void do_syscall(struct pt_regs *regs) */ if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) { regs->gprs[2] = -ENOSYS; - if (likely(nr < NR_syscalls)) { - regs->gprs[2] = current->thread.sys_call_table[nr]( - regs->orig_gpr2, regs->gprs[3], - regs->gprs[4], regs->gprs[5], - regs->gprs[6], regs->gprs[7]); - } + if (likely(nr < NR_syscalls)) + regs->gprs[2] = current->thread.sys_call_table[nr](regs); } else { clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); } -- cgit v1.2.3 From c1971eae30cfc0c239ffb9bb3152d750854e05f2 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 21 Jan 2021 10:36:13 +0100 Subject: s390: add missing include to arch/s390/kernel/signal.c This fixes the following warning: CHECK linux/arch/s390/kernel/signal.c linux/arch/s390/kernel/signal.c:465:6: warning: symbol 'arch_do_signal_or_restart' was not declared. Should it be static? Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index fce1b2a28a40..90163e6184f5 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From 1432cfe69e25819d96f653a4a44dad41e1163a83 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 19:44:18 +0100 Subject: s390/vdso: fix vdso data page definition The vdso data page actually contains an array. Fix that. This doesn't fix a real bug, just reflects reality. 
Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 8bc269c55fd3..c6aeddcd687d 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -87,10 +87,10 @@ __setup("vdso=", vdso_setup); * The vdso data page */ static union { - struct vdso_data data; + struct vdso_data data[CS_BASES]; u8 page[PAGE_SIZE]; } vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = (struct vdso_data *)&vdso_data_store.data; +struct vdso_data *vdso_data = vdso_data_store.data; void vdso_getcpu_init(void) { -- cgit v1.2.3 From 96c0c7ae5266ec347041312ae22d947b5371e5b3 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 19:51:34 +0100 Subject: s390/vdso: convert vdso_init() to arch_initcall Convert vdso_init() to arch_initcall like it is on all other architectures. This requires removing the vdso_getcpu_init() call from vdso_init() since it must be called before smp is enabled. vdso_getcpu_init() is now an early_initcall like on powerpc. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/vdso.h | 2 +- arch/s390/kernel/vdso.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index f46c2596c21f..e4ea142a082c 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -13,7 +13,7 @@ extern struct vdso_data *vdso_data; -void vdso_getcpu_init(void); +int vdso_getcpu_init(void); #endif /* __ASSEMBLY__ */ #endif /* __S390_VDSO_H__ */ diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index c6aeddcd687d..0bb287ae0f04 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -92,10 +92,12 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = vdso_data_store.data; -void vdso_getcpu_init(void) +int vdso_getcpu_init(void) { set_tod_programmable_field(smp_processor_id()); + return 0; } +early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ /* * This is called from binfmt_elf, we create the special vma for the @@ -167,7 +169,6 @@ static int __init vdso_init(void) { int i; - vdso_getcpu_init(); /* Calculate the size of the 64 bit vDSO */ vdso64_pages = ((&vdso64_end - &vdso64_start + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; @@ -188,4 +189,4 @@ static int __init vdso_init(void) return 0; } -early_initcall(vdso_init); +arch_initcall(vdso_init); -- cgit v1.2.3 From ea44de691ebad701c849b174dabd376ed6d7d1ae Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 20:04:08 +0100 Subject: s390/vdso: simplify vdso size calculation The vdso is (and must be) page aligned and its size must also be a multiple of PAGE_SIZE. Therefore there is no need to round upwards.
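As a quick sanity check of the arithmetic (illustrative userspace snippet, not kernel code): for a length that is already a PAGE_SIZE multiple, the round-up idiom and the plain shift agree, so the simpler form is safe:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)

int main(void)
{
	unsigned long len = 3 * PAGE_SIZE;	/* the linker guarantees a multiple */
	unsigned long rounded = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
	unsigned long exact = len >> PAGE_SHIFT;

	assert(rounded == exact);	/* equal iff len % PAGE_SIZE == 0 */
	printf("%lu pages either way\n", exact);
	return 0;
}
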
Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 0bb287ae0f04..7075459aed51 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -170,8 +170,7 @@ static int __init vdso_init(void) int i; /* Calculate the size of the 64 bit vDSO */ - vdso64_pages = ((&vdso64_end - &vdso64_start - + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1; + vdso64_pages = ((&vdso64_end - &vdso64_start) >> PAGE_SHIFT) + 1; /* Make sure pages are in the correct state */ vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), -- cgit v1.2.3 From e1eac1947bae72eff74925b2fb82b93ded11ae6a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 20:08:40 +0100 Subject: s390/vdso: remove BUG_ON() Handle allocation error gracefully and simply disable vdso instead of leaving the system in an undefined state. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 7075459aed51..f06791c085e7 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -175,7 +175,10 @@ static int __init vdso_init(void) /* Make sure pages are in the correct state */ vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), GFP_KERNEL); - BUG_ON(vdso64_pagelist == NULL); + if (!vdso64_pagelist) { + vdso_enabled = 0; + return -ENOMEM; + } for (i = 0; i < vdso64_pages - 1; i++) { struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); get_page(pg); -- cgit v1.2.3 From 5ffd9af0fb611069f0e390b568a6460ff2c4122c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 20:10:27 +0100 Subject: s390/vdso: remove superfluous check vdso_pages (aka vdso64_pages) is never 0, therefore remove the check. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index f06791c085e7..da18ba855099 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -118,13 +118,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return 0; vdso_pages = vdso64_pages; - /* - * vDSO has a problem and was disabled, just don't "enable" it for - * the process - */ - if (vdso_pages == 0) - return 0; - /* * pick a base address for the vDSO in process space. We try to put * it at vdso_base which is the "natural" base for it, but we might -- cgit v1.2.3 From 6755270b5ee28c7699f80216f7781557c1c2eb40 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 20:22:29 +0100 Subject: s390/vdso: remove superfluous variables A few local variables exist only so the contents of a global variable can be copied to them, and use that value only for reading. Just remove them and rename some global variables. Also change vdso64_[start|end] to be character arrays to be consistent with other architectures, and get rid of the global variable vdso64_kbase. 
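One reason for the character-array declaration, sketched under the usual linker-symbol idiom (vdso_size() is an illustrative helper, not taken from the patch):

/* Linker-provided symbols marking the vdso image boundaries. */
extern char vdso64_start[], vdso64_end[];

/* Array declarations decay to pointers, so plain pointer arithmetic works. */
static unsigned long vdso_size(void)
{
	return vdso64_end - vdso64_start;
}

/* With 'extern char vdso64_start;' the same expression would subtract the
 * values of the first bytes instead of their addresses, compiling silently
 * but computing garbage; every use would need an explicit '&'. */
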
Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 35 +++++++++++++---------------------- 1 file changed, 13 insertions(+), 22 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index da18ba855099..f4e1e4580b77 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -31,10 +31,9 @@ #include #include -extern char vdso64_start, vdso64_end; -static void *vdso64_kbase = &vdso64_start; -static unsigned int vdso64_pages; -static struct page **vdso64_pagelist; +extern char vdso64_start[], vdso64_end[]; +static unsigned int vdso_pages; +static struct page **vdso_pagelist; /* * Should the kernel map a VDSO page into processes and pass its @@ -45,12 +44,6 @@ unsigned int __read_mostly vdso_enabled = 1; static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct page **vdso_pagelist; - unsigned long vdso_pages; - - vdso_pagelist = vdso64_pagelist; - vdso_pages = vdso64_pages; - if (vmf->pgoff >= vdso_pages) return VM_FAULT_SIGBUS; @@ -107,7 +100,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_pages; unsigned long vdso_base; int rc; @@ -117,7 +109,6 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (is_compat_task()) return 0; - vdso_pages = vdso64_pages; /* * pick a base address for the vDSO in process space. We try to put * it at vdso_base which is the "natural" base for it, but we might @@ -162,23 +153,23 @@ static int __init vdso_init(void) { int i; - /* Calculate the size of the 64 bit vDSO */ - vdso64_pages = ((&vdso64_end - &vdso64_start) >> PAGE_SHIFT) + 1; + /* Calculate the size of the vDSO */ + vdso_pages = ((vdso64_end - vdso64_start) >> PAGE_SHIFT) + 1; /* Make sure pages are in the correct state */ - vdso64_pagelist = kcalloc(vdso64_pages + 1, sizeof(struct page *), - GFP_KERNEL); - if (!vdso64_pagelist) { + vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), + GFP_KERNEL); + if (!vdso_pagelist) { vdso_enabled = 0; return -ENOMEM; } - for (i = 0; i < vdso64_pages - 1; i++) { - struct page *pg = virt_to_page(vdso64_kbase + i*PAGE_SIZE); + for (i = 0; i < vdso_pages - 1; i++) { + struct page *pg = virt_to_page(vdso64_start + i * PAGE_SIZE); get_page(pg); - vdso64_pagelist[i] = pg; + vdso_pagelist[i] = pg; } - vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data); - vdso64_pagelist[vdso64_pages] = NULL; + vdso_pagelist[vdso_pages - 1] = virt_to_page(vdso_data); + vdso_pagelist[vdso_pages] = NULL; get_page(virt_to_page(vdso_data)); -- cgit v1.2.3 From 8d4be7f318bc69cb63b712a4fd0dfd8eebe64d0b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 20:57:08 +0100 Subject: s390/vdso: misc simple code changes - remove unneeded includes - move functions around - remove obvious and/or incorrect comments - shorten some if conditions No functional change. 
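The "shorten some if conditions" item follows a common kernel error-path shape: capture the candidate return value first, then branch once on the error test. A self-contained sketch with stand-in definitions (IS_ERR_VALUE and fake_get_unmapped_area() here are simplified stand-ins, not the kernel's):

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO	4095	/* stand-in for the kernel definition */
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static long fake_get_unmapped_area(int fail)
{
	return fail ? -ENOMEM : 0x7f0000000000L;
}

static long map_vdso(int fail)
{
	long rc, base;

	base = fake_get_unmapped_area(fail);
	rc = base;			/* candidate return value first */
	if (IS_ERR_VALUE(base))		/* then a single branch on failure */
		goto out;
	rc = 0;
out:
	return rc;			/* nothing to unlock in this sketch */
}

int main(void)
{
	printf("ok=%ld err=%ld\n", map_vdso(0), map_vdso(1));
	return 0;
}
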
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 102 ++++++++++++++---------------------------------- 1 file changed, 30 insertions(+), 72 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index f4e1e4580b77..a7eed8bf3959 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -6,41 +6,41 @@ * Author(s): Martin Schwidefsky (schwidefsky@de.ibm.com) */ -#include +#include +#include +#include #include -#include +#include #include #include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include +#include #include -#include -#include -#include -#include -#include #include -#include -#include extern char vdso64_start[], vdso64_end[]; static unsigned int vdso_pages; static struct page **vdso_pagelist; -/* - * Should the kernel map a VDSO page into processes and pass its - * address down to glibc upon exec()? - */ +static union { + struct vdso_data data[CS_BASES]; + u8 page[PAGE_SIZE]; +} vdso_data_store __page_aligned_data; + +struct vdso_data *vdso_data = vdso_data_store.data; + unsigned int __read_mostly vdso_enabled = 1; +static int __init vdso_setup(char *str) +{ + bool enabled; + + if (!kstrtobool(str, &enabled)) + vdso_enabled = enabled; + return 1; +} +__setup("vdso=", vdso_setup); + static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -56,7 +56,6 @@ static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) { current->mm->context.vdso_base = vma->vm_start; - return 0; } @@ -66,25 +65,6 @@ static const struct vm_special_mapping vdso_mapping = { .mremap = vdso_mremap, }; -static int __init vdso_setup(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - vdso_enabled = enabled; - return 1; -} -__setup("vdso=", vdso_setup); - -/* - * The vdso data page - */ -static union { - struct vdso_data data[CS_BASES]; - u8 page[PAGE_SIZE]; -} vdso_data_store __page_aligned_data; -struct vdso_data *vdso_data = vdso_data_store.data; - int vdso_getcpu_init(void) { set_tod_programmable_field(smp_processor_id()); @@ -92,10 +72,6 @@ int vdso_getcpu_init(void) } early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ -/* - * This is called from binfmt_elf, we create the special vma for the - * vDSO and insert it into the mm struct tree - */ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { struct mm_struct *mm = current->mm; @@ -103,25 +79,14 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) unsigned long vdso_base; int rc; - if (!vdso_enabled) - return 0; - - if (is_compat_task()) + if (!vdso_enabled || is_compat_task()) return 0; - - /* - * pick a base address for the vDSO in process space. We try to put - * it at vdso_base which is the "natural" base for it, but we might - * fail and end up putting it elsewhere. - */ if (mmap_write_lock_killable(mm)) return -EINTR; vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0); - if (IS_ERR_VALUE(vdso_base)) { - rc = vdso_base; - goto out_up; - } - + rc = vdso_base; + if (IS_ERR_VALUE(vdso_base)) + goto out; /* * our vma flags don't have VM_WRITE so by default, the process * isn't allowed to write those pages. 
@@ -136,15 +101,12 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, &vdso_mapping); - if (IS_ERR(vma)) { - rc = PTR_ERR(vma); - goto out_up; - } - + rc = PTR_ERR(vma); + if (IS_ERR(vma)) + goto out; current->mm->context.vdso_base = vdso_base; rc = 0; - -out_up: +out: mmap_write_unlock(mm); return rc; } @@ -153,9 +115,7 @@ static int __init vdso_init(void) { int i; - /* Calculate the size of the vDSO */ vdso_pages = ((vdso64_end - vdso64_start) >> PAGE_SHIFT) + 1; - /* Make sure pages are in the correct state */ vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); @@ -170,9 +130,7 @@ static int __init vdso_init(void) } vdso_pagelist[vdso_pages - 1] = virt_to_page(vdso_data); vdso_pagelist[vdso_pages] = NULL; - get_page(virt_to_page(vdso_data)); - return 0; } arch_initcall(vdso_init); -- cgit v1.2.3 From dfc11c98763aed6b2fa17d5d23f28a429ab9877b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 21:36:14 +0100 Subject: s390/vdso: get rid of vdso_fault Implement vdso mapping similar to arm64 and powerpc. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 35 +++++++++-------------------------- 1 file changed, 9 insertions(+), 26 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index a7eed8bf3959..86e7a3921348 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -20,7 +20,6 @@ extern char vdso64_start[], vdso64_end[]; static unsigned int vdso_pages; -static struct page **vdso_pagelist; static union { struct vdso_data data[CS_BASES]; @@ -41,17 +40,6 @@ static int __init vdso_setup(char *str) } __setup("vdso=", vdso_setup); -static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, struct vm_fault *vmf) -{ - if (vmf->pgoff >= vdso_pages) - return VM_FAULT_SIGBUS; - - vmf->page = vdso_pagelist[vmf->pgoff]; - get_page(vmf->page); - return 0; -} - static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) { @@ -59,9 +47,8 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } -static const struct vm_special_mapping vdso_mapping = { +static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", - .fault = vdso_fault, .mremap = vdso_mremap, }; @@ -113,24 +100,20 @@ out: static int __init vdso_init(void) { + struct page **pages; int i; vdso_pages = ((vdso64_end - vdso64_start) >> PAGE_SHIFT) + 1; - /* Make sure pages are in the correct state */ - vdso_pagelist = kcalloc(vdso_pages + 1, sizeof(struct page *), - GFP_KERNEL); - if (!vdso_pagelist) { + pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pages) { vdso_enabled = 0; return -ENOMEM; } - for (i = 0; i < vdso_pages - 1; i++) { - struct page *pg = virt_to_page(vdso64_start + i * PAGE_SIZE); - get_page(pg); - vdso_pagelist[i] = pg; - } - vdso_pagelist[vdso_pages - 1] = virt_to_page(vdso_data); - vdso_pagelist[vdso_pages] = NULL; - get_page(virt_to_page(vdso_data)); + for (i = 0; i < vdso_pages - 1; i++) + pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE); + pages[vdso_pages - 1] = virt_to_page(vdso_data); + pages[vdso_pages] = NULL; + vdso_mapping.pages = pages; return 0; } arch_initcall(vdso_init); -- cgit v1.2.3 From 5056c2c53a22a61facb1a551bf736df9b06e513a Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 24 Jan 2021 22:01:16 +0100 Subject: s390/vdso: put 
vdso datapage in a separate vma Add a separate "[vvar]" mapping for the vdso datapage, since it doesn't need to be executable or COW-able. This is actually the s390 implementation of commit 871549385278 ("arm64: vdso: put vdso datapage in a separate vma") Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 55 +++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 86e7a3921348..968b263f64b4 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -40,6 +40,14 @@ static int __init vdso_setup(char *str) } __setup("vdso=", vdso_setup); +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + if (vmf->pgoff == 0) + return vmf_insert_pfn(vma, vmf->address, virt_to_pfn(vdso_data)); + return VM_FAULT_SIGBUS; +} + static int vdso_mremap(const struct vm_special_mapping *sm, struct vm_area_struct *vma) { @@ -47,6 +55,11 @@ static int vdso_mremap(const struct vm_special_mapping *sm, return 0; } +static struct vm_special_mapping vvar_mapping = { + .name = "[vvar]", + .fault = vvar_fault, +}; + static struct vm_special_mapping vdso_mapping = { .name = "[vdso]", .mremap = vdso_mremap, @@ -61,38 +74,41 @@ early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { + unsigned long addr, vdso_text_start, vdso_text_len, vdso_mapping_len; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long vdso_base; int rc; if (!vdso_enabled || is_compat_task()) return 0; if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_base = get_unmapped_area(NULL, 0, vdso_pages << PAGE_SHIFT, 0, 0); - rc = vdso_base; - if (IS_ERR_VALUE(vdso_base)) + vdso_text_len = vdso_pages << PAGE_SHIFT; + vdso_mapping_len = vdso_text_len + PAGE_SIZE; + addr = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); + rc = addr; + if (IS_ERR_VALUE(addr)) goto out; - /* - * our vma flags don't have VM_WRITE so by default, the process - * isn't allowed to write those pages. - * gdb can break that with ptrace interface, and thus trigger COW - * on those pages but it's then your responsibility to never do that - * on the "data" page of the vDSO or you'll stop getting kernel - * updates and your nice userland gettimeofday will be totally dead. - * It's fine to use that for setting breakpoints in the vDSO code - * pages though. 
- */ - vma = _install_special_mapping(mm, vdso_base, vdso_pages << PAGE_SHIFT, + /* VM_MAYWRITE for COW so gdb can set breakpoints */ + vdso_text_start = addr; + vma = _install_special_mapping(mm, addr, vdso_text_len, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, &vdso_mapping); rc = PTR_ERR(vma); if (IS_ERR(vma)) goto out; - current->mm->context.vdso_base = vdso_base; - rc = 0; + addr += vdso_text_len; + vma = _install_special_mapping(mm, addr, PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_PFNMAP, + &vvar_mapping); + if (IS_ERR(vma)) { + do_munmap(mm, vdso_text_start, vdso_text_len, NULL); + rc = PTR_ERR(vma); + } else { + current->mm->context.vdso_base = vdso_text_start; + rc = 0; + } out: mmap_write_unlock(mm); return rc; @@ -103,15 +119,14 @@ static int __init vdso_init(void) struct page **pages; int i; - vdso_pages = ((vdso64_end - vdso64_start) >> PAGE_SHIFT) + 1; + vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT; pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); if (!pages) { vdso_enabled = 0; return -ENOMEM; } - for (i = 0; i < vdso_pages - 1; i++) + for (i = 0; i < vdso_pages; i++) pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE); - pages[vdso_pages - 1] = virt_to_page(vdso_data); pages[vdso_pages] = NULL; vdso_mapping.pages = pages; return 0; -- cgit v1.2.3 From 214b3564869cd93258616411962a6fceef2c5ec7 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 5 Feb 2021 16:09:14 +0100 Subject: s390/vdso: move data page before code pages For consistency with x86 and arm64 move the data page before code pages. Similar to commit 601255ae3c98 ("arm64: vdso: move data page before code pages"). Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 28 ++++++++++++++-------------- arch/s390/kernel/vdso64/vdso64.lds.S | 4 +--- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 968b263f64b4..31920b76ae6d 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -74,7 +74,8 @@ early_initcall(vdso_getcpu_init); /* Must be called before SMP init */ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { - unsigned long addr, vdso_text_start, vdso_text_len, vdso_mapping_len; + unsigned long vdso_text_len, vdso_mapping_len; + unsigned long vvar_start, vdso_text_start; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; @@ -85,25 +86,24 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) return -EINTR; vdso_text_len = vdso_pages << PAGE_SHIFT; vdso_mapping_len = vdso_text_len + PAGE_SIZE; - addr = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); - rc = addr; - if (IS_ERR_VALUE(addr)) + vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); + rc = vvar_start; + if (IS_ERR_VALUE(vvar_start)) goto out; + vma = _install_special_mapping(mm, vvar_start, PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_PFNMAP, + &vvar_mapping); + rc = PTR_ERR(vma); + if (IS_ERR(vma)) + goto out; + vdso_text_start = vvar_start + PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ - vdso_text_start = addr; - vma = _install_special_mapping(mm, addr, vdso_text_len, + vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, &vdso_mapping); - rc = PTR_ERR(vma); - if (IS_ERR(vma)) - goto out; - addr += vdso_text_len; - vma = _install_special_mapping(mm, addr, PAGE_SIZE, - 
VM_READ|VM_MAYREAD|VM_PFNMAP, - &vvar_mapping); if (IS_ERR(vma)) { - do_munmap(mm, vdso_text_start, vdso_text_len, NULL); + do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); } else { current->mm->context.vdso_base = vdso_text_start; diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 7bde3909290f..99063b4c6e27 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -13,6 +13,7 @@ ENTRY(_start) SECTIONS { + PROVIDE(_vdso_data = . - PAGE_SIZE); . = VDSO64_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text @@ -94,9 +95,6 @@ SECTIONS .debug_ranges 0 : { *(.debug_ranges) } .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } - . = ALIGN(PAGE_SIZE); - PROVIDE(_vdso_data = .); - /DISCARD/ : { *(.note.GNU-stack) *(.branch_lt) -- cgit v1.2.3 From eeab78b05d202f15e58ab10675a4f736a1c9bd29 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Fri, 5 Feb 2021 16:19:32 +0100 Subject: s390/vdso: implement generic vdso time namespace support Implement generic vdso time namespace support which also enables time namespaces for s390. This is quite similar to what arm64 has. Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/Kconfig | 1 + arch/s390/configs/zfcpdump_defconfig | 1 + arch/s390/include/asm/vdso.h | 2 + arch/s390/include/asm/vdso/gettimeofday.h | 7 ++ arch/s390/kernel/vdso.c | 102 ++++++++++++++++++++++++++++-- arch/s390/kernel/vdso64/vdso64.lds.S | 5 +- 6 files changed, 110 insertions(+), 8 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 41a2c58c6e7a..5de9f409e4d0 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -129,6 +129,7 @@ config S390 select GENERIC_PTDUMP select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL + select GENERIC_VDSO_TIME_NS select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL diff --git a/arch/s390/configs/zfcpdump_defconfig b/arch/s390/configs/zfcpdump_defconfig index 0200ccf10ace..acf982a2ae4c 100644 --- a/arch/s390/configs/zfcpdump_defconfig +++ b/arch/s390/configs/zfcpdump_defconfig @@ -3,6 +3,7 @@ CONFIG_NO_HZ_IDLE=y CONFIG_HIGH_RES_TIMERS=y # CONFIG_CPU_ISOLATION is not set # CONFIG_UTS_NS is not set +# CONFIG_TIME_NS is not set # CONFIG_PID_NS is not set # CONFIG_NET_NS is not set CONFIG_BLK_DEV_INITRD=y diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index e4ea142a082c..b45e3dddd2c2 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -7,6 +7,8 @@ /* Default link address for the vDSO */ #define VDSO64_LBASE 0 +#define __VVAR_PAGES 2 + #define VDSO_VERSION_STRING LINUX_2.6.29 #ifndef __ASSEMBLY__ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index c92b0dec0d79..ed89ef742530 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -67,4 +67,11 @@ long clock_getres_fallback(clockid_t clkid, struct __kernel_timespec *ts) return r2; } +#ifdef CONFIG_TIME_NS +static __always_inline const struct vdso_data *__arch_get_timens_vdso_data(void) +{ + return _timens_data; +} +#endif + #endif diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 31920b76ae6d..dd967af29d2b 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -15,12 +15,15 @@ #include #include #include +#include #include #include extern char vdso64_start[], vdso64_end[]; static 
unsigned int vdso_pages; +static struct vm_special_mapping vvar_mapping; + static union { struct vdso_data data[CS_BASES]; u8 page[PAGE_SIZE]; @@ -28,6 +31,12 @@ static union { struct vdso_data *vdso_data = vdso_data_store.data; +enum vvar_pages { + VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, + VVAR_NR_PAGES, +}; + unsigned int __read_mostly vdso_enabled = 1; static int __init vdso_setup(char *str) @@ -40,12 +49,89 @@ static int __init vdso_setup(char *str) } __setup("vdso=", vdso_setup); +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page); +} + +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops(). + * For more details check_vma_flags() and __access_remote_vm() + */ + WARN(1, "vvar_page accessed remotely"); + return NULL; +} + +/* + * The VVAR page layout depends on whether a task belongs to the root or + * non-root time namespace. Whenever a task changes its namespace, the VVAR + * page tables are cleared and then they will be re-faulted with a + * corresponding layout. + * See also the comment near timens_setup_vdso_data() for details. + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + mmap_read_lock(mm); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (!vma_is_special_mapping(vma, &vvar_mapping)) + continue; + zap_page_range(vma, vma->vm_start, size); + break; + } + mmap_read_unlock(mm); + return 0; +} +#else +static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} +#endif + static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { - if (vmf->pgoff == 0) - return vmf_insert_pfn(vma, vmf->address, virt_to_pfn(vdso_data)); - return VM_FAULT_SIGBUS; + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long pfn; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = virt_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). 
+ */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = virt_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + return vmf_insert_pfn(vma, vmf->address, pfn); } static int vdso_mremap(const struct vm_special_mapping *sm, @@ -80,23 +166,25 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) struct vm_area_struct *vma; int rc; + BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); if (!vdso_enabled || is_compat_task()) return 0; if (mmap_write_lock_killable(mm)) return -EINTR; vdso_text_len = vdso_pages << PAGE_SHIFT; - vdso_mapping_len = vdso_text_len + PAGE_SIZE; + vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); rc = vvar_start; if (IS_ERR_VALUE(vvar_start)) goto out; - vma = _install_special_mapping(mm, vvar_start, PAGE_SIZE, - VM_READ|VM_MAYREAD|VM_PFNMAP, + vma = _install_special_mapping(mm, vvar_start, VVAR_NR_PAGES*PAGE_SIZE, + VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| + VM_PFNMAP, &vvar_mapping); rc = PTR_ERR(vma); if (IS_ERR(vma)) goto out; - vdso_text_start = vvar_start + PAGE_SIZE; + vdso_text_start = vvar_start + VVAR_NR_PAGES * PAGE_SIZE; /* VM_MAYWRITE for COW so gdb can set breakpoints */ vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 99063b4c6e27..518f1ea405f4 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -13,7 +13,10 @@ ENTRY(_start) SECTIONS { - PROVIDE(_vdso_data = . - PAGE_SIZE); + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = VDSO64_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text -- cgit v1.2.3 From fe8344a09272f3a8b71c2ad72fdf8ef3eaef71e5 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 31 Jan 2021 23:07:42 +0100 Subject: s390/vdso: on timens page fault prefault also VVAR page This is the s390 variant of commit e6b28ec65b6d ("x86/vdso: On timens page fault prefault also VVAR page"). Reviewed-by: Alexander Gordeev Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vdso.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index dd967af29d2b..8c4e07d533c8 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -105,14 +105,23 @@ static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) { struct page *timens_page = find_timens_vvar_page(vma); - unsigned long pfn; + unsigned long addr, pfn; + vm_fault_t err; switch (vmf->pgoff) { case VVAR_DATA_PAGE_OFFSET: - if (timens_page) + pfn = virt_to_pfn(vdso_data); + if (timens_page) { + /* + * Fault in VVAR page too, since it will be accessed + * to get clock data anyway. 
+ */ + addr = vmf->address + VVAR_TIMENS_PAGE_OFFSET * PAGE_SIZE; + err = vmf_insert_pfn(vma, addr, pfn); + if (unlikely(err & VM_FAULT_ERROR)) + return err; pfn = page_to_pfn(timens_page); - else - pfn = virt_to_pfn(vdso_data); + } break; #ifdef CONFIG_TIME_NS case VVAR_TIMENS_PAGE_OFFSET: -- cgit v1.2.3 From b29c5093820d333eef22f58cd04ec0d089059c39 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 2 Feb 2021 16:45:37 +0100 Subject: s390/vtime: fix inline assembly clobber list The stck/stckf instruction used within the inline assembly within do_account_vtime() changes the condition code. This is not reflected with the clobber list, and therefore might result in incorrect code generation. It seems unlikely that the compiler could generate incorrect code considering the surrounding C code, but it must still be fixed. Cc: Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vtime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 5aaa2ca6a928..978a35ea6081 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -136,7 +136,8 @@ static int do_account_vtime(struct task_struct *tsk) " stck %1" /* Store current tod clock value */ #endif : "=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock)); + "=Q" (S390_lowcore.last_update_clock) + : : "cc"); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; -- cgit v1.2.3 From 683071b02c440eb84d9133dc33bd3d3d37522a5f Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 1 Feb 2021 21:40:22 +0100 Subject: s390/cpum_cf_diag: use get_tod_clock_fast() Use get_tod_clock_fast() instead of store_tod_clock(), since store_tod_clock() can be very slow. Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/perf_cpum_cf_diag.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index e949ab832ed7..6f6b3382edb7 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -119,7 +119,7 @@ static void cf_diag_trailer(struct cf_trailer_entry *te) te->speed = 1; te->clock_base = 1; /* Save clock base */ memcpy(&te->tod_base, &tod_clock_base[1], 8); - store_tod_clock((__u64 *)&te->timestamp); + te->timestamp = get_tod_clock_fast(); } /* -- cgit v1.2.3 From b22446d00af972ef624958a09dcbe85974b701fd Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 1 Feb 2021 21:53:08 +0100 Subject: s390/time: use stcke instead of stck Use STORE CLOCK EXTENDED instead of STORE CLOCK in early tod clock setup. This is just to remove another usage of stck, trying to remove all usages of STORE CLOCK. This doesn't fix anything. Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/include/asm/timex.h | 10 ++++++---- arch/s390/kernel/early.c | 6 +++--- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index c8e244ecdfde..63bf3bd6e83f 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -17,6 +17,8 @@ /* The value of the TOD clock for 1.1.1970. 
*/ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL +#define STORE_CLOCK_EXT_SIZE 16 /* stcke writes 16 bytes */ + extern u64 clock_comparator_max; /* Inline functions for clock register access. */ @@ -32,15 +34,16 @@ static inline int set_tod_clock(__u64 time) return cc; } -static inline int store_tod_clock(__u64 *time) +static inline int store_tod_clock_ext(char *time) { + typedef struct { char _[STORE_CLOCK_EXT_SIZE]; } addrtype; int cc; asm volatile( - " stck %1\n" + " stcke %1\n" " ipm %0\n" " srl %0,28\n" - : "=d" (cc), "=Q" (*time) : : "cc"); + : "=d" (cc), "=Q" (*(addrtype *)time) : : "cc"); return cc; } @@ -144,7 +147,6 @@ static inline void local_tick_enable(unsigned long long comp) } #define CLOCK_TICK_RATE 1193180 /* Underlying HZ */ -#define STORE_CLOCK_EXT_SIZE 16 /* stcke writes 16 bytes */ typedef unsigned long long cycles_t; diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index cc89763a4d3c..ee063b56b5d1 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -35,12 +35,12 @@ static void __init reset_tod_clock(void) { - u64 time; + char time[STORE_CLOCK_EXT_SIZE]; - if (store_tod_clock(&time) == 0) + if (store_tod_clock_ext(time) == 0) return; /* TOD clock not running. Set the clock to Unix Epoch. */ - if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock(&time) != 0) + if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock_ext(time) != 0) disabled_wait(); memset(tod_clock_base, 0, 16); -- cgit v1.2.3 From 78f6570946228d0e1dac5f42f398e3e07924b945 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 2 Feb 2021 13:46:47 +0100 Subject: s390/entry: use cpu alternative for stck/stckf Use a cpu alternative to switch between stck and stckf instead of making it compile time dependent. This will also make kernels compiled for old machines, but running on newer machines, use stckf. Reviewed-by: Christian Borntraeger Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 785425b59ac1..9b3aea98f886 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -106,12 +106,10 @@ _LPP_OFFSET = __LC_LPP 2: la %r11,STACK_FRAME_OVERHEAD(%r15) .endm + # Use STORE CLOCK by default, switch to STORE CLOCK FAST if available. .macro STCK savearea -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - .insn s,0xb27c0000,\savearea # store clock fast -#else - .insn s,0xb2050000,\savearea # store clock -#endif + ALTERNATIVE ".insn s,0xb2050000,\savearea", \ + ".insn s,0xb27c0000,\savearea", 25 .endm /* -- cgit v1.2.3 From 1c7673476b82983768c6a4dd78775f817f0e0f88 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 2 Feb 2021 16:59:50 +0100 Subject: s390/vtime: use cpu alternative for stck/stckf Use a cpu alternative to switch between stck and stckf instead of making it compile time dependent. This will also make kernels compiled for old machines, but running on newer machines, use stckf. 
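Both stck/stckf patches rely on the same boot-time patching mechanism. A conceptual sketch of what the alternatives patcher does; the struct layout and apply_alternatives() below are illustrative assumptions, the real implementation lives in arch/s390/kernel/alternative.c:

#include <string.h>

struct alt_entry {
	void *site;		 /* default instruction bytes in the text */
	const void *replacement; /* bytes to patch in */
	unsigned char len;	 /* instruction length */
	unsigned char facility;	 /* required facility bit; 25 = store-clock-fast */
};

int test_facility(unsigned long nr);	/* real helper from <asm/facility.h> */

static void apply_alternatives(struct alt_entry *a, struct alt_entry *end)
{
	for (; a < end; a++) {
		if (!test_facility(a->facility))
			continue;	/* facility absent: keep the default insn */
		memcpy(a->site, a->replacement, a->len);
	}
}
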
Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/vtime.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 978a35ea6081..73c7afcc0527 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -13,7 +13,7 @@ #include #include #include - +#include #include #include #include @@ -128,16 +128,13 @@ static int do_account_vtime(struct task_struct *tsk) timer = S390_lowcore.last_update_timer; clock = S390_lowcore.last_update_clock; - asm volatile( - " stpt %0\n" /* Store current cpu timer value */ -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - " stckf %1" /* Store current tod clock value */ -#else - " stck %1" /* Store current tod clock value */ -#endif - : "=Q" (S390_lowcore.last_update_timer), - "=Q" (S390_lowcore.last_update_clock) - : : "cc"); + /* Use STORE CLOCK by default, STORE CLOCK FAST if available. */ + alternative_io("stpt %0\n .insn s,0xb2050000,%1\n", + "stpt %0\n .insn s,0xb27c0000,%1\n", + 25, + ASM_OUTPUT2("=Q" (S390_lowcore.last_update_timer), + "=Q" (S390_lowcore.last_update_clock)), + ASM_NO_INPUT_CLOBBER("cc")); clock = S390_lowcore.last_update_clock - clock; timer -= S390_lowcore.last_update_timer; -- cgit v1.2.3 From b0d31159a46787380353426faaad8febc9bef009 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Thu, 28 Jan 2021 13:06:05 +0100 Subject: s390: open code SWITCH_KERNEL macro This is a preparation patch for two later bugfixes. In the past both int_handler and machine check handler used SWITCH_KERNEL to switch to the kernel stack. However, SWITCH_KERNEL doesn't work properly in machine check context. So instead of adding more complexity to this macro, just remove it. Signed-off-by: Sven Schnelle Cc: # v5.8+ Reviewed-by: Heiko Carstens Signed-off-by: Vasily Gorbik --- arch/s390/kernel/entry.S | 74 ++++++++++++++++++++++++++++++------------------ 1 file changed, 46 insertions(+), 28 deletions(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 9b3aea98f886..ed5acf95235f 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -81,32 +81,6 @@ _LPP_OFFSET = __LC_LPP #endif .endm - .macro SWITCH_KERNEL savearea - tmhh %r8,0x0001 # interrupting from user ? - jnz 1f -#if IS_ENABLED(CONFIG_KVM) - lgr %r14,%r9 - larl %r13,.Lsie_gmap - slgr %r14,%r13 - lghi %r13,.Lsie_done - .Lsie_gmap - clgr %r14,%r13 - jhe 0f - lghi %r11,\savearea # inside critical section, do cleanup - brasl %r14,.Lcleanup_sie -#endif -0: CHECK_STACK \savearea - lgr %r11,%r15 - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - stg %r11,__SF_BACKCHAIN(%r15) - j 2f -1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP - lctlg %c1,%c1,__LC_KERNEL_ASCE - lg %r15,__LC_KERNEL_STACK - xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) -2: la %r11,STACK_FRAME_OVERHEAD(%r15) - .endm - - # Use STORE CLOCK by default, switch to STORE CLOCK FAST if available. .macro STCK savearea ALTERNATIVE ".insn s,0xb2050000,\savearea", \ ".insn s,0xb27c0000,\savearea", 25 @@ -413,7 +387,28 @@ ENTRY(\name) stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT lmg %r8,%r9,\lc_old_psw - SWITCH_KERNEL __LC_SAVE_AREA_ASYNC + tmhh %r8,0x0001 # interrupting from user ? 
+ jnz 1f #if IS_ENABLED(CONFIG_KVM) + lgr %r14,%r9 + larl %r13,.Lsie_gmap + slgr %r14,%r13 + lghi %r13,.Lsie_done - .Lsie_gmap + clgr %r14,%r13 + jhe 0f + lghi %r11,__LC_SAVE_AREA_ASYNC # inside critical section, do cleanup + brasl %r14,.Lcleanup_sie #endif +0: CHECK_STACK __LC_SAVE_AREA_ASYNC + lgr %r11,%r15 + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + stg %r11,__SF_BACKCHAIN(%r15) + j 2f +1: BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP + lctlg %c1,%c1,__LC_KERNEL_ASCE + lg %r15,__LC_KERNEL_STACK + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) +2: la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) # clear user controlled registers to prevent speculative use xgr %r0,%r0 @@ -542,7 +537,30 @@ ENTRY(mcck_int_handler) TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic 4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off - SWITCH_KERNEL __LC_GPREGS_SAVE_AREA+64 + tmhh %r8,0x0001 # interrupting from user ? + jnz .Lmcck_user #if IS_ENABLED(CONFIG_KVM) + lgr %r14,%r9 + larl %r13,.Lsie_gmap + slgr %r14,%r13 + lghi %r13,.Lsie_done - .Lsie_gmap + clgr %r14,%r13 + jhe .Lmcck_stack + lghi %r11,__LC_GPREGS_SAVE_AREA+64 # inside critical section, do cleanup + brasl %r14,.Lcleanup_sie +.Lmcck_stack: #endif + CHECK_STACK __LC_GPREGS_SAVE_AREA+64 + lgr %r11,%r15 + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + stg %r11,__SF_BACKCHAIN(%r15) + j 5f .Lmcck_user: + BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP + lctlg %c1,%c1,__LC_KERNEL_ASCE + lg %r15,__LC_KERNEL_STACK + xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) +5: la %r11,STACK_FRAME_OVERHEAD(%r15) .Lmcck_skip: lghi %r14,__LC_GPREGS_SAVE_AREA+64 stmg %r0,%r7,__PT_R0(%r11) -- cgit v1.2.3 From 64985c3a223d15f151204b3aa37e587b9466378d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 10 Feb 2021 13:39:19 +0100 Subject: s390: use WRITE_ONCE when re-allocating async stack The code does: S390_lowcore.async_stack = new + STACK_INIT_OFFSET; But the compiler is free to first assign one value and add the other value later. If an IRQ came in between these two operations, the interrupt handler would run on an invalid stack. Prevent this by using WRITE_ONCE. Signed-off-by: Sven Schnelle Signed-off-by: Vasily Gorbik --- arch/s390/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/s390/kernel') diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index c7feda84edbb..6b004940c4dc 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -346,7 +346,7 @@ static int __init async_stack_realloc(void) new = stack_alloc(); if (!new) panic("Couldn't allocate async stack"); - S390_lowcore.async_stack = new + STACK_INIT_OFFSET; + WRITE_ONCE(S390_lowcore.async_stack, new + STACK_INIT_OFFSET); free_pages(old, THREAD_SIZE_ORDER); return 0; } -- cgit v1.2.3 From b61b1595124a1694501105e5dd488de0c0c6bc2a Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Wed, 3 Feb 2021 09:02:51 +0100 Subject: s390: add stack for machine check handler The previous code used the normal kernel stack for machine checks. This is problematic when a machine check interrupts a system call or interrupt handler right at the beginning where registers are set up. Assume system_call is interrupted at the first instruction and a machine check is triggered. The machine check handler is called, checks the PSW to see whether it is coming from user space, notices that it is already in kernel mode but %r15 still contains the user space stack pointer. This would lead to a kernel crash.
There are basically two ways of fixing that: either use the 'critical
cleanup' approach, which compares the address in the PSW to see whether
execution is already past the point where the stack has been set up, or
use an extra stack for the machine check handler. For simplicity, we will
go with the second approach and allocate an extra stack. This adds some
memory overhead for large systems, but large systems usually have plenty
of memory, so this isn't really a concern. In return, it keeps the
machine check stack setup simple and less error prone.

Fixes: 0b0ed657fe00 ("s390: remove critical section cleanup from entry.S")
Signed-off-by: Sven Schnelle
Cc:  # v5.8+
Reviewed-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/include/asm/lowcore.h | 13 ++++++-------
 arch/s390/kernel/asm-offsets.c  |  1 +
 arch/s390/kernel/entry.S        | 17 +++++++----------
 arch/s390/kernel/setup.c        | 18 ++++++++++++++++--
 arch/s390/kernel/smp.c          | 17 +++++++++++------
 5 files changed, 41 insertions(+), 25 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 4d65c8e4e6d0..22bceeeba4bc 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -107,16 +107,15 @@ struct lowcore {
 	__u64	async_stack;			/* 0x0350 */
 	__u64	nodat_stack;			/* 0x0358 */
 	__u64	restart_stack;			/* 0x0360 */
-
+	__u64	mcck_stack;			/* 0x0368 */
 	/* Restart function and parameter. */
-	__u64	restart_fn;			/* 0x0368 */
-	__u64	restart_data;			/* 0x0370 */
-	__u64	restart_source;			/* 0x0378 */
+	__u64	restart_fn;			/* 0x0370 */
+	__u64	restart_data;			/* 0x0378 */
+	__u64	restart_source;			/* 0x0380 */

 	/* Address space pointer. */
-	__u64	kernel_asce;			/* 0x0380 */
-	__u64	user_asce;			/* 0x0388 */
-	__u8	pad_0x0390[0x0398-0x0390];	/* 0x0390 */
+	__u64	kernel_asce;			/* 0x0388 */
+	__u64	user_asce;			/* 0x0390 */

 	/*
 	 * The lpp and current_pid fields form a
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index d22bb28ef50c..15e637728a4b 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -118,6 +118,7 @@ int main(void)
 	OFFSET(__LC_ASYNC_STACK, lowcore, async_stack);
 	OFFSET(__LC_NODAT_STACK, lowcore, nodat_stack);
 	OFFSET(__LC_RESTART_STACK, lowcore, restart_stack);
+	OFFSET(__LC_MCCK_STACK, lowcore, mcck_stack);
 	OFFSET(__LC_RESTART_FN, lowcore, restart_fn);
 	OFFSET(__LC_RESTART_DATA, lowcore, restart_data);
 	OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source);
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index ed5acf95235f..f7953bb17558 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -70,6 +70,8 @@ _LPP_OFFSET	= __LC_LPP
 	je	\oklabel
 	clg	%r14,__LC_ASYNC_STACK
 	je	\oklabel
+	clg	%r14,__LC_MCCK_STACK
+	je	\oklabel
 	clg	%r14,__LC_NODAT_STACK
 	je	\oklabel
 	clg	%r14,__LC_RESTART_STACK
@@ -548,20 +550,16 @@ ENTRY(mcck_int_handler)
 	jhe	.Lmcck_stack
 	lghi	%r11,__LC_GPREGS_SAVE_AREA+64	# inside critical section, do cleanup
 	brasl	%r14,.Lcleanup_sie
-.Lmcck_stack:
 #endif
-	CHECK_STACK __LC_GPREGS_SAVE_AREA+64
-	lgr	%r11,%r15
-	aghi	%r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE)
-	stg	%r11,__SF_BACKCHAIN(%r15)
-	j	5f
+	j	.Lmcck_stack
 .Lmcck_user:
 	BPENTER	__TI_flags(%r12),_TIF_ISOLATE_BP
+.Lmcck_stack:
+	lg	%r15,__LC_MCCK_STACK
+.Lmcck_skip:
+	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE
-	lg	%r15,__LC_KERNEL_STACK
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
-5:	la	%r11,STACK_FRAME_OVERHEAD(%r15)
-.Lmcck_skip:
 	lghi	%r14,__LC_GPREGS_SAVE_AREA+64
 	stmg	%r0,%r7,__PT_R0(%r11)
 	# clear user controlled registers to prevent speculative use
@@ -602,7 +600,6 @@ ENTRY(mcck_int_handler)

 .Lmcck_panic:
 	lg	%r15,__LC_NODAT_STACK
-	la	%r11,STACK_FRAME_OVERHEAD(%r15)
 	j	.Lmcck_skip
 ENDPROC(mcck_int_handler)
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 6b004940c4dc..60da976eee6f 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -338,7 +338,7 @@ int __init arch_early_irq_init(void)
 	return 0;
 }

-static int __init async_stack_realloc(void)
+static int __init stack_realloc(void)
 {
 	unsigned long old, new;

@@ -348,9 +348,16 @@ static int __init async_stack_realloc(void)
 		panic("Couldn't allocate async stack");
 	WRITE_ONCE(S390_lowcore.async_stack, new + STACK_INIT_OFFSET);
 	free_pages(old, THREAD_SIZE_ORDER);
+
+	old = S390_lowcore.mcck_stack - STACK_INIT_OFFSET;
+	new = stack_alloc();
+	if (!new)
+		panic("Couldn't allocate machine check stack");
+	WRITE_ONCE(S390_lowcore.mcck_stack, new + STACK_INIT_OFFSET);
+	memblock_free(old, THREAD_SIZE);
 	return 0;
 }
-early_initcall(async_stack_realloc);
+early_initcall(stack_realloc);

 void __init arch_call_rest_init(void)
 {
@@ -372,6 +379,7 @@ void __init arch_call_rest_init(void)
 static void __init setup_lowcore_dat_off(void)
 {
 	unsigned long int_psw_mask = PSW_KERNEL_BITS;
+	unsigned long mcck_stack;
 	struct lowcore *lc;

 	if (IS_ENABLED(CONFIG_KASAN))
@@ -439,6 +447,12 @@ static void __init setup_lowcore_dat_off(void)
 	lc->restart_data = 0;
 	lc->restart_source = -1UL;

+	mcck_stack = (unsigned long)memblock_alloc(THREAD_SIZE, THREAD_SIZE);
+	if (!mcck_stack)
+		panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+		      __func__, THREAD_SIZE, THREAD_SIZE);
+	lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
+
 	/* Setup absolute zero lowcore */
 	mem_assign_absolute(S390_lowcore.restart_stack, lc->restart_stack);
 	mem_assign_absolute(S390_lowcore.restart_fn, lc->restart_fn);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index c5abbb94ac6e..e299892440b6 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -189,7 +189,7 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)

 static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 {
-	unsigned long async_stack, nodat_stack;
+	unsigned long async_stack, nodat_stack, mcck_stack;
 	struct lowcore *lc;

 	if (pcpu != &pcpu_devices[0]) {
@@ -202,13 +202,15 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 		nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
 	}
 	async_stack = stack_alloc();
-	if (!async_stack)
-		goto out;
+	mcck_stack = stack_alloc();
+	if (!async_stack || !mcck_stack)
+		goto out_stack;
 	lc = pcpu->lowcore;
 	memcpy(lc, &S390_lowcore, 512);
 	memset((char *) lc + 512, 0, sizeof(*lc) - 512);
 	lc->async_stack = async_stack + STACK_INIT_OFFSET;
 	lc->nodat_stack = nodat_stack + STACK_INIT_OFFSET;
+	lc->mcck_stack = mcck_stack + STACK_INIT_OFFSET;
 	lc->cpu_nr = cpu;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
 	lc->spinlock_index = 0;
@@ -216,12 +218,13 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 	lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW);
 	lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW);
 	if (nmi_alloc_per_cpu(lc))
-		goto out_async;
+		goto out_stack;
 	lowcore_ptr[cpu] = lc;
 	pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc);
 	return 0;

-out_async:
+out_stack:
+	stack_free(mcck_stack);
 	stack_free(async_stack);
 out:
 	if (pcpu != &pcpu_devices[0]) {
@@ -233,16 +236,18 @@ out:

 static void pcpu_free_lowcore(struct pcpu *pcpu)
 {
-	unsigned long async_stack, nodat_stack, lowcore;
+	unsigned long async_stack, nodat_stack, mcck_stack, lowcore;

 	nodat_stack = pcpu->lowcore->nodat_stack - STACK_INIT_OFFSET;
 	async_stack = pcpu->lowcore->async_stack - STACK_INIT_OFFSET;
+	mcck_stack = pcpu->lowcore->mcck_stack - STACK_INIT_OFFSET;
 	lowcore = (unsigned long) pcpu->lowcore;

 	pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
 	lowcore_ptr[pcpu - pcpu_devices] = NULL;
 	nmi_free_per_cpu(pcpu->lowcore);
 	stack_free(async_stack);
+	stack_free(mcck_stack);
 	if (pcpu == &pcpu_devices[0])
 		return;
 	free_pages(nodat_stack, THREAD_SIZE_ORDER);
-- cgit v1.2.3

From 26521412ae22d06caab98721757b2721c6d7c46c Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Wed, 3 Feb 2021 09:16:45 +0100
Subject: s390: fix kernel asce loading when sie is interrupted

If a machine check comes in during sie, the CPU saves the control
registers to the machine check save area. Afterwards mcck_int_handler is
called, which loads __LC_KERNEL_ASCE into %cr1. Later the code restores
%cr1 from the machine check save area, but that is wrong when SIE was
interrupted, because the save area still contains the gmap asce. Instead,
the handler should return with __LC_KERNEL_ASCE in %cr1 when SIE was
interrupted, or with the previous %cr1 contents saved in the machine
check save area otherwise.

Fixes: 87d598634521 ("s390/mm: remove set_fs / rework address space handling")
Signed-off-by: Sven Schnelle
Cc:  # v5.8+
Reviewed-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/entry.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index f7953bb17558..377294969954 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -558,6 +558,7 @@ ENTRY(mcck_int_handler)
 	lg	%r15,__LC_MCCK_STACK
 .Lmcck_skip:
 	la	%r11,STACK_FRAME_OVERHEAD(%r15)
+	stctg	%c1,%c1,__PT_CR1(%r11)
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	lghi	%r14,__LC_GPREGS_SAVE_AREA+64
@@ -573,8 +574,6 @@ ENTRY(mcck_int_handler)
 	xgr	%r10,%r10
 	mvc	__PT_R8(64,%r11),0(%r14)
 	stmg	%r8,%r9,__PT_PSW(%r11)
-	la	%r14,4095
-	mvc	__PT_CR1(8,%r11),__LC_CREGS_SAVE_AREA-4095+8(%r14)
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	lgr	%r2,%r11			# pass pointer to pt_regs
-- cgit v1.2.3

From 33ea04872da15ea8236f92da6009af5a1b0af641 Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Wed, 3 Feb 2021 17:46:12 +0100
Subject: s390: use r13 in cleanup_sie as temp register

Instead of clobbering r11, which is normally our pointer to struct
pt_regs on the stack, use r13 as the temporary register in the BR_EX
macro. r13 is already used in cleanup_sie, so there is no need to
clobber another register.
Signed-off-by: Sven Schnelle
Reviewed-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/entry.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 377294969954..d1236a9f73b2 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -130,7 +130,7 @@ _LPP_OFFSET	= __LC_LPP
 	.endm

 	GEN_BR_THUNK %r14
-	GEN_BR_THUNK %r14,%r11
+	GEN_BR_THUNK %r14,%r13

 	.section .kprobes.text, "ax"
 .Ldummy:
@@ -665,7 +665,7 @@ ENDPROC(stack_overflow)
 	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE
 	larl	%r9,sie_exit			# skip forward to sie_exit
-	BR_EX	%r14,%r11
+	BR_EX	%r14,%r13
 #endif

 	.section .rodata, "a"
-- cgit v1.2.3

From efa54735905c03bf876b4451cfaef6b45046bc53 Mon Sep 17 00:00:00 2001
From: Sven Schnelle
Date: Wed, 3 Feb 2021 17:50:00 +0100
Subject: s390: split cleanup_sie

The current code uses the address in %r11 to figure out whether it was
called from the machine check handler or from a normal interrupt
handler. Instead of relying on this implicit logic, which is mostly a
leftover from the old critical cleanup approach, just add a second label
and use that.

Signed-off-by: Sven Schnelle
Reviewed-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/entry.S | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index d1236a9f73b2..c10b9f31eef7 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -214,7 +214,7 @@ ENTRY(sie64a)
 # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable.
 # Other instructions between sie64a and .Lsie_done should not cause program
 # interrupts. So lets use 3 nops as a landing pad for all possible rewinds.
-# See also .Lcleanup_sie
+# See also .Lcleanup_sie_mcck/.Lcleanup_sie_int
 .Lrewind_pad6:
 	nopr	7
 .Lrewind_pad4:
@@ -398,8 +398,7 @@ ENTRY(\name)
 	lghi	%r13,.Lsie_done - .Lsie_gmap
 	clgr	%r14,%r13
 	jhe	0f
-	lghi	%r11,__LC_SAVE_AREA_ASYNC	# inside critical section, do cleanup
-	brasl	%r14,.Lcleanup_sie
+	brasl	%r14,.Lcleanup_sie_int
 #endif
 0:	CHECK_STACK __LC_SAVE_AREA_ASYNC
 	lgr	%r11,%r15
@@ -548,8 +547,7 @@ ENTRY(mcck_int_handler)
 	lghi	%r13,.Lsie_done - .Lsie_gmap
 	clgr	%r14,%r13
 	jhe	.Lmcck_stack
-	lghi	%r11,__LC_GPREGS_SAVE_AREA+64	# inside critical section, do cleanup
-	brasl	%r14,.Lcleanup_sie
+	brasl	%r14,.Lcleanup_sie_mcck
 #endif
 	j	.Lmcck_stack
 .Lmcck_user:
@@ -651,16 +649,15 @@ ENDPROC(stack_overflow)
 #endif

 #if IS_ENABLED(CONFIG_KVM)
-.Lcleanup_sie:
-	cghi	%r11,__LC_SAVE_AREA_ASYNC	#Is this in normal interrupt?
-	je	1f
+.Lcleanup_sie_mcck:
 	larl	%r13,.Lsie_entry
 	slgr	%r9,%r13
 	larl	%r13,.Lsie_skip
 	clgr	%r9,%r13
-	jh	1f
+	jh	.Lcleanup_sie_int
 	oi	__LC_CPU_FLAGS+7, _CIF_MCCK_GUEST
-1:	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
+.Lcleanup_sie_int:
+	BPENTER	__SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST)
 	lg	%r9,__SF_SIE_CONTROL(%r15)	# get control block pointer
 	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_KERNEL_ASCE
-- cgit v1.2.3

From 530f639f1efe076df8d56719ab45eb7203175ecf Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Mon, 8 Feb 2021 13:56:49 +0100
Subject: s390/time: rename store_tod_clock_ext() and use union tod_clock

Rename store_tod_clock_ext() to store_tod_clock_ext_cc() to reflect that
it returns a condition code, and use union tod_clock as its parameter.
Signed-off-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/include/asm/timex.h | 5 ++---
 arch/s390/kernel/early.c      | 6 +++---
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 271a1e12cc73..24964579684b 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -53,16 +53,15 @@ static inline int set_tod_clock(__u64 time)
 	return cc;
 }

-static inline int store_tod_clock_ext(char *time)
+static inline int store_tod_clock_ext_cc(union tod_clock *clk)
 {
-	typedef struct { char _[STORE_CLOCK_EXT_SIZE]; } addrtype;
 	int cc;

 	asm volatile(
 		"	stcke	%1\n"
 		"	ipm	%0\n"
 		"	srl	%0,28\n"
-		: "=d" (cc), "=Q" (*(addrtype *)time) : : "cc");
+		: "=d" (cc), "=Q" (*clk) : : "cc");
 	return cc;
 }

diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index ee063b56b5d1..8f046f985d8e 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -35,12 +35,12 @@
 static void __init reset_tod_clock(void)
 {
-	char time[STORE_CLOCK_EXT_SIZE];
+	union tod_clock clk;

-	if (store_tod_clock_ext(time) == 0)
+	if (store_tod_clock_ext_cc(&clk) == 0)
 		return;
 	/* TOD clock not running. Set the clock to Unix Epoch. */
-	if (set_tod_clock(TOD_UNIX_EPOCH) != 0 || store_tod_clock_ext(time) != 0)
+	if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk))
 		disabled_wait();

 	memset(tod_clock_base, 0, 16);
-- cgit v1.2.3

From f8d8977a3d971011ab04e4569a664628bd03935e Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Mon, 8 Feb 2021 16:06:10 +0100
Subject: s390/time: convert tod_clock_base to union

Convert tod_clock_base to union tod_clock. This simplifies quite a bit
of code and also fixes a bug in read_persistent_clock64():

void read_persistent_clock64(struct timespec64 *ts)
{
	unsigned char clk[STORE_CLOCK_EXT_SIZE];
	__u64 delta;

	delta = initial_leap_seconds + TOD_UNIX_EPOCH;
	get_tod_clock_ext(clk);
	*(__u64 *) &clk[1] -= delta;
	if (*(__u64 *) &clk[1] > delta)
		clk[0]--;
	ext_to_timespec64(clk, ts);
}

Assume *(__u64 *) &clk[1] == 5 and delta == 2: the subtraction yields 3,
the condition "3 > 2" is true, and the epoch part of the clock is
decremented by one because of an assumed overflow, even though there is
none.
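To see the spurious borrow in isolation, the broken check can be
reproduced in a few lines of plain C (a standalone sketch with made-up
values, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t low = 5, delta = 2;	/* no borrow: low >= delta */
	uint64_t epoch = 1;		/* stands in for clk[0] */

	low -= delta;			/* 5 - 2 = 3 */
	if (low > delta)		/* wrong test: 3 > 2 is true */
		epoch--;		/* spurious epoch decrement */
	printf("epoch = %llu (expected 1)\n", (unsigned long long)epoch);

	/*
	 * A borrow actually happened iff the result is larger than the
	 * minuend, which 128-bit arithmetic (clk.eitod -= delta) gets
	 * right without any explicit check.
	 */
	return 0;
}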
Fix this by using 128-bit arithmetic and letting the compiler do the
right thing:

void read_persistent_clock64(struct timespec64 *ts)
{
	union tod_clock clk;
	u64 delta;

	delta = initial_leap_seconds + TOD_UNIX_EPOCH;
	store_tod_clock_ext(&clk);
	clk.eitod -= delta;
	ext_to_timespec64(&clk, ts);
}

Signed-off-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/include/asm/timex.h        |  4 +--
 arch/s390/kernel/early.c             |  6 +++---
 arch/s390/kernel/perf_cpum_cf_diag.c |  2 +-
 arch/s390/kernel/perf_cpum_sf.c      |  2 +-
 arch/s390/kernel/time.c              | 50 ++++++++++++------------------------
 5 files changed, 24 insertions(+), 40 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h
index 7bfdcae34515..dbd7b44bc9da 100644
--- a/arch/s390/include/asm/timex.h
+++ b/arch/s390/include/asm/timex.h
@@ -208,7 +208,7 @@ static inline cycles_t get_cycles(void)
 int get_phys_clock(unsigned long *clock);
 void init_cpu_timer(void);

-extern unsigned char tod_clock_base[16] __aligned(8);
+extern union tod_clock tod_clock_base;

 /**
  * get_clock_monotonic - returns current time in clock rate units
@@ -222,7 +222,7 @@ static inline unsigned long long get_tod_clock_monotonic(void)
 	unsigned long long tod;

 	preempt_disable_notrace();
-	tod = get_tod_clock() - *(unsigned long long *) &tod_clock_base[1];
+	tod = get_tod_clock() - tod_clock_base.tod;
 	preempt_enable_notrace();
 	return tod;
 }
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 8f046f985d8e..a361d2e70025 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -43,8 +43,8 @@ static void __init reset_tod_clock(void)
 	if (set_tod_clock(TOD_UNIX_EPOCH) || store_tod_clock_ext_cc(&clk))
 		disabled_wait();

-	memset(tod_clock_base, 0, 16);
-	*(__u64 *) &tod_clock_base[1] = TOD_UNIX_EPOCH;
+	memset(&tod_clock_base, 0, sizeof(tod_clock_base));
+	tod_clock_base.tod = TOD_UNIX_EPOCH;
 	S390_lowcore.last_update_clock = TOD_UNIX_EPOCH;
 }

@@ -230,7 +230,7 @@ static __init void detect_machine_facilities(void)
 	}
 	if (test_facility(133))
 		S390_lowcore.machine_flags |= MACHINE_FLAG_GS;
-	if (test_facility(139) && (tod_clock_base[1] & 0x80)) {
+	if (test_facility(139) && (tod_clock_base.tod >> 63)) {
 		/* Enabled signed clock comparator comparisons */
 		S390_lowcore.machine_flags |= MACHINE_FLAG_SCC;
 		clock_comparator_max = -1ULL >> 1;
diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c
index 6f6b3382edb7..b5c86fb70d63 100644
--- a/arch/s390/kernel/perf_cpum_cf_diag.c
+++ b/arch/s390/kernel/perf_cpum_cf_diag.c
@@ -118,7 +118,7 @@ static void cf_diag_trailer(struct cf_trailer_entry *te)
 	if (te->cpu_speed)
 		te->speed = 1;
 	te->clock_base = 1;		/* Save clock base */
-	memcpy(&te->tod_base, &tod_clock_base[1], 8);
+	te->tod_base = tod_clock_base.tod;
 	te->timestamp = get_tod_clock_fast();
 }
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index 19cd7b961c45..db62def4ef28 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1682,7 +1682,7 @@ static void aux_sdb_init(unsigned long sdb)

 	/* Save clock base */
 	te->clock_base = 1;
-	memcpy(&te->progusage2, &tod_clock_base[1], 8);
+	te->progusage2 = tod_clock_base.tod;
 }

 /*
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index c59cb44fbb7d..06bcfa636638 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -55,11 +55,7 @@
 #include
 #include "entry.h"

-unsigned char tod_clock_base[16] __aligned(8) = {
-	/* Force to data section. */
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
-};
+union tod_clock tod_clock_base __section(".data");
 EXPORT_SYMBOL_GPL(tod_clock_base);

 u64 clock_comparator_max = -1ULL;
@@ -86,7 +82,7 @@ void __init time_early_init(void)
 	struct ptff_qui qui;

 	/* Initialize TOD steering parameters */
-	tod_steering_end = *(unsigned long long *) &tod_clock_base[1];
+	tod_steering_end = tod_clock_base.tod;
 	vdso_data->arch_data.tod_steering_end = tod_steering_end;

 	if (!test_facility(28))
@@ -113,18 +109,13 @@ unsigned long long notrace sched_clock(void)
 }
 NOKPROBE_SYMBOL(sched_clock);

-static void ext_to_timespec64(unsigned char *clk, struct timespec64 *xt)
+static void ext_to_timespec64(union tod_clock *clk, struct timespec64 *xt)
 {
-	unsigned long long high, low, rem, sec, nsec;
+	unsigned long rem, sec, nsec;

-	/* Split extendnd TOD clock to micro-seconds and sub-micro-seconds */
-	high = (*(unsigned long long *) clk) >> 4;
-	low = (*(unsigned long long *)&clk[7]) << 4;
-	/* Calculate seconds and nano-seconds */
-	sec = high;
+	sec = clk->us;
 	rem = do_div(sec, 1000000);
-	nsec = (((low >> 32) + (rem << 32)) * 1000) >> 32;
-
+	nsec = ((clk->sus + (rem << 12)) * 125) >> 9;
 	xt->tv_sec = sec;
 	xt->tv_nsec = nsec;
 }
@@ -204,30 +195,26 @@ static void stp_reset(void);

 void read_persistent_clock64(struct timespec64 *ts)
 {
-	unsigned char clk[STORE_CLOCK_EXT_SIZE];
-	__u64 delta;
+	union tod_clock clk;
+	u64 delta;

 	delta = initial_leap_seconds + TOD_UNIX_EPOCH;
-	get_tod_clock_ext(clk);
-	*(__u64 *) &clk[1] -= delta;
-	if (*(__u64 *) &clk[1] > delta)
-		clk[0]--;
-	ext_to_timespec64(clk, ts);
+	store_tod_clock_ext(&clk);
+	clk.eitod -= delta;
+	ext_to_timespec64(&clk, ts);
 }

 void __init read_persistent_wall_and_boot_offset(struct timespec64 *wall_time,
 						 struct timespec64 *boot_offset)
 {
-	unsigned char clk[STORE_CLOCK_EXT_SIZE];
 	struct timespec64 boot_time;
-	__u64 delta;
+	union tod_clock clk;
+	u64 delta;

 	delta = initial_leap_seconds + TOD_UNIX_EPOCH;
-	memcpy(clk, tod_clock_base, STORE_CLOCK_EXT_SIZE);
-	*(__u64 *)&clk[1] -= delta;
-	if (*(__u64 *)&clk[1] > delta)
-		clk[0]--;
-	ext_to_timespec64(clk, &boot_time);
+	clk = tod_clock_base;
+	clk.eitod -= delta;
+	ext_to_timespec64(&clk, &boot_time);

 	read_persistent_clock64(wall_time);
 	*boot_offset = timespec64_sub(*wall_time, boot_time);
@@ -381,10 +368,7 @@ static void clock_sync_global(unsigned long long delta)
 	struct ptff_qto qto;

 	/* Fixup the monotonic sched clock. */
-	*(unsigned long long *) &tod_clock_base[1] += delta;
-	if (*(unsigned long long *) &tod_clock_base[1] < delta)
-		/* Epoch overflow */
-		tod_clock_base[0]++;
+	tod_clock_base.eitod += delta;

 	/* Adjust TOD steering parameters. */
 	now = get_tod_clock();
 	adj = tod_steering_end - now;
-- cgit v1.2.3

From 169ceac42926155870e7ad8165f01ab15caac17a Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Mon, 8 Feb 2021 16:16:28 +0100
Subject: s390/vdso: use union tod_clock

Signed-off-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/vdso64/getcpu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/vdso64/getcpu.c b/arch/s390/kernel/vdso64/getcpu.c
index 5b2bc7494d5b..5c5d4a848b76 100644
--- a/arch/s390/kernel/vdso64/getcpu.c
+++ b/arch/s390/kernel/vdso64/getcpu.c
@@ -8,12 +8,12 @@

 int __s390_vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
 {
-	__u16 todval[8];
+	union tod_clock clk;

 	/* CPU number is stored in the programmable field of the TOD clock */
-	get_tod_clock_ext((char *)todval);
+	store_tod_clock_ext(&clk);
 	if (cpu)
-		*cpu = todval[7];
+		*cpu = clk.pf;
 	/* NUMA node is always zero */
 	if (node)
 		*node = 0;
-- cgit v1.2.3

From d1deda6f2b238bfcd3a4521b3221974443416342 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Mon, 8 Feb 2021 16:32:27 +0100
Subject: s390/debug: use union tod_clock

Signed-off-by: Heiko Carstens
Signed-off-by: Vasily Gorbik
---
 arch/s390/kernel/debug.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/s390/kernel')

diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c
index b6619ae9a3e0..bb958d32bd81 100644
--- a/arch/s390/kernel/debug.c
+++ b/arch/s390/kernel/debug.c
@@ -829,11 +829,11 @@ static inline debug_entry_t *get_active_entry(debug_info_t *id)
 static inline void debug_finish_entry(debug_info_t *id, debug_entry_t *active,
 				      int level, int exception)
 {
-	unsigned char clk[STORE_CLOCK_EXT_SIZE];
 	unsigned long timestamp;
+	union tod_clock clk;

-	get_tod_clock_ext(clk);
-	timestamp = *(unsigned long *) &clk[0] >> 4;
+	store_tod_clock_ext(&clk);
+	timestamp = clk.us;
 	timestamp -= TOD_UNIX_EPOCH >> 12;
 	active->clock = timestamp;
 	active->cpu = smp_processor_id();
-- cgit v1.2.3
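All of these conversions lean on union tod_clock giving named access to
the 16-byte STCKE result: judging by the usage in the patches above,
clk.tod is the 64-bit TOD value, clk.us and clk.sus split it into
microseconds and 1/4096-microsecond units, clk.eitod spans the epoch
index plus TOD for 128-bit arithmetic, and clk.pf is the programmable
field read by the vDSO. The union's actual definition lives in
arch/s390/include/asm/timex.h and is not part of these patches. As a
sanity check on the ext_to_timespec64() arithmetic, the following
standalone C program (not kernel code) shows that the shift-based
nanosecond formula equals the exact conversion, since one microsecond is
4096 sub-microsecond units and 1000/4096 reduces to 125/512 = 125 >> 9:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/*
	 * nsec = (sus + rem * 4096) * 1000 / 4096 must match the
	 * kernel's ((clk->sus + (rem << 12)) * 125) >> 9 for every
	 * leftover microsecond count rem < 1000000 and 12-bit sus.
	 */
	for (uint64_t rem = 0; rem < 1000000; rem += 999) {
		for (uint64_t sus = 0; sus < 4096; sus += 41) {
			uint64_t a = ((sus + (rem << 12)) * 125) >> 9;
			uint64_t b = (sus + (rem << 12)) * 1000 / 4096;

			if (a != b) {
				printf("mismatch at rem=%llu sus=%llu\n",
				       (unsigned long long)rem,
				       (unsigned long long)sus);
				return 1;
			}
		}
	}
	puts("nsec formula matches (sus + rem*4096) * 1000 / 4096");
	return 0;
}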