diff options
Diffstat (limited to 'arch/s390')
55 files changed, 1971 insertions, 1651 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 07b2328f29e0..a0e2130f0100 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -163,7 +163,6 @@ config S390 select HAVE_GCC_PLUGINS select HAVE_GENERIC_VDSO select HAVE_IOREMAP_PROT if PCI - select HAVE_IRQ_EXIT_ON_IRQ_STACK select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 @@ -438,6 +437,7 @@ config COMPAT select COMPAT_OLD_SIGACTION select HAVE_UID16 depends on MULTIUSER + depends on !CC_IS_CLANG help Select this option if you want to enable your system kernel to handle system-calls from ELF binaries for 31 bit ESA. This option diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 098abe3a56f3..95c75e653e43 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -166,6 +166,19 @@ archheaders: archprepare: $(Q)$(MAKE) $(build)=$(syscalls) kapi $(Q)$(MAKE) $(build)=$(tools) kapi +ifeq ($(KBUILD_EXTMOD),) +# We need to generate vdso-offsets.h before compiling certain files in kernel/. +# In order to do that, we should use the archprepare target, but we can't since +# asm-offsets.h is included in some files used to generate vdso-offsets.h, and +# asm-offsets.h is built in prepare0, for which archprepare is a dependency. +# Therefore we need to generate the header after prepare0 has been made, hence +# this hack. +prepare: vdso_prepare +vdso_prepare: prepare0 + $(Q)$(MAKE) $(build)=arch/s390/kernel/vdso64 include/generated/vdso64-offsets.h + $(if $(CONFIG_COMPAT),$(Q)$(MAKE) \ + $(build)=arch/s390/kernel/vdso32 include/generated/vdso32-offsets.h) +endif # Don't use tabs in echo arguments define archhelp diff --git a/arch/s390/boot/startup.c b/arch/s390/boot/startup.c index bbe4df6c2f8b..d0cf21641e3a 100644 --- a/arch/s390/boot/startup.c +++ b/arch/s390/boot/startup.c @@ -23,6 +23,7 @@ unsigned long __bootdata_preserved(vmemmap_size); unsigned long __bootdata_preserved(MODULES_VADDR); unsigned long __bootdata_preserved(MODULES_END); unsigned long __bootdata(ident_map_size); +int __bootdata(is_full_image) = 1; u64 __bootdata_preserved(stfle_fac_list[16]); u64 __bootdata_preserved(alt_stfle_fac_list[16]); diff --git a/arch/s390/boot/uv.c b/arch/s390/boot/uv.c index 82b99b916243..f6b0c4f43c99 100644 --- a/arch/s390/boot/uv.c +++ b/arch/s390/boot/uv.c @@ -36,6 +36,7 @@ void uv_query_info(void) uv_info.max_sec_stor_addr = ALIGN(uvcb.max_guest_stor_addr, PAGE_SIZE); uv_info.max_num_sec_conf = uvcb.max_num_sec_conf; uv_info.max_guest_cpu_id = uvcb.max_guest_cpu_id; + uv_info.uv_feature_indications = uvcb.uv_feature_indications; } #ifdef CONFIG_PROTECTED_VIRTUALIZATION_GUEST diff --git a/arch/s390/include/asm/ap.h b/arch/s390/include/asm/ap.h index 837d1699b109..3afbee21dc1f 100644 --- a/arch/s390/include/asm/ap.h +++ b/arch/s390/include/asm/ap.h @@ -53,18 +53,20 @@ struct ap_queue_status { */ static inline bool ap_instructions_available(void) { - register unsigned long reg0 asm ("0") = AP_MKQID(0, 0); - register unsigned long reg1 asm ("1") = 0; - register unsigned long reg2 asm ("2") = 0; + unsigned long reg0 = AP_MKQID(0, 0); + unsigned long reg1 = 0; asm volatile( - " .long 0xb2af0000\n" /* PQAP(TAPQ) */ - "0: la %0,1\n" + " lgr 0,%[reg0]\n" /* qid into gr0 */ + " lghi 1,0\n" /* 0 into gr1 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .long 0xb2af0000\n" /* PQAP(TAPQ) */ + "0: la %[reg1],1\n" /* 1 into reg1 */ "1:\n" EX_TABLE(0b, 1b) - : "+d" (reg1), "+d" (reg2) - : "d" (reg0) - : "cc"); + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); return reg1 != 0; } @@ -77,14 +79,18 @@ static inline bool ap_instructions_available(void) */ static inline struct ap_queue_status ap_tapq(ap_qid_t qid, unsigned long *info) { - register unsigned long reg0 asm ("0") = qid; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm ("2"); - - asm volatile(".long 0xb2af0000" /* PQAP(TAPQ) */ - : "=d" (reg1), "=d" (reg2) - : "d" (reg0) - : "cc"); + struct ap_queue_status reg1; + unsigned long reg2; + + asm volatile( + " lgr 0,%[qid]\n" /* qid into gr0 */ + " lghi 2,0\n" /* 0 into gr2 */ + " .long 0xb2af0000\n" /* PQAP(TAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 into reg2 */ + : [reg1] "=&d" (reg1), [reg2] "=&d" (reg2) + : [qid] "d" (qid) + : "cc", "0", "1", "2"); if (info) *info = reg2; return reg1; @@ -115,14 +121,16 @@ static inline struct ap_queue_status ap_test_queue(ap_qid_t qid, */ static inline struct ap_queue_status ap_rapq(ap_qid_t qid) { - register unsigned long reg0 asm ("0") = qid | (1UL << 24); - register struct ap_queue_status reg1 asm ("1"); + unsigned long reg0 = qid | (1UL << 24); /* fc 1UL is RAPQ */ + struct ap_queue_status reg1; asm volatile( - ".long 0xb2af0000" /* PQAP(RAPQ) */ - : "=d" (reg1) - : "d" (reg0) - : "cc"); + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .long 0xb2af0000\n" /* PQAP(RAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1) + : [reg0] "d" (reg0) + : "cc", "0", "1"); return reg1; } @@ -134,14 +142,16 @@ static inline struct ap_queue_status ap_rapq(ap_qid_t qid) */ static inline struct ap_queue_status ap_zapq(ap_qid_t qid) { - register unsigned long reg0 asm ("0") = qid | (2UL << 24); - register struct ap_queue_status reg1 asm ("1"); + unsigned long reg0 = qid | (2UL << 24); /* fc 2UL is ZAPQ */ + struct ap_queue_status reg1; asm volatile( - ".long 0xb2af0000" /* PQAP(ZAPQ) */ - : "=d" (reg1) - : "d" (reg0) - : "cc"); + " lgr 0,%[reg0]\n" /* qid arg into gr0 */ + " .long 0xb2af0000\n" /* PQAP(ZAPQ) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "=&d" (reg1) + : [reg0] "d" (reg0) + : "cc", "0", "1"); return reg1; } @@ -172,18 +182,20 @@ struct ap_config_info { */ static inline int ap_qci(struct ap_config_info *config) { - register unsigned long reg0 asm ("0") = 4UL << 24; - register unsigned long reg1 asm ("1") = -EOPNOTSUPP; - register struct ap_config_info *reg2 asm ("2") = config; + unsigned long reg0 = 4UL << 24; /* fc 4UL is QCI */ + unsigned long reg1 = -EOPNOTSUPP; + struct ap_config_info *reg2 = config; asm volatile( - ".long 0xb2af0000\n" /* PQAP(QCI) */ - "0: la %0,0\n" + " lgr 0,%[reg0]\n" /* QCI fc into gr0 */ + " lgr 2,%[reg2]\n" /* ptr to config into gr2 */ + " .long 0xb2af0000\n" /* PQAP(QCI) */ + "0: la %[reg1],0\n" /* good case, QCI fc available */ "1:\n" EX_TABLE(0b, 1b) - : "+d" (reg1) - : "d" (reg0), "d" (reg2) - : "cc", "memory"); + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "memory", "0", "2"); return reg1; } @@ -220,21 +232,25 @@ static inline struct ap_queue_status ap_aqic(ap_qid_t qid, struct ap_qirq_ctrl qirqctrl, void *ind) { - register unsigned long reg0 asm ("0") = qid | (3UL << 24); - register union { + unsigned long reg0 = qid | (3UL << 24); /* fc 3UL is AQIC */ + union { unsigned long value; struct ap_qirq_ctrl qirqctrl; struct ap_queue_status status; - } reg1 asm ("1"); - register void *reg2 asm ("2") = ind; + } reg1; + void *reg2 = ind; reg1.qirqctrl = qirqctrl; asm volatile( - ".long 0xb2af0000" /* PQAP(AQIC) */ - : "+d" (reg1) - : "d" (reg0), "d" (reg2) - : "cc"); + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* irq ctrl into gr1 */ + " lgr 2,%[reg2]\n" /* ni addr into gr2 */ + " .long 0xb2af0000\n" /* PQAP(AQIC) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg1] "+&d" (reg1) + : [reg0] "d" (reg0), [reg2] "d" (reg2) + : "cc", "0", "1", "2"); return reg1.status; } @@ -268,21 +284,24 @@ union ap_qact_ap_info { static inline struct ap_queue_status ap_qact(ap_qid_t qid, int ifbit, union ap_qact_ap_info *apinfo) { - register unsigned long reg0 asm ("0") = qid | (5UL << 24) - | ((ifbit & 0x01) << 22); - register union { + unsigned long reg0 = qid | (5UL << 24) | ((ifbit & 0x01) << 22); + union { unsigned long value; struct ap_queue_status status; - } reg1 asm ("1"); - register unsigned long reg2 asm ("2"); + } reg1; + unsigned long reg2; reg1.value = apinfo->val; asm volatile( - ".long 0xb2af0000" /* PQAP(QACT) */ - : "+d" (reg1), "=d" (reg2) - : "d" (reg0) - : "cc"); + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lgr 1,%[reg1]\n" /* qact in info into gr1 */ + " .long 0xb2af0000\n" /* PQAP(QACT) */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* qact out info into reg2 */ + : [reg1] "+&d" (reg1), [reg2] "=&d" (reg2) + : [reg0] "d" (reg0) + : "cc", "0", "1", "2"); apinfo->val = reg2; return reg1.status; } @@ -303,19 +322,24 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, unsigned long long psmid, void *msg, size_t length) { - register unsigned long reg0 asm ("0") = qid | 0x40000000UL; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm ("2") = (unsigned long) msg; - register unsigned long reg3 asm ("3") = (unsigned long) length; - register unsigned long reg4 asm ("4") = (unsigned int) (psmid >> 32); - register unsigned long reg5 asm ("5") = psmid & 0xffffffff; + unsigned long reg0 = qid | 0x40000000UL; /* 0x4... is last msg part */ + union register_pair nqap_r1, nqap_r2; + struct ap_queue_status reg1; + + nqap_r1.even = (unsigned int)(psmid >> 32); + nqap_r1.odd = psmid & 0xffffffff; + nqap_r2.even = (unsigned long)msg; + nqap_r2.odd = (unsigned long)length; asm volatile ( - "0: .long 0xb2ad0042\n" /* NQAP */ - " brc 2,0b" - : "+d" (reg0), "=d" (reg1), "+d" (reg2), "+d" (reg3) - : "d" (reg4), "d" (reg5) - : "cc", "memory"); + " lgr 0,%[reg0]\n" /* qid param in gr0 */ + "0: .insn rre,0xb2ad0000,%[nqap_r1],%[nqap_r2]\n" + " brc 2,0b\n" /* handle partial completion */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), + [nqap_r2] "+&d" (nqap_r2.pair) + : [nqap_r1] "d" (nqap_r1.pair) + : "cc", "memory", "0", "1"); return reg1; } @@ -325,6 +349,8 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, * @psmid: Pointer to program supplied message identifier * @msg: The message text * @length: The message length + * @reslength: Resitual length on return + * @resgr0: input: gr0 value (only used if != 0), output: resitual gr0 content * * Returns AP queue status structure. * Condition code 1 on DQAP means the receive has taken place @@ -336,27 +362,65 @@ static inline struct ap_queue_status ap_nqap(ap_qid_t qid, * Note that gpr2 is used by the DQAP instruction to keep track of * any 'residual' length, in case the instruction gets interrupted. * Hence it gets zeroed before the instruction. + * If the message does not fit into the buffer, this function will + * return with a truncated message and the reply in the firmware queue + * is not removed. This is indicated to the caller with an + * ap_queue_status response_code value of all bits on (0xFF) and (if + * the reslength ptr is given) the remaining length is stored in + * *reslength and (if the resgr0 ptr is given) the updated gr0 value + * for further processing of this msg entry is stored in *resgr0. The + * caller needs to detect this situation and should invoke ap_dqap + * with a valid resgr0 ptr and a value in there != 0 to indicate that + * *resgr0 is to be used instead of qid to further process this entry. */ static inline struct ap_queue_status ap_dqap(ap_qid_t qid, unsigned long long *psmid, - void *msg, size_t length) + void *msg, size_t length, + size_t *reslength, + unsigned long *resgr0) { - register unsigned long reg0 asm("0") = qid | 0x80000000UL; - register struct ap_queue_status reg1 asm ("1"); - register unsigned long reg2 asm("2") = 0UL; - register unsigned long reg4 asm("4") = (unsigned long) msg; - register unsigned long reg5 asm("5") = (unsigned long) length; - register unsigned long reg6 asm("6") = 0UL; - register unsigned long reg7 asm("7") = 0UL; + unsigned long reg0 = resgr0 && *resgr0 ? *resgr0 : qid | 0x80000000UL; + struct ap_queue_status reg1; + unsigned long reg2; + union register_pair rp1, rp2; + rp1.even = 0UL; + rp1.odd = 0UL; + rp2.even = (unsigned long)msg; + rp2.odd = (unsigned long)length; asm volatile( - "0: .long 0xb2ae0064\n" /* DQAP */ - " brc 6,0b\n" - : "+d" (reg0), "=d" (reg1), "+d" (reg2), - "+d" (reg4), "+d" (reg5), "+d" (reg6), "+d" (reg7) - : : "cc", "memory"); - *psmid = (((unsigned long long) reg6) << 32) + reg7; + " lgr 0,%[reg0]\n" /* qid param into gr0 */ + " lghi 2,0\n" /* 0 into gr2 (res length) */ + "0: ltgr %N[rp2],%N[rp2]\n" /* check buf len */ + " jz 2f\n" /* go out if buf len is 0 */ + "1: .insn rre,0xb2ae0000,%[rp1],%[rp2]\n" + " brc 6,0b\n" /* handle partial complete */ + "2: lgr %[reg0],0\n" /* gr0 (qid + info) into reg0 */ + " lgr %[reg1],1\n" /* gr1 (status) into reg1 */ + " lgr %[reg2],2\n" /* gr2 (res length) into reg2 */ + : [reg0] "+&d" (reg0), [reg1] "=&d" (reg1), [reg2] "=&d" (reg2), + [rp1] "+&d" (rp1.pair), [rp2] "+&d" (rp2.pair) + : + : "cc", "memory", "0", "1", "2"); + + if (reslength) + *reslength = reg2; + if (reg2 != 0 && rp2.odd == 0) { + /* + * Partially complete, status in gr1 is not set. + * Signal the caller that this dqap is only partially received + * with a special status response code 0xFF and *resgr0 updated + */ + reg1.response_code = 0xFF; + if (resgr0) + *resgr0 = reg0; + } else { + *psmid = (((unsigned long long)rp1.even) << 32) + rp1.odd; + if (resgr0) + *resgr0 = 0; + } + return reg1; } diff --git a/arch/s390/include/asm/cpu_mcf.h b/arch/s390/include/asm/cpu_mcf.h index 4dcefddb7751..ca0e0e5ddbc4 100644 --- a/arch/s390/include/asm/cpu_mcf.h +++ b/arch/s390/include/asm/cpu_mcf.h @@ -32,39 +32,22 @@ static const u64 cpumf_ctr_ctl[CPUMF_CTR_SET_MAX] = { [CPUMF_CTR_SET_MT_DIAG] = 0x20, }; -static inline void ctr_set_enable(u64 *state, int ctr_set) -{ - *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT; -} -static inline void ctr_set_disable(u64 *state, int ctr_set) -{ - *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ENABLE_SHIFT); -} -static inline void ctr_set_start(u64 *state, int ctr_set) -{ - *state |= cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT; -} -static inline void ctr_set_stop(u64 *state, int ctr_set) -{ - *state &= ~(cpumf_ctr_ctl[ctr_set] << CPUMF_LCCTL_ACTCTL_SHIFT); -} - -static inline void ctr_set_multiple_enable(u64 *state, u64 ctrsets) +static inline void ctr_set_enable(u64 *state, u64 ctrsets) { *state |= ctrsets << CPUMF_LCCTL_ENABLE_SHIFT; } -static inline void ctr_set_multiple_disable(u64 *state, u64 ctrsets) +static inline void ctr_set_disable(u64 *state, u64 ctrsets) { *state &= ~(ctrsets << CPUMF_LCCTL_ENABLE_SHIFT); } -static inline void ctr_set_multiple_start(u64 *state, u64 ctrsets) +static inline void ctr_set_start(u64 *state, u64 ctrsets) { *state |= ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT; } -static inline void ctr_set_multiple_stop(u64 *state, u64 ctrsets) +static inline void ctr_set_stop(u64 *state, u64 ctrsets) { *state &= ~(ctrsets << CPUMF_LCCTL_ACTCTL_SHIFT); } @@ -92,8 +75,15 @@ struct cpu_cf_events { struct cpumf_ctr_info info; atomic_t ctr_set[CPUMF_CTR_SET_MAX]; atomic64_t alert; - u64 state; + u64 state; /* For perf_event_open SVC */ + u64 dev_state; /* For /dev/hwctr */ unsigned int flags; + size_t used; /* Bytes used in data */ + size_t usedss; /* Bytes used in start/stop */ + unsigned char start[PAGE_SIZE]; /* Counter set at event add */ + unsigned char stop[PAGE_SIZE]; /* Counter set at event delete */ + unsigned char data[PAGE_SIZE]; /* Counter set at /dev/hwctr */ + unsigned int sets; /* # Counter set saved in memory */ }; DECLARE_PER_CPU(struct cpu_cf_events, cpu_cf_events); @@ -124,4 +114,6 @@ static inline int stccm_avail(void) size_t cpum_cf_ctrset_size(enum cpumf_ctr_set ctrset, struct cpumf_ctr_info *info); +int cfset_online_cpu(unsigned int cpu); +int cfset_offline_cpu(unsigned int cpu); #endif /* _ASM_S390_CPU_MCF_H */ diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index ed5efbb531c4..adc0179fa34e 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -21,8 +21,6 @@ #define CR0_INTERRUPT_KEY_SUBMASK BIT(63 - 57) #define CR0_MEASUREMENT_ALERT_SUBMASK BIT(63 - 58) -#define CR2_GUARDED_STORAGE BIT(63 - 59) - #define CR14_UNUSED_32 BIT(63 - 32) #define CR14_UNUSED_33 BIT(63 - 33) #define CR14_CHANNEL_REPORT_SUBMASK BIT(63 - 35) diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 66d51ad090ab..bd00c94620d3 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -144,10 +144,6 @@ typedef s390_compat_regs compat_elf_gregset_t; #include <linux/sched/mm.h> /* for task_struct */ #include <asm/mmu_context.h> -#include <asm/vdso.h> - -extern unsigned int vdso_enabled; - /* * This is used to ensure we don't load something for the wrong architecture. */ @@ -176,7 +172,7 @@ struct arch_elf_state { !current->mm->context.alloc_pgste) { \ set_thread_flag(TIF_PGSTE); \ set_pt_regs_flag(task_pt_regs(current), \ - PIF_SYSCALL_RESTART); \ + PIF_EXECVE_PGSTE_RESTART); \ _state->rc = -EAGAIN; \ } \ _state->rc; \ @@ -268,11 +264,10 @@ do { \ #define STACK_RND_MASK MMAP_RND_MASK /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ -#define ARCH_DLINFO \ -do { \ - if (vdso_enabled) \ - NEW_AUX_ENT(AT_SYSINFO_EHDR, \ - (unsigned long)current->mm->context.vdso_base); \ +#define ARCH_DLINFO \ +do { \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, \ + (unsigned long)current->mm->context.vdso_base); \ } while (0) struct linux_binprm; diff --git a/arch/s390/include/asm/entry-common.h b/arch/s390/include/asm/entry-common.h index baa8005090c3..17aead80aadb 100644 --- a/arch/s390/include/asm/entry-common.h +++ b/arch/s390/include/asm/entry-common.h @@ -14,7 +14,6 @@ #define ARCH_EXIT_TO_USER_MODE_WORK (_TIF_GUARDED_STORAGE | _TIF_PER_TRAP) void do_per_trap(struct pt_regs *regs); -void do_syscall(struct pt_regs *regs); #ifdef CONFIG_DEBUG_ENTRY static __always_inline void arch_check_user_regs(struct pt_regs *regs) diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index a0a7a2c72bd4..24e8fed150cf 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -5,7 +5,7 @@ #include <asm/asm-const.h> #include <linux/stringify.h> -#define __ALIGN .align 4, 0x07 +#define __ALIGN .align 16, 0x07 #define __ALIGN_STR __stringify(__ALIGN) /* diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h index 20e51c9ff240..2db45d7e68aa 100644 --- a/arch/s390/include/asm/nmi.h +++ b/arch/s390/include/asm/nmi.h @@ -23,12 +23,16 @@ #define MCCK_CODE_SYSTEM_DAMAGE BIT(63) #define MCCK_CODE_EXT_DAMAGE BIT(63 - 5) #define MCCK_CODE_CP BIT(63 - 9) -#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) +#define MCCK_CODE_STG_ERROR BIT(63 - 16) +#define MCCK_CODE_STG_KEY_ERROR BIT(63 - 18) +#define MCCK_CODE_STG_DEGRAD BIT(63 - 19) #define MCCK_CODE_PSW_MWP_VALID BIT(63 - 20) #define MCCK_CODE_PSW_IA_VALID BIT(63 - 23) +#define MCCK_CODE_STG_FAIL_ADDR BIT(63 - 24) #define MCCK_CODE_CR_VALID BIT(63 - 29) #define MCCK_CODE_GS_VALID BIT(63 - 36) #define MCCK_CODE_FC_VALID BIT(63 - 43) +#define MCCK_CODE_CPU_TIMER_VALID BIT(63 - 46) #ifndef __ASSEMBLY__ diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h index 23ff51be7e29..d9d5350cc3ec 100644 --- a/arch/s390/include/asm/preempt.h +++ b/arch/s390/include/asm/preempt.h @@ -29,12 +29,6 @@ static inline void preempt_count_set(int pc) old, new) != old); } -#define init_task_preempt_count(p) do { } while (0) - -#define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_DISABLED; \ -} while (0) - static inline void set_preempt_need_resched(void) { __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); @@ -88,12 +82,6 @@ static inline void preempt_count_set(int pc) S390_lowcore.preempt_count = pc; } -#define init_task_preempt_count(p) do { } while (0) - -#define init_idle_preempt_count(p, cpu) do { \ - S390_lowcore.preempt_count = PREEMPT_DISABLED; \ -} while (0) - static inline void set_preempt_need_resched(void) { } @@ -130,6 +118,10 @@ static inline bool should_resched(int preempt_offset) #endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ +#define init_task_preempt_count(p) do { } while (0) +/* Deferred to CPU bringup time */ +#define init_idle_preempt_count(p, cpu) do { } while (0) + #ifdef CONFIG_PREEMPTION extern void preempt_schedule(void); #define __preempt_schedule() preempt_schedule() diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h index c7850d649373..61b22aa990e7 100644 --- a/arch/s390/include/asm/ptrace.h +++ b/arch/s390/include/asm/ptrace.h @@ -11,15 +11,15 @@ #include <uapi/asm/ptrace.h> #include <asm/tpi.h> -#define PIF_SYSCALL 0 /* inside a system call */ -#define PIF_SYSCALL_RESTART 1 /* restart the current system call */ -#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ -#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ +#define PIF_SYSCALL 0 /* inside a system call */ +#define PIF_EXECVE_PGSTE_RESTART 1 /* restart execve for PGSTE binaries */ +#define PIF_SYSCALL_RET_SET 2 /* return value was set via ptrace */ +#define PIF_GUEST_FAULT 3 /* indicates program check in sie64a */ -#define _PIF_SYSCALL BIT(PIF_SYSCALL) -#define _PIF_SYSCALL_RESTART BIT(PIF_SYSCALL_RESTART) -#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) -#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) +#define _PIF_SYSCALL BIT(PIF_SYSCALL) +#define _PIF_EXECVE_PGSTE_RESTART BIT(PIF_EXECVE_PGSTE_RESTART) +#define _PIF_SYSCALL_RET_SET BIT(PIF_SYSCALL_RET_SET) +#define _PIF_GUEST_FAULT BIT(PIF_GUEST_FAULT) #ifndef __ASSEMBLY__ @@ -162,6 +162,14 @@ static inline int test_pt_regs_flag(struct pt_regs *regs, int flag) return !!(regs->flags & (1UL << flag)); } +static inline int test_and_clear_pt_regs_flag(struct pt_regs *regs, int flag) +{ + int ret = test_pt_regs_flag(regs, flag); + + clear_pt_regs_flag(regs, flag); + return ret; +} + /* * These are defined as per linux/ptrace.h, which see. */ diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h index a8b75da3c1b8..3a77aa96d092 100644 --- a/arch/s390/include/asm/setup.h +++ b/arch/s390/include/asm/setup.h @@ -159,6 +159,8 @@ static inline unsigned long kaslr_offset(void) return __kaslr_offset; } +extern int is_full_image; + static inline u32 gen_lpswe(unsigned long addr) { BUILD_BUG_ON(addr > 0xfff); diff --git a/arch/s390/include/asm/softirq_stack.h b/arch/s390/include/asm/softirq_stack.h new file mode 100644 index 000000000000..fd17f25704bd --- /dev/null +++ b/arch/s390/include/asm/softirq_stack.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __ASM_S390_SOFTIRQ_STACK_H +#define __ASM_S390_SOFTIRQ_STACK_H + +#include <asm/lowcore.h> +#include <asm/stacktrace.h> + +static inline void do_softirq_own_stack(void) +{ + call_on_stack(0, S390_lowcore.async_stack, void, __do_softirq); +} + +#endif /* __ASM_S390_SOFTIRQ_STACK_H */ diff --git a/arch/s390/include/asm/stacktrace.h b/arch/s390/include/asm/stacktrace.h index 76c6034428be..3d8a4b94c620 100644 --- a/arch/s390/include/asm/stacktrace.h +++ b/arch/s390/include/asm/stacktrace.h @@ -74,23 +74,6 @@ struct stack_frame { ((unsigned long)__builtin_frame_address(0) - \ offsetof(struct stack_frame, back_chain)) -#define CALL_ARGS_0() \ - register unsigned long r2 asm("2") -#define CALL_ARGS_1(arg1) \ - register unsigned long r2 asm("2") = (unsigned long)(arg1) -#define CALL_ARGS_2(arg1, arg2) \ - CALL_ARGS_1(arg1); \ - register unsigned long r3 asm("3") = (unsigned long)(arg2) -#define CALL_ARGS_3(arg1, arg2, arg3) \ - CALL_ARGS_2(arg1, arg2); \ - register unsigned long r4 asm("4") = (unsigned long)(arg3) -#define CALL_ARGS_4(arg1, arg2, arg3, arg4) \ - CALL_ARGS_3(arg1, arg2, arg3); \ - register unsigned long r4 asm("5") = (unsigned long)(arg4) -#define CALL_ARGS_5(arg1, arg2, arg3, arg4, arg5) \ - CALL_ARGS_4(arg1, arg2, arg3, arg4); \ - register unsigned long r4 asm("6") = (unsigned long)(arg5) - /* * To keep this simple mark register 2-6 as being changed (volatile) * by the called function, even though register 6 is saved/nonvolatile. @@ -109,34 +92,113 @@ struct stack_frame { #define CALL_CLOBBER_1 CALL_CLOBBER_2, "3" #define CALL_CLOBBER_0 CALL_CLOBBER_1 -#define CALL_ON_STACK(fn, stack, nr, args...) \ +#define CALL_LARGS_0(...) \ + long dummy = 0 +#define CALL_LARGS_1(t1, a1) \ + long arg1 = (long)(t1)(a1) +#define CALL_LARGS_2(t1, a1, t2, a2) \ + CALL_LARGS_1(t1, a1); \ + long arg2 = (long)(t2)(a2) +#define CALL_LARGS_3(t1, a1, t2, a2, t3, a3) \ + CALL_LARGS_2(t1, a1, t2, a2); \ + long arg3 = (long)(t3)(a3) +#define CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4) \ + CALL_LARGS_3(t1, a1, t2, a2, t3, a3); \ + long arg4 = (long)(t4)(a4) +#define CALL_LARGS_5(t1, a1, t2, a2, t3, a3, t4, a4, t5, a5) \ + CALL_LARGS_4(t1, a1, t2, a2, t3, a3, t4, a4); \ + long arg5 = (long)(t5)(a5) + +#define CALL_REGS_0 \ + register long r2 asm("2") = dummy +#define CALL_REGS_1 \ + register long r2 asm("2") = arg1 +#define CALL_REGS_2 \ + CALL_REGS_1; \ + register long r3 asm("3") = arg2 +#define CALL_REGS_3 \ + CALL_REGS_2; \ + register long r4 asm("4") = arg3 +#define CALL_REGS_4 \ + CALL_REGS_3; \ + register long r5 asm("5") = arg4 +#define CALL_REGS_5 \ + CALL_REGS_4; \ + register long r6 asm("6") = arg5 + +#define CALL_TYPECHECK_0(...) +#define CALL_TYPECHECK_1(t, a, ...) \ + typecheck(t, a) +#define CALL_TYPECHECK_2(t, a, ...) \ + CALL_TYPECHECK_1(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_3(t, a, ...) \ + CALL_TYPECHECK_2(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_4(t, a, ...) \ + CALL_TYPECHECK_3(__VA_ARGS__); \ + typecheck(t, a) +#define CALL_TYPECHECK_5(t, a, ...) \ + CALL_TYPECHECK_4(__VA_ARGS__); \ + typecheck(t, a) + +#define CALL_PARM_0(...) void +#define CALL_PARM_1(t, a, ...) t +#define CALL_PARM_2(t, a, ...) t, CALL_PARM_1(__VA_ARGS__) +#define CALL_PARM_3(t, a, ...) t, CALL_PARM_2(__VA_ARGS__) +#define CALL_PARM_4(t, a, ...) t, CALL_PARM_3(__VA_ARGS__) +#define CALL_PARM_5(t, a, ...) t, CALL_PARM_4(__VA_ARGS__) +#define CALL_PARM_6(t, a, ...) t, CALL_PARM_5(__VA_ARGS__) + +/* + * Use call_on_stack() to call a function switching to a specified + * stack. Proper sign and zero extension of function arguments is + * done. Usage: + * + * rc = call_on_stack(nr, stack, rettype, fn, t1, a1, t2, a2, ...) + * + * - nr specifies the number of function arguments of fn. + * - stack specifies the stack to be used. + * - fn is the function to be called. + * - rettype is the return type of fn. + * - t1, a1, ... are pairs, where t1 must match the type of the first + * argument of fn, t2 the second, etc. a1 is the corresponding + * first function argument (not name), etc. + */ +#define call_on_stack(nr, stack, rettype, fn, ...) \ ({ \ + rettype (*__fn)(CALL_PARM_##nr(__VA_ARGS__)) = fn; \ unsigned long frame = current_frame_address(); \ - CALL_ARGS_##nr(args); \ + unsigned long __stack = stack; \ unsigned long prev; \ + CALL_LARGS_##nr(__VA_ARGS__); \ + CALL_REGS_##nr; \ \ + CALL_TYPECHECK_##nr(__VA_ARGS__); \ asm volatile( \ - " la %[_prev],0(15)\n" \ + " lgr %[_prev],15\n" \ " lg 15,%[_stack]\n" \ " stg %[_frame],%[_bc](15)\n" \ " brasl 14,%[_fn]\n" \ - " la 15,0(%[_prev])\n" \ - : [_prev] "=&a" (prev), CALL_FMT_##nr \ - : [_stack] "R" (stack), \ + " lgr 15,%[_prev]\n" \ + : [_prev] "=&d" (prev), CALL_FMT_##nr \ + : [_stack] "R" (__stack), \ [_bc] "i" (offsetof(struct stack_frame, back_chain)), \ [_frame] "d" (frame), \ - [_fn] "X" (fn) : CALL_CLOBBER_##nr); \ - r2; \ + [_fn] "X" (__fn) : CALL_CLOBBER_##nr); \ + (rettype)r2; \ }) -#define CALL_ON_STACK_NORETURN(fn, stack) \ +#define call_on_stack_noreturn(fn, stack) \ ({ \ + void (*__fn)(void) = fn; \ + \ asm volatile( \ " la 15,0(%[_stack])\n" \ " xc %[_bc](8,15),%[_bc](15)\n" \ " brasl 14,%[_fn]\n" \ ::[_bc] "i" (offsetof(struct stack_frame, back_chain)), \ - [_stack] "a" (stack), [_fn] "X" (fn)); \ + [_stack] "a" (stack), [_fn] "X" (__fn)); \ BUG(); \ }) diff --git a/arch/s390/include/asm/uv.h b/arch/s390/include/asm/uv.h index 7b98d4caee77..12c5f006c136 100644 --- a/arch/s390/include/asm/uv.h +++ b/arch/s390/include/asm/uv.h @@ -73,6 +73,10 @@ enum uv_cmds_inst { BIT_UVC_CMD_UNPIN_PAGE_SHARED = 22, }; +enum uv_feat_ind { + BIT_UV_FEAT_MISC = 0, +}; + struct uv_cb_header { u16 len; u16 cmd; /* Command Code */ @@ -97,7 +101,8 @@ struct uv_cb_qui { u64 max_guest_stor_addr; u8 reserved88[158 - 136]; u16 max_guest_cpu_id; - u8 reserveda0[200 - 160]; + u64 uv_feature_indications; + u8 reserveda0[200 - 168]; } __packed __aligned(8); /* Initialize Ultravisor */ @@ -274,6 +279,7 @@ struct uv_info { unsigned long max_sec_stor_addr; unsigned int max_num_sec_conf; unsigned short max_guest_cpu_id; + unsigned long uv_feature_indications; }; extern struct uv_info uv_info; diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index b45e3dddd2c2..53165aa7813a 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -4,18 +4,31 @@ #include <vdso/datapage.h> -/* Default link address for the vDSO */ -#define VDSO64_LBASE 0 +#ifndef __ASSEMBLY__ -#define __VVAR_PAGES 2 +#include <generated/vdso64-offsets.h> +#ifdef CONFIG_COMPAT +#include <generated/vdso32-offsets.h> +#endif -#define VDSO_VERSION_STRING LINUX_2.6.29 - -#ifndef __ASSEMBLY__ +#define VDSO64_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso64_offset_##name)) +#ifdef CONFIG_COMPAT +#define VDSO32_SYMBOL(tsk, name) ((tsk)->mm->context.vdso_base + (vdso32_offset_##name)) +#else +#define VDSO32_SYMBOL(tsk, name) (-1UL) +#endif extern struct vdso_data *vdso_data; int vdso_getcpu_init(void); #endif /* __ASSEMBLY__ */ + +/* Default link address for the vDSO */ +#define VDSO_LBASE 0 + +#define __VVAR_PAGES 2 + +#define VDSO_VERSION_STRING LINUX_2.6.29 + #endif /* __S390_VDSO_H__ */ diff --git a/arch/s390/include/asm/vdso/gettimeofday.h b/arch/s390/include/asm/vdso/gettimeofday.h index 383c53c3dddd..d6465b22ffe3 100644 --- a/arch/s390/include/asm/vdso/gettimeofday.h +++ b/arch/s390/include/asm/vdso/gettimeofday.h @@ -8,7 +8,6 @@ #include <asm/timex.h> #include <asm/unistd.h> -#include <asm/vdso.h> #include <linux/compiler.h> #define vdso_calc_delta __arch_vdso_calc_delta diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 68ca1834316f..4a44ba5a2d73 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -71,10 +71,10 @@ obj-$(CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT) += ima_arch.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf_common.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf.o perf_cpum_sf.o obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o -obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_diag.o obj-$(CONFIG_TRACEPOINTS) += trace.o obj-$(findstring y, $(CONFIG_PROTECTED_VIRTUALIZATION_GUEST) $(CONFIG_PGSTE)) += uv.o # vdso obj-y += vdso64/ +obj-$(CONFIG_COMPAT) += vdso32/ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index f53605a3dfcd..77ff2130cb04 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -14,8 +14,6 @@ #include <linux/pgtable.h> #include <asm/idle.h> #include <asm/gmap.h> -#include <asm/nmi.h> -#include <asm/setup.h> #include <asm/stacktrace.h> int main(void) @@ -108,7 +106,6 @@ int main(void) OFFSET(__LC_LAST_UPDATE_CLOCK, lowcore, last_update_clock); OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); - OFFSET(__LC_CLOCK_COMPARATOR, lowcore, clock_comparator); OFFSET(__LC_BOOT_CLOCK, lowcore, boot_clock); OFFSET(__LC_CURRENT, lowcore, current_task); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); @@ -145,9 +142,6 @@ int main(void) OFFSET(__LC_CREGS_SAVE_AREA, lowcore, cregs_save_area); OFFSET(__LC_PGM_TDB, lowcore, pgm_tdb); BLANK(); - /* extended machine check save area */ - OFFSET(__MCESA_GS_SAVE_AREA, mcesa, guarded_storage_save_area); - BLANK(); /* gmap/sie offsets */ OFFSET(__GMAP_ASCE, gmap, asce); OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 1d0e17ec93eb..cca142fbb516 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -28,6 +28,7 @@ #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> #include "compat_linux.h" #include "compat_ptrace.h" #include "entry.h" @@ -118,7 +119,6 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } @@ -304,11 +304,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - /* Signal frames without vectors registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, sigreturn); } /* Set up registers for signal handler */ @@ -371,10 +367,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, restorer = (unsigned long __force) ksig->ka.sa.sa_restorer | PSW32_ADDR_AMODE; } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long __force) svc | PSW32_ADDR_AMODE; + restorer = VDSO32_SYMBOL(current, rt_sigreturn); } /* Create siginfo on the signal stack */ diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index c2cf79d353cf..fb84e3fc1686 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -33,6 +33,8 @@ #include <asm/switch_to.h> #include "entry.h" +int __bootdata(is_full_image); + static void __init reset_tod_clock(void) { union tod_clock clk; @@ -279,7 +281,7 @@ static void __init setup_boot_command_line(void) static void __init check_image_bootable(void) { - if (!memcmp(EP_STRING, (void *)EP_OFFSET, strlen(EP_STRING))) + if (is_full_image) return; sclp_early_printk("Linux kernel boot failure: An attempt to boot a vmlinux ELF image failed.\n"); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 3e8c6669373a..5a2f70cbd3a9 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -14,7 +14,6 @@ #include <asm/alternative-asm.h> #include <asm/processor.h> #include <asm/cache.h> -#include <asm/ctl_reg.h> #include <asm/dwarf.h> #include <asm/errno.h> #include <asm/ptrace.h> @@ -129,6 +128,24 @@ _LPP_OFFSET = __LC_LPP "jnz .+8; .long 0xb2e8d000", 82 .endm + /* + * The CHKSTG macro jumps to the provided label in case the + * machine check interruption code reports one of unrecoverable + * storage errors: + * - Storage error uncorrected + * - Storage key error uncorrected + * - Storage degradation with Failing-storage-address validity + */ + .macro CHKSTG errlabel + TSTMSK __LC_MCCK_CODE,(MCCK_CODE_STG_ERROR|MCCK_CODE_STG_KEY_ERROR) + jnz \errlabel + TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_DEGRAD + jz oklabel\@ + TSTMSK __LC_MCCK_CODE,MCCK_CODE_STG_FAIL_ADDR + jnz \errlabel +oklabel\@: + .endm + #if IS_ENABLED(CONFIG_KVM) /* * The OUTSIDE macro jumps to the provided label in case the value @@ -148,6 +165,13 @@ _LPP_OFFSET = __LC_LPP clgr %r14,%r13 jhe \outside_label .endm + + .macro SIEEXIT + lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + .endm #endif GEN_BR_THUNK %r14 @@ -235,7 +259,6 @@ ENTRY(sie64a) # are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. # Other instructions between sie64a and .Lsie_done should not cause program # interrupts. So lets use 3 nops as a landing pad for all possible rewinds. -# See also .Lcleanup_sie .Lrewind_pad6: nopr 7 .Lrewind_pad4: @@ -341,10 +364,7 @@ ENTRY(pgm_check_handler) #if IS_ENABLED(CONFIG_KVM) # cleanup critical section for program checks in sie64a OUTSIDE %r9,.Lsie_gmap,.Lsie_done,1f - lg %r14,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE # load primary asce - larl %r9,sie_exit # skip forward to sie_exit + SIEEXIT lghi %r10,_PIF_GUEST_FAULT #endif 1: tmhh %r8,0x4000 # PER bit set in old PSW ? @@ -410,7 +430,8 @@ ENTRY(\name) jnz 1f #if IS_ENABLED(CONFIG_KVM) OUTSIDE %r9,.Lsie_gmap,.Lsie_done,0f - brasl %r14,.Lcleanup_sie + BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + SIEEXIT #endif 0: CHECK_STACK __LC_SAVE_AREA_ASYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) @@ -484,8 +505,6 @@ ENTRY(mcck_int_handler) BPOFF la %r1,4095 # validate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # validate cpu timer - sckc __LC_CLOCK_COMPARATOR # validate comparator - lam %a0,%a15,__LC_AREGS_SAVE_AREA-4095(%r1) # validate acrs lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# validate gprs lg %r12,__LC_CURRENT lmg %r8,%r9,__LC_MCK_OLD_PSW @@ -496,41 +515,7 @@ ENTRY(mcck_int_handler) la %r14,4095 lctlg %c0,%c15,__LC_CREGS_SAVE_AREA-4095(%r14) # validate ctl regs ptlb - lg %r11,__LC_MCESAD-4095(%r14) # extended machine check save area - nill %r11,0xfc00 # MCESA_ORIGIN_MASK - TSTMSK __LC_CREGS_SAVE_AREA+16-4095(%r14),CR2_GUARDED_STORAGE - jno 0f - TSTMSK __LC_MCCK_CODE,MCCK_CODE_GS_VALID - jno 0f - .insn rxy,0xe3000000004d,0,__MCESA_GS_SAVE_AREA(%r11) # LGSC -0: l %r14,__LC_FP_CREG_SAVE_AREA-4095(%r14) - TSTMSK __LC_MCCK_CODE,MCCK_CODE_FC_VALID - jo 0f - sr %r14,%r14 -0: sfpc %r14 - TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX - jo 0f - lghi %r14,__LC_FPREGS_SAVE_AREA - ld %f0,0(%r14) - ld %f1,8(%r14) - ld %f2,16(%r14) - ld %f3,24(%r14) - ld %f4,32(%r14) - ld %f5,40(%r14) - ld %f6,48(%r14) - ld %f7,56(%r14) - ld %f8,64(%r14) - ld %f9,72(%r14) - ld %f10,80(%r14) - ld %f11,88(%r14) - ld %f12,96(%r14) - ld %f13,104(%r14) - ld %f14,112(%r14) - ld %f15,120(%r14) - j 1f -0: VLM %v0,%v15,0,%r11 - VLM %v16,%v31,256,%r11 -1: lghi %r14,__LC_CPU_TIMER_SAVE_AREA + lghi %r14,__LC_CPU_TIMER_SAVE_AREA mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) TSTMSK __LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID jo 3f @@ -546,24 +531,29 @@ ENTRY(mcck_int_handler) 3: TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_MWP_VALID jno .Lmcck_panic tmhh %r8,0x0001 # interrupting from user ? - jnz 4f + jnz 6f TSTMSK __LC_MCCK_CODE,MCCK_CODE_PSW_IA_VALID jno .Lmcck_panic -4: ssm __LC_PGM_NEW_PSW # turn dat on, keep irqs off - tmhh %r8,0x0001 # interrupting from user ? - jnz .Lmcck_user #if IS_ENABLED(CONFIG_KVM) - OUTSIDE %r9,.Lsie_gmap,.Lsie_done,.Lmcck_stack - OUTSIDE %r9,.Lsie_entry,.Lsie_skip,5f + OUTSIDE %r9,.Lsie_gmap,.Lsie_done,6f + OUTSIDE %r9,.Lsie_entry,.Lsie_skip,4f oi __LC_CPU_FLAGS+7, _CIF_MCCK_GUEST -5: brasl %r14,.Lcleanup_sie -#endif + j 5f +4: CHKSTG .Lmcck_panic +5: larl %r14,.Lstosm_tmp + stosm 0(%r14),0x04 # turn dat on, keep irqs off + BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) + SIEEXIT j .Lmcck_stack -.Lmcck_user: +#endif +6: CHKSTG .Lmcck_panic + larl %r14,.Lstosm_tmp + stosm 0(%r14),0x04 # turn dat on, keep irqs off + tmhh %r8,0x0001 # interrupting from user ? + jz .Lmcck_stack BPENTER __TI_flags(%r12),_TIF_ISOLATE_BP .Lmcck_stack: lg %r15,__LC_MCCK_STACK -.Lmcck_skip: la %r11,STACK_FRAME_OVERHEAD(%r15) stctg %c1,%c1,__PT_CR1(%r11) lctlg %c1,%c1,__LC_KERNEL_ASCE @@ -605,8 +595,33 @@ ENTRY(mcck_int_handler) b __LC_RETURN_MCCK_LPSWE .Lmcck_panic: - lg %r15,__LC_NODAT_STACK - j .Lmcck_skip + /* + * Iterate over all possible CPU addresses in the range 0..0xffff + * and stop each CPU using signal processor. Use compare and swap + * to allow just one CPU-stopper and prevent concurrent CPUs from + * stopping each other while leaving the others running. + */ + lhi %r5,0 + lhi %r6,1 + larl %r7,.Lstop_lock + cs %r5,%r6,0(%r7) # single CPU-stopper only + jnz 4f + larl %r7,.Lthis_cpu + stap 0(%r7) # this CPU address + lh %r4,0(%r7) + nilh %r4,0 + lhi %r0,1 + sll %r0,16 # CPU counter + lhi %r3,0 # next CPU address +0: cr %r3,%r4 + je 2f +1: sigp %r1,%r3,SIGP_STOP # stop next CPU + brc SIGP_CC_BUSY,1b +2: ahi %r3,1 + brct %r0,0b +3: sigp %r1,%r4,SIGP_STOP # stop this CPU + brc SIGP_CC_BUSY,3b +4: j 4b ENDPROC(mcck_int_handler) # @@ -657,15 +672,11 @@ ENTRY(stack_overflow) ENDPROC(stack_overflow) #endif -#if IS_ENABLED(CONFIG_KVM) -.Lcleanup_sie: - BPENTER __SF_SIE_FLAGS(%r15),(_TIF_ISOLATE_BP|_TIF_ISOLATE_BP_GUEST) - lg %r9,__SF_SIE_CONTROL(%r15) # get control block pointer - ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_KERNEL_ASCE - larl %r9,sie_exit # skip forward to sie_exit - BR_EX %r14,%r13 -#endif + .section .data, "aw" + .align 4 +.Lstop_lock: .long 0 +.Lthis_cpu: .short 0 +.Lstosm_tmp: .byte 0 .section .rodata, "a" #define SYSCALL(esame,emu) .quad __s390x_ ## esame .globl sys_call_table diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index c0df4060d28d..234d085257eb 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -110,15 +110,17 @@ static int on_async_stack(void) { unsigned long frame = current_frame_address(); - return !!!((S390_lowcore.async_stack - frame) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)); + return ((S390_lowcore.async_stack ^ frame) & ~(THREAD_SIZE - 1)) == 0; } static void do_irq_async(struct pt_regs *regs, int irq) { - if (on_async_stack()) + if (on_async_stack()) { do_IRQ(regs, irq); - else - CALL_ON_STACK(do_IRQ, S390_lowcore.async_stack, 2, regs, irq); + } else { + call_on_stack(2, S390_lowcore.async_stack, void, do_IRQ, + struct pt_regs *, regs, int, irq); + } } static int irq_pending(struct pt_regs *regs) @@ -266,24 +268,6 @@ unsigned int arch_dynirq_lower_bound(unsigned int from) } /* - * Switch to the asynchronous interrupt stack for softirq execution. - */ -void do_softirq_own_stack(void) -{ - unsigned long old, new; - - old = current_stack_pointer(); - /* Check against async. stack address range. */ - new = S390_lowcore.async_stack; - if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { - CALL_ON_STACK(__do_softirq, new, 0); - } else { - /* We are already on the async stack. */ - __do_softirq(); - } -} - -/* * ext_int_hash[index] is the list head for all external interrupts that hash * to this index. */ diff --git a/arch/s390/kernel/kprobes.c b/arch/s390/kernel/kprobes.c index 528bb31815c3..52d056a5f89f 100644 --- a/arch/s390/kernel/kprobes.c +++ b/arch/s390/kernel/kprobes.c @@ -92,11 +92,6 @@ static void copy_instruction(struct kprobe *p) } NOKPROBE_SYMBOL(copy_instruction); -static inline int is_kernel_addr(void *addr) -{ - return addr < (void *)_end; -} - static int s390_get_insn_slot(struct kprobe *p) { /* @@ -105,7 +100,7 @@ static int s390_get_insn_slot(struct kprobe *p) * field can be patched and executed within the insn slot. */ p->ainsn.insn = NULL; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) p->ainsn.insn = get_s390_insn_slot(); else if (is_module_addr(p->addr)) p->ainsn.insn = get_insn_slot(); @@ -117,7 +112,7 @@ static void s390_free_insn_slot(struct kprobe *p) { if (!p->ainsn.insn) return; - if (is_kernel_addr(p->addr)) + if (is_kernel((unsigned long)p->addr)) free_s390_insn_slot(p->ainsn.insn, 0); else free_insn_slot(p->ainsn.insn, 0); diff --git a/arch/s390/kernel/machine_kexec.c b/arch/s390/kernel/machine_kexec.c index d91989c7bd6a..1005a6935fbe 100644 --- a/arch/s390/kernel/machine_kexec.c +++ b/arch/s390/kernel/machine_kexec.c @@ -132,7 +132,8 @@ static bool kdump_csum_valid(struct kimage *image) int rc; preempt_disable(); - rc = CALL_ON_STACK(do_start_kdump, S390_lowcore.nodat_stack, 1, image); + rc = call_on_stack(1, S390_lowcore.nodat_stack, unsigned long, do_start_kdump, + unsigned long, (unsigned long)image); preempt_enable(); return rc == 0; #else diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 11f8c296f60d..20f8e1868853 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -189,12 +189,16 @@ void noinstr s390_handle_mcck(void) * returns 0 if all required registers are available * returns 1 otherwise */ -static int notrace s390_check_registers(union mci mci, int umode) +static int notrace s390_validate_registers(union mci mci, int umode) { + struct mcesa *mcesa; + void *fpt_save_area; union ctlreg2 cr2; int kill_task; + u64 zero; kill_task = 0; + zero = 0; if (!mci.gr) { /* @@ -205,14 +209,6 @@ static int notrace s390_check_registers(union mci mci, int umode) s390_handle_damage(); kill_task = 1; } - /* Check control registers */ - if (!mci.cr) { - /* - * Control registers have unknown contents. - * Can't recover and therefore stopping machine. - */ - s390_handle_damage(); - } if (!mci.fp) { /* * Floating point registers can't be restored. If the @@ -225,35 +221,89 @@ static int notrace s390_check_registers(union mci mci, int umode) if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } + fpt_save_area = &S390_lowcore.floating_pt_save_area; if (!mci.fc) { /* * Floating point control register can't be restored. * If the kernel currently uses the floating pointer * registers and needs the FPC register the system is * stopped. If the process has its floating pointer - * registers loaded it is terminated. + * registers loaded it is terminated. Otherwise the + * FPC is just validated. */ if (S390_lowcore.fpu_flags & KERNEL_FPC) s390_handle_damage(); + asm volatile( + " lfpc %0\n" + : + : "Q" (zero)); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; + } else { + asm volatile( + " lfpc %0\n" + : + : "Q" (S390_lowcore.fpt_creg_save_area)); } - if (MACHINE_HAS_VX) { + mcesa = (struct mcesa *)(S390_lowcore.mcesad & MCESA_ORIGIN_MASK); + if (!MACHINE_HAS_VX) { + /* Validate floating point registers */ + asm volatile( + " ld 0,0(%0)\n" + " ld 1,8(%0)\n" + " ld 2,16(%0)\n" + " ld 3,24(%0)\n" + " ld 4,32(%0)\n" + " ld 5,40(%0)\n" + " ld 6,48(%0)\n" + " ld 7,56(%0)\n" + " ld 8,64(%0)\n" + " ld 9,72(%0)\n" + " ld 10,80(%0)\n" + " ld 11,88(%0)\n" + " ld 12,96(%0)\n" + " ld 13,104(%0)\n" + " ld 14,112(%0)\n" + " ld 15,120(%0)\n" + : + : "a" (fpt_save_area) + : "memory"); + } else { + /* Validate vector registers */ + union ctlreg0 cr0; + if (!mci.vr) { /* * Vector registers can't be restored. If the kernel * currently uses vector registers the system is * stopped. If the process has its vector registers - * loaded it is terminated. + * loaded it is terminated. Otherwise just validate + * the registers. */ if (S390_lowcore.fpu_flags & KERNEL_VXR) s390_handle_damage(); if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } + cr0.val = S390_lowcore.cregs_save_area[0]; + cr0.afp = cr0.vx = 1; + __ctl_load(cr0.val, 0, 0); + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ + : + : "Q" (*(struct vx_array *)mcesa->vector_save_area) + : "1"); + __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); } - /* Check if access registers are valid */ + /* Validate access registers */ + asm volatile( + " lam 0,15,0(%0)\n" + : + : "a" (&S390_lowcore.access_regs_save_area) + : "memory"); if (!mci.ar) { /* * Access registers have unknown contents. @@ -261,7 +311,7 @@ static int notrace s390_check_registers(union mci mci, int umode) */ kill_task = 1; } - /* Check guarded storage registers */ + /* Validate guarded storage registers */ cr2.val = S390_lowcore.cregs_save_area[2]; if (cr2.gse) { if (!mci.gs) { @@ -271,31 +321,26 @@ static int notrace s390_check_registers(union mci mci, int umode) * It has to be terminated. */ kill_task = 1; + } else { + load_gs_cb((struct gs_cb *)mcesa->guarded_storage_save_area); } } - /* Check if old PSW is valid */ - if (!mci.wp) { - /* - * Can't tell if we come from user or kernel mode - * -> stopping machine. - */ - s390_handle_damage(); - } - /* Check for invalid kernel instruction address */ - if (!mci.ia && !umode) { - /* - * The instruction address got lost while running - * in the kernel -> stopping machine. - */ - s390_handle_damage(); - } + /* + * The getcpu vdso syscall reads CPU number from the programmable + * field of the TOD clock. Disregard the TOD programmable register + * validity bit and load the CPU number into the TOD programmable + * field unconditionally. + */ + set_tod_programmable_field(raw_smp_processor_id()); + /* Validate clock comparator register */ + set_clock_comparator(S390_lowcore.clock_comparator); if (!mci.ms || !mci.pm || !mci.ia) kill_task = 1; return kill_task; } -NOKPROBE_SYMBOL(s390_check_registers); +NOKPROBE_SYMBOL(s390_validate_registers); /* * Backup the guest's machine check info to its description block @@ -353,11 +398,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs) mci.val = S390_lowcore.mcck_interruption_code; mcck = this_cpu_ptr(&cpu_mcck); - if (mci.sd) { - /* System damage -> stopping machine */ - s390_handle_damage(); - } - /* * Reinject the instruction processing damages' machine checks * including Delayed Access Exception into the guest @@ -398,7 +438,7 @@ int notrace s390_do_machine_check(struct pt_regs *regs) s390_handle_damage(); } } - if (s390_check_registers(mci, user_mode(regs))) { + if (s390_validate_registers(mci, user_mode(regs))) { /* * Couldn't restore all register contents for the * user space process -> mark task for termination. @@ -428,21 +468,6 @@ int notrace s390_do_machine_check(struct pt_regs *regs) mcck_pending = 1; } - /* - * Reinject storage related machine checks into the guest if they - * happen when the guest is running. - */ - if (!test_cpu_flag(CIF_MCCK_GUEST)) { - if (mci.se) - /* Storage error uncorrected */ - s390_handle_damage(); - if (mci.ke) - /* Storage key-error uncorrected */ - s390_handle_damage(); - if (mci.ds && mci.fa) - /* Storage degradation */ - s390_handle_damage(); - } if (mci.cp) { /* Channel report word pending */ mcck->channel_report = 1; diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 1b7a0525fbed..975a00c8c564 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -2,8 +2,9 @@ /* * Performance event support for s390x - CPU-measurement Counter Facility * - * Copyright IBM Corp. 2012, 2019 + * Copyright IBM Corp. 2012, 2021 * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> + * Thomas Richter <tmricht@linux.ibm.com> */ #define KMSG_COMPONENT "cpum_cf" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt @@ -14,7 +15,223 @@ #include <linux/notifier.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/miscdevice.h> + #include <asm/cpu_mcf.h> +#include <asm/hwctrset.h> +#include <asm/debug.h> + +static unsigned int cfdiag_cpu_speed; /* CPU speed for CF_DIAG trailer */ +static debug_info_t *cf_dbg; + +#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ + /* interval in seconds */ + +/* Counter sets are stored as data stream in a page sized memory buffer and + * exported to user space via raw data attached to the event sample data. + * Each counter set starts with an eight byte header consisting of: + * - a two byte eye catcher (0xfeef) + * - a one byte counter set number + * - a two byte counter set size (indicates the number of counters in this set) + * - a three byte reserved value (must be zero) to make the header the same + * size as a counter value. + * All counter values are eight byte in size. + * + * All counter sets are followed by a 64 byte trailer. + * The trailer consists of a: + * - flag field indicating valid fields when corresponding bit set + * - the counter facility first and second version number + * - the CPU speed if nonzero + * - the time stamp the counter sets have been collected + * - the time of day (TOD) base value + * - the machine type. + * + * The counter sets are saved when the process is prepared to be executed on a + * CPU and saved again when the process is going to be removed from a CPU. + * The difference of both counter sets are calculated and stored in the event + * sample data area. + */ +struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ + unsigned int def:16; /* 0-15 Data Entry Format */ + unsigned int set:16; /* 16-31 Counter set identifier */ + unsigned int ctr:16; /* 32-47 Number of stored counters */ + unsigned int res1:16; /* 48-63 Reserved */ +}; + +struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ + /* 0 - 7 */ + union { + struct { + unsigned int clock_base:1; /* TOD clock base set */ + unsigned int speed:1; /* CPU speed set */ + /* Measurement alerts */ + unsigned int mtda:1; /* Loss of MT ctr. data alert */ + unsigned int caca:1; /* Counter auth. change alert */ + unsigned int lcda:1; /* Loss of counter data alert */ + }; + unsigned long flags; /* 0-63 All indicators */ + }; + /* 8 - 15 */ + unsigned int cfvn:16; /* 64-79 Ctr First Version */ + unsigned int csvn:16; /* 80-95 Ctr Second Version */ + unsigned int cpu_speed:32; /* 96-127 CPU speed */ + /* 16 - 23 */ + unsigned long timestamp; /* 128-191 Timestamp (TOD) */ + /* 24 - 55 */ + union { + struct { + unsigned long progusage1; + unsigned long progusage2; + unsigned long progusage3; + unsigned long tod_base; + }; + unsigned long progusage[4]; + }; + /* 56 - 63 */ + unsigned int mach_type:16; /* Machine type */ + unsigned int res1:16; /* Reserved */ + unsigned int res2:32; /* Reserved */ +}; + +/* Create the trailer data at the end of a page. */ +static void cfdiag_trailer(struct cf_trailer_entry *te) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cpuid cpuid; + + te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ + te->csvn = cpuhw->info.csvn; + + get_cpu_id(&cpuid); /* Machine type */ + te->mach_type = cpuid.machine; + te->cpu_speed = cfdiag_cpu_speed; + if (te->cpu_speed) + te->speed = 1; + te->clock_base = 1; /* Save clock base */ + te->tod_base = tod_clock_base.tod; + te->timestamp = get_tod_clock_fast(); +} + +/* Read a counter set. The counter set number determines the counter set and + * the CPUM-CF first and second version number determine the number of + * available counters in each counter set. + * Each counter set starts with header containing the counter set number and + * the number of eight byte counters. + * + * The functions returns the number of bytes occupied by this counter set + * including the header. + * If there is no counter in the counter set, this counter set is useless and + * zero is returned on this case. + * + * Note that the counter sets may not be enabled or active and the stcctm + * instruction might return error 3. Depending on error_ok value this is ok, + * for example when called from cpumf_pmu_start() call back function. + */ +static size_t cfdiag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, + size_t room, bool error_ok) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + size_t ctrset_size, need = 0; + int rc = 3; /* Assume write failure */ + + ctrdata->def = CF_DIAG_CTRSET_DEF; + ctrdata->set = ctrset; + ctrdata->res1 = 0; + ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); + + if (ctrset_size) { /* Save data */ + need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); + if (need <= room) { + rc = ctr_stcctm(ctrset, ctrset_size, + (u64 *)(ctrdata + 1)); + } + if (rc != 3 || error_ok) + ctrdata->ctr = ctrset_size; + else + need = 0; + } + + debug_sprintf_event(cf_dbg, 3, + "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" + " need %zd rc %d\n", __func__, ctrset, ctrset_size, + cpuhw->info.cfvn, cpuhw->info.csvn, need, rc); + return need; +} + +/* Read out all counter sets and save them in the provided data buffer. + * The last 64 byte host an artificial trailer entry. + */ +static size_t cfdiag_getctr(void *data, size_t sz, unsigned long auth, + bool error_ok) +{ + struct cf_trailer_entry *trailer; + size_t offset = 0, done; + int i; + + memset(data, 0, sz); + sz -= sizeof(*trailer); /* Always room for trailer */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + struct cf_ctrset_entry *ctrdata = data + offset; + + if (!(auth & cpumf_ctr_ctl[i])) + continue; /* Counter set not authorized */ + + done = cfdiag_getctrset(ctrdata, i, sz - offset, error_ok); + offset += done; + } + trailer = data + offset; + cfdiag_trailer(trailer); + return offset + sizeof(*trailer); +} + +/* Calculate the difference for each counter in a counter set. */ +static void cfdiag_diffctrset(u64 *pstart, u64 *pstop, int counters) +{ + for (; --counters >= 0; ++pstart, ++pstop) + if (*pstop >= *pstart) + *pstop -= *pstart; + else + *pstop = *pstart - *pstop + 1; +} + +/* Scan the counter sets and calculate the difference of each counter + * in each set. The result is the increment of each counter during the + * period the counter set has been activated. + * + * Return true on success. + */ +static int cfdiag_diffctr(struct cpu_cf_events *cpuhw, unsigned long auth) +{ + struct cf_trailer_entry *trailer_start, *trailer_stop; + struct cf_ctrset_entry *ctrstart, *ctrstop; + size_t offset = 0; + + auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; + do { + ctrstart = (struct cf_ctrset_entry *)(cpuhw->start + offset); + ctrstop = (struct cf_ctrset_entry *)(cpuhw->stop + offset); + + if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { + pr_err_once("cpum_cf_diag counter set compare error " + "in set %i\n", ctrstart->set); + return 0; + } + auth &= ~cpumf_ctr_ctl[ctrstart->set]; + if (ctrstart->def == CF_DIAG_CTRSET_DEF) { + cfdiag_diffctrset((u64 *)(ctrstart + 1), + (u64 *)(ctrstop + 1), ctrstart->ctr); + offset += ctrstart->ctr * sizeof(u64) + + sizeof(*ctrstart); + } + } while (ctrstart->def && auth); + + /* Save time_stamp from start of event in stop's trailer */ + trailer_start = (struct cf_trailer_entry *)(cpuhw->start + offset); + trailer_stop = (struct cf_trailer_entry *)(cpuhw->stop + offset); + trailer_stop->progusage[0] = trailer_start->timestamp; + + return 1; +} static enum cpumf_ctr_set get_counter_set(u64 event) { @@ -34,7 +251,8 @@ static enum cpumf_ctr_set get_counter_set(u64 event) return set; } -static int validate_ctr_version(const struct hw_perf_event *hwc) +static int validate_ctr_version(const struct hw_perf_event *hwc, + enum cpumf_ctr_set set) { struct cpu_cf_events *cpuhw; int err = 0; @@ -43,7 +261,7 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) cpuhw = &get_cpu_var(cpu_cf_events); /* check required version for counter sets */ - switch (hwc->config_base) { + switch (set) { case CPUMF_CTR_SET_BASIC: case CPUMF_CTR_SET_USER: if (cpuhw->info.cfvn < 1) @@ -86,6 +304,8 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) (cpuhw->info.act_ctl & mtdiag_ctl))) err = -EOPNOTSUPP; break; + case CPUMF_CTR_SET_MAX: + err = -EOPNOTSUPP; } put_cpu_var(cpu_cf_events); @@ -95,7 +315,6 @@ static int validate_ctr_version(const struct hw_perf_event *hwc) static int validate_ctr_auth(const struct hw_perf_event *hwc) { struct cpu_cf_events *cpuhw; - u64 ctrs_state; int err = 0; cpuhw = &get_cpu_var(cpu_cf_events); @@ -105,8 +324,7 @@ static int validate_ctr_auth(const struct hw_perf_event *hwc) * return with -ENOENT in order to fall back to other * PMUs that might suffice the event request. */ - ctrs_state = cpumf_ctr_ctl[hwc->config_base]; - if (!(ctrs_state & cpuhw->info.auth_ctl)) + if (!(hwc->config_base & cpuhw->info.auth_ctl)) err = -ENOENT; put_cpu_var(cpu_cf_events); @@ -126,7 +344,7 @@ static void cpumf_pmu_enable(struct pmu *pmu) if (cpuhw->flags & PMU_F_ENABLED) return; - err = lcctl(cpuhw->state); + err = lcctl(cpuhw->state | cpuhw->dev_state); if (err) { pr_err("Enabling the performance measuring unit " "failed with rc=%x\n", err); @@ -151,6 +369,7 @@ static void cpumf_pmu_disable(struct pmu *pmu) return; inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); + inactive |= cpuhw->dev_state; err = lcctl(inactive); if (err) { pr_err("Disabling the performance measuring unit " @@ -199,6 +418,14 @@ static const int cpumf_generic_events_user[] = { [PERF_COUNT_HW_BUS_CYCLES] = -1, }; +static void cpumf_hw_inuse(void) +{ + mutex_lock(&pmc_reserve_mutex); + if (atomic_inc_return(&num_events) == 1) + __kernel_cpumcf_begin(); + mutex_unlock(&pmc_reserve_mutex); +} + static int __hw_perf_event_init(struct perf_event *event, unsigned int type) { struct perf_event_attr *attr = &event->attr; @@ -258,11 +485,11 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) /* * Use the hardware perf event structure to store the * counter number in the 'config' member and the counter - * set number in the 'config_base'. The counter set number - * is then later used to enable/disable the counter(s). + * set number in the 'config_base' as bit mask. + * It is later used to enable/disable the counter(s). */ hwc->config = ev; - hwc->config_base = set; + hwc->config_base = cpumf_ctr_ctl[set]; break; case CPUMF_CTR_SET_MAX: /* The counter could not be associated to a counter set */ @@ -270,22 +497,13 @@ static int __hw_perf_event_init(struct perf_event *event, unsigned int type) } /* Initialize for using the CPU-measurement counter facility */ - if (!atomic_inc_not_zero(&num_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_events) == 0 && __kernel_cpumcf_begin()) - err = -EBUSY; - else - atomic_inc(&num_events); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; + cpumf_hw_inuse(); event->destroy = hw_perf_event_destroy; /* Finally, validate version and authorization of the counter set */ err = validate_ctr_auth(hwc); if (!err) - err = validate_ctr_version(hwc); + err = validate_ctr_version(hwc, set); return err; } @@ -361,6 +579,7 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) return; @@ -376,29 +595,92 @@ static void cpumf_pmu_start(struct perf_event *event, int flags) * needs to be synchronized. At this point, the counter set can be in * the inactive or disabled state. */ - hw_perf_event_reset(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + cpuhw->usedss = cfdiag_getctr(cpuhw->start, + sizeof(cpuhw->start), + hwc->config_base, true); + } else { + hw_perf_event_reset(event); + } - /* increment refcount for this counter set */ - atomic_inc(&cpuhw->ctr_set[hwc->config_base]); + /* Increment refcount for counter sets */ + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if ((hwc->config_base & cpumf_ctr_ctl[i])) + atomic_inc(&cpuhw->ctr_set[i]); +} + +/* Create perf event sample with the counter sets as raw data. The sample + * is then pushed to the event subsystem and the function checks for + * possible event overflows. If an event overflow occurs, the PMU is + * stopped. + * + * Return non-zero if an event overflow occurred. + */ +static int cfdiag_push_sample(struct perf_event *event, + struct cpu_cf_events *cpuhw) +{ + struct perf_sample_data data; + struct perf_raw_record raw; + struct pt_regs regs; + int overflow; + + /* Setup perf sample */ + perf_sample_data_init(&data, 0, event->hw.last_period); + memset(®s, 0, sizeof(regs)); + memset(&raw, 0, sizeof(raw)); + + if (event->attr.sample_type & PERF_SAMPLE_CPU) + data.cpu_entry.cpu = event->cpu; + if (event->attr.sample_type & PERF_SAMPLE_RAW) { + raw.frag.size = cpuhw->usedss; + raw.frag.data = cpuhw->stop; + raw.size = raw.frag.size; + data.raw = &raw; + } + + overflow = perf_event_overflow(event, &data, ®s); + debug_sprintf_event(cf_dbg, 3, + "%s event %#llx sample_type %#llx raw %d ov %d\n", + __func__, event->hw.config, + event->attr.sample_type, raw.size, overflow); + if (overflow) + event->pmu->stop(event, 0); + + perf_event_update_userpage(event); + return overflow; } static void cpumf_pmu_stop(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); struct hw_perf_event *hwc = &event->hw; + int i; if (!(hwc->state & PERF_HES_STOPPED)) { /* Decrement reference count for this counter set and if this * is the last used counter in the set, clear activation * control and set the counter set state to inactive. */ - if (!atomic_dec_return(&cpuhw->ctr_set[hwc->config_base])) - ctr_set_stop(&cpuhw->state, hwc->config_base); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(hwc->config_base & cpumf_ctr_ctl[i])) + continue; + if (!atomic_dec_return(&cpuhw->ctr_set[i])) + ctr_set_stop(&cpuhw->state, cpumf_ctr_ctl[i]); + } hwc->state |= PERF_HES_STOPPED; } if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) { - hw_perf_event_update(event); + if (hwc->config == PERF_EVENT_CPUM_CF_DIAG) { + local64_inc(&event->count); + cpuhw->usedss = cfdiag_getctr(cpuhw->stop, + sizeof(cpuhw->stop), + event->hw.config_base, + false); + if (cfdiag_diffctr(cpuhw, event->hw.config_base)) + cfdiag_push_sample(event, cpuhw); + } else + hw_perf_event_update(event); hwc->state |= PERF_HES_UPTODATE; } } @@ -419,6 +701,7 @@ static int cpumf_pmu_add(struct perf_event *event, int flags) static void cpumf_pmu_del(struct perf_event *event, int flags) { struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int i; cpumf_pmu_stop(event, PERF_EF_UPDATE); @@ -430,8 +713,9 @@ static void cpumf_pmu_del(struct perf_event *event, int flags) * clear enable control and resets all counters in a set. Therefore, * cpumf_pmu_start() always has to reenable a counter set. */ - if (!atomic_read(&cpuhw->ctr_set[event->hw.config_base])) - ctr_set_disable(&cpuhw->state, event->hw.config_base); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) + if (!atomic_read(&cpuhw->ctr_set[i])) + ctr_set_disable(&cpuhw->state, cpumf_ctr_ctl[i]); } /* Performance monitoring unit for s390x */ @@ -448,6 +732,7 @@ static struct pmu cpumf_pmu = { .read = cpumf_pmu_read, }; +static int cfset_init(void); static int __init cpumf_pmu_init(void) { int rc; @@ -455,10 +740,689 @@ static int __init cpumf_pmu_init(void) if (!kernel_cpumcf_avail()) return -ENODEV; + /* Setup s390dbf facility */ + cf_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); + if (!cf_dbg) { + pr_err("Registration of s390dbf(cpum_cf) failed\n"); + return -ENOMEM; + }; + debug_register_view(cf_dbg, &debug_sprintf_view); + cpumf_pmu.attr_groups = cpumf_cf_event_group(); rc = perf_pmu_register(&cpumf_pmu, "cpum_cf", -1); - if (rc) + if (rc) { + debug_unregister_view(cf_dbg, &debug_sprintf_view); + debug_unregister(cf_dbg); pr_err("Registering the cpum_cf PMU failed with rc=%i\n", rc); + } else if (stccm_avail()) { /* Setup counter set device */ + cfset_init(); + } + return rc; +} + +/* Support for the CPU Measurement Facility counter set extraction using + * device /dev/hwctr. This allows user space programs to extract complete + * counter set via normal file operations. + */ + +static atomic_t cfset_opencnt = ATOMIC_INIT(0); /* Excl. access */ +static DEFINE_MUTEX(cfset_ctrset_mutex);/* Synchronize access to hardware */ +struct cfset_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ + unsigned int sets; /* Counter set bit mask */ + atomic_t cpus_ack; /* # CPUs successfully executed func */ +}; + +static struct cfset_request { /* CPUs and counter set bit mask */ + unsigned long ctrset; /* Bit mask of counter set to read */ + cpumask_t mask; /* CPU mask to read from */ +} cfset_request; + +static void cfset_ctrset_clear(void) +{ + cpumask_clear(&cfset_request.mask); + cfset_request.ctrset = 0; +} + +/* The /dev/hwctr device access uses PMU_F_IN_USE to mark the device access + * path is currently used. + * The cpu_cf_events::dev_state is used to denote counter sets in use by this + * interface. It is always or'ed in. If this interface is not active, its + * value is zero and no additional counter sets will be included. + * + * The cpu_cf_events::state is used by the perf_event_open SVC and remains + * unchanged. + * + * perf_pmu_enable() and perf_pmu_enable() and its call backs + * cpumf_pmu_enable() and cpumf_pmu_disable() are called by the + * performance measurement subsystem to enable per process + * CPU Measurement counter facility. + * The XXX_enable() and XXX_disable functions are used to turn off + * x86 performance monitoring interrupt (PMI) during scheduling. + * s390 uses these calls to temporarily stop and resume the active CPU + * counters sets during scheduling. + * + * We do allow concurrent access of perf_event_open() SVC and /dev/hwctr + * device access. The perf_event_open() SVC interface makes a lot of effort + * to only run the counters while the calling process is actively scheduled + * to run. + * When /dev/hwctr interface is also used at the same time, the counter sets + * will keep running, even when the process is scheduled off a CPU. + * However this is not a problem and does not lead to wrong counter values + * for the perf_event_open() SVC. The current counter value will be recorded + * during schedule-in. At schedule-out time the current counter value is + * extracted again and the delta is calculated and added to the event. + */ +/* Stop all counter sets via ioctl interface */ +static void cfset_ioctl_off(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->dev_state = 0; + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_dec(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set stop %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + cpuhw->flags &= ~PMU_F_IN_USE; + debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", + __func__, rc, cpuhw->state, cpuhw->dev_state); +} + +/* Start counter sets on particular CPU */ +static void cfset_ioctl_on(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int rc; + + cpuhw->flags |= PMU_F_IN_USE; + ctr_set_enable(&cpuhw->dev_state, p->sets); + ctr_set_start(&cpuhw->dev_state, p->sets); + for (rc = CPUMF_CTR_SET_BASIC; rc < CPUMF_CTR_SET_MAX; ++rc) + if ((p->sets & cpumf_ctr_ctl[rc])) + atomic_inc(&cpuhw->ctr_set[rc]); + rc = lcctl(cpuhw->dev_state | cpuhw->state); /* Start counter sets */ + if (!rc) + atomic_inc(&p->cpus_ack); + else + pr_err("Counter set start %#llx of /dev/%s failed rc=%i\n", + cpuhw->dev_state | cpuhw->state, S390_HWCTR_DEVICE, rc); + debug_sprintf_event(cf_dbg, 4, "%s rc %d state %#llx dev_state %#llx\n", + __func__, rc, cpuhw->state, cpuhw->dev_state); +} + +static void cfset_release_cpu(void *p) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + int rc; + + debug_sprintf_event(cf_dbg, 4, "%s state %#llx dev_state %#llx\n", + __func__, cpuhw->state, cpuhw->dev_state); + rc = lcctl(cpuhw->state); /* Keep perf_event_open counter sets */ + if (rc) + pr_err("Counter set release %#llx of /dev/%s failed rc=%i\n", + cpuhw->state, S390_HWCTR_DEVICE, rc); + cpuhw->dev_state = 0; +} + +/* Release function is also called when application gets terminated without + * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. + */ +static int cfset_release(struct inode *inode, struct file *file) +{ + on_each_cpu(cfset_release_cpu, NULL, 1); + hw_perf_event_destroy(NULL); + cfset_ctrset_clear(); + atomic_set(&cfset_opencnt, 0); + return 0; +} + +static int cfset_open(struct inode *inode, struct file *file) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Only one user space program can open /dev/hwctr */ + if (atomic_xchg(&cfset_opencnt, 1)) + return -EBUSY; + + cpumf_hw_inuse(); + file->private_data = NULL; + /* nonseekable_open() never fails */ + return nonseekable_open(inode, file); +} + +static int cfset_all_stop(void) +{ + struct cfset_call_on_cpu_parm p = { + .sets = cfset_request.ctrset, + }; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + free_cpumask_var(mask); + return 0; +} + +static int cfset_all_start(void) +{ + struct cfset_call_on_cpu_parm p = { + .sets = cfset_request.ctrset, + .cpus_ack = ATOMIC_INIT(0), + }; + cpumask_var_t mask; + int rc = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_ioctl_on, &p, 1); + if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { + on_each_cpu_mask(mask, cfset_ioctl_off, &p, 1); + rc = -EIO; + debug_sprintf_event(cf_dbg, 4, "%s CPUs missing", __func__); + } + free_cpumask_var(mask); + return rc; +} + + +/* Return the maximum required space for all possible CPUs in case one + * CPU will be onlined during the START, READ, STOP cycles. + * To find out the size of the counter sets, any one CPU will do. They + * all have the same counter sets. + */ +static size_t cfset_needspace(unsigned int sets) +{ + struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); + size_t bytes = 0; + int i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(sets & cpumf_ctr_ctl[i])) + continue; + bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + sizeof(((struct s390_ctrset_setdata *)0)->set) + + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); + } + bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * + (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); + put_cpu_ptr(&cpu_cf_events); + return bytes; +} + +static int cfset_all_copy(unsigned long arg, cpumask_t *mask) +{ + struct s390_ctrset_read __user *ctrset_read; + unsigned int cpu, cpus, rc; + void __user *uptr; + + ctrset_read = (struct s390_ctrset_read __user *)arg; + uptr = ctrset_read->data; + for_each_cpu(cpu, mask) { + struct cpu_cf_events *cpuhw = per_cpu_ptr(&cpu_cf_events, cpu); + struct s390_ctrset_cpudata __user *ctrset_cpudata; + + ctrset_cpudata = uptr; + rc = put_user(cpu, &ctrset_cpudata->cpu_nr); + rc |= put_user(cpuhw->sets, &ctrset_cpudata->no_sets); + rc |= copy_to_user(ctrset_cpudata->data, cpuhw->data, + cpuhw->used); + if (rc) + return -EFAULT; + uptr += sizeof(struct s390_ctrset_cpudata) + cpuhw->used; + cond_resched(); + } + cpus = cpumask_weight(mask); + if (put_user(cpus, &ctrset_read->no_cpus)) + return -EFAULT; + debug_sprintf_event(cf_dbg, 4, "%s copied %ld\n", __func__, + uptr - (void __user *)ctrset_read->data); + return 0; +} + +static size_t cfset_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, + int ctrset_size, size_t room) +{ + size_t need = 0; + int rc = -1; + + need = sizeof(*p) + sizeof(u64) * ctrset_size; + if (need <= room) { + p->set = cpumf_ctr_ctl[ctrset]; + p->no_cnts = ctrset_size; + rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); + if (rc == 3) /* Nothing stored */ + need = 0; + } + return need; +} + +/* Read all counter sets. */ +static void cfset_cpu_read(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cfset_call_on_cpu_parm *p = parm; + int set, set_size; + size_t space; + + /* No data saved yet */ + cpuhw->used = 0; + cpuhw->sets = 0; + memset(cpuhw->data, 0, sizeof(cpuhw->data)); + + /* Scan the counter sets */ + for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { + struct s390_ctrset_setdata *sp = (void *)cpuhw->data + + cpuhw->used; + + if (!(p->sets & cpumf_ctr_ctl[set])) + continue; /* Counter set not in list */ + set_size = cpum_cf_ctrset_size(set, &cpuhw->info); + space = sizeof(cpuhw->data) - cpuhw->used; + space = cfset_cpuset_read(sp, set, set_size, space); + if (space) { + cpuhw->used += space; + cpuhw->sets += 1; + } + } + debug_sprintf_event(cf_dbg, 4, "%s sets %d used %zd\n", __func__, + cpuhw->sets, cpuhw->used); +} + +static int cfset_all_read(unsigned long arg) +{ + struct cfset_call_on_cpu_parm p; + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + p.sets = cfset_request.ctrset; + cpumask_and(mask, &cfset_request.mask, cpu_online_mask); + on_each_cpu_mask(mask, cfset_cpu_read, &p, 1); + rc = cfset_all_copy(arg, mask); + free_cpumask_var(mask); return rc; } -subsys_initcall(cpumf_pmu_init); + +static long cfset_ioctl_read(unsigned long arg) +{ + struct s390_ctrset_read read; + int ret = 0; + + if (copy_from_user(&read, (char __user *)arg, sizeof(read))) + return -EFAULT; + ret = cfset_all_read(arg); + return ret; +} + +static long cfset_ioctl_stop(void) +{ + int ret = ENXIO; + + if (cfset_request.ctrset) { + ret = cfset_all_stop(); + cfset_ctrset_clear(); + } + return ret; +} + +static long cfset_ioctl_start(unsigned long arg) +{ + struct s390_ctrset_start __user *ustart; + struct s390_ctrset_start start; + void __user *umask; + unsigned int len; + int ret = 0; + size_t need; + + if (cfset_request.ctrset) + return -EBUSY; + ustart = (struct s390_ctrset_start __user *)arg; + if (copy_from_user(&start, ustart, sizeof(start))) + return -EFAULT; + if (start.version != S390_HWCTR_START_VERSION) + return -EINVAL; + if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | + cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | + cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | + cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | + cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) + return -EINVAL; /* Invalid counter set */ + if (!start.counter_sets) + return -EINVAL; /* No counter set at all? */ + cpumask_clear(&cfset_request.mask); + len = min_t(u64, start.cpumask_len, cpumask_size()); + umask = (void __user *)start.cpumask; + if (copy_from_user(&cfset_request.mask, umask, len)) + return -EFAULT; + if (cpumask_empty(&cfset_request.mask)) + return -EINVAL; + need = cfset_needspace(start.counter_sets); + if (put_user(need, &ustart->data_bytes)) + ret = -EFAULT; + if (ret) + goto out; + cfset_request.ctrset = start.counter_sets; + ret = cfset_all_start(); +out: + if (ret) + cfset_ctrset_clear(); + debug_sprintf_event(cf_dbg, 4, "%s sets %#lx need %ld ret %d\n", + __func__, cfset_request.ctrset, need, ret); + return ret; +} + +/* Entry point to the /dev/hwctr device interface. + * The ioctl system call supports three subcommands: + * S390_HWCTR_START: Start the specified counter sets on a CPU list. The + * counter set keeps running until explicitly stopped. Returns the number + * of bytes needed to store the counter values. If another S390_HWCTR_START + * ioctl subcommand is called without a previous S390_HWCTR_STOP stop + * command, -EBUSY is returned. + * S390_HWCTR_READ: Read the counter set values from specified CPU list given + * with the S390_HWCTR_START command. + * S390_HWCTR_STOP: Stops the counter sets on the CPU list given with the + * previous S390_HWCTR_START subcommand. + */ +static long cfset_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + int ret; + + get_online_cpus(); + mutex_lock(&cfset_ctrset_mutex); + switch (cmd) { + case S390_HWCTR_START: + ret = cfset_ioctl_start(arg); + break; + case S390_HWCTR_STOP: + ret = cfset_ioctl_stop(); + break; + case S390_HWCTR_READ: + ret = cfset_ioctl_read(arg); + break; + default: + ret = -ENOTTY; + break; + } + mutex_unlock(&cfset_ctrset_mutex); + put_online_cpus(); + return ret; +} + +static const struct file_operations cfset_fops = { + .owner = THIS_MODULE, + .open = cfset_open, + .release = cfset_release, + .unlocked_ioctl = cfset_ioctl, + .compat_ioctl = cfset_ioctl, + .llseek = no_llseek +}; + +static struct miscdevice cfset_dev = { + .name = S390_HWCTR_DEVICE, + .minor = MISC_DYNAMIC_MINOR, + .fops = &cfset_fops, +}; + +int cfset_online_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + + mutex_lock(&cfset_ctrset_mutex); + if (cfset_request.ctrset) { + p.sets = cfset_request.ctrset; + cfset_ioctl_on(&p); + cpumask_set_cpu(cpu, &cfset_request.mask); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +int cfset_offline_cpu(unsigned int cpu) +{ + struct cfset_call_on_cpu_parm p; + + mutex_lock(&cfset_ctrset_mutex); + if (cfset_request.ctrset) { + p.sets = cfset_request.ctrset; + cfset_ioctl_off(&p); + cpumask_clear_cpu(cpu, &cfset_request.mask); + } + mutex_unlock(&cfset_ctrset_mutex); + return 0; +} + +static void cfdiag_read(struct perf_event *event) +{ + debug_sprintf_event(cf_dbg, 3, "%s event %#llx count %ld\n", __func__, + event->attr.config, local64_read(&event->count)); +} + +static int get_authctrsets(void) +{ + struct cpu_cf_events *cpuhw; + unsigned long auth = 0; + enum cpumf_ctr_set i; + + cpuhw = &get_cpu_var(cpu_cf_events); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) + auth |= cpumf_ctr_ctl[i]; + } + put_cpu_var(cpu_cf_events); + return auth; +} + +/* Setup the event. Test for authorized counter sets and only include counter + * sets which are authorized at the time of the setup. Including unauthorized + * counter sets result in specification exception (and panic). + */ +static int cfdiag_event_init2(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = 0; + + /* Set sample_period to indicate sampling */ + event->hw.config = attr->config; + event->hw.sample_period = attr->sample_period; + local64_set(&event->hw.period_left, event->hw.sample_period); + local64_set(&event->count, 0); + event->hw.last_period = event->hw.sample_period; + + /* Add all authorized counter sets to config_base. The + * the hardware init function is either called per-cpu or just once + * for all CPUS (event->cpu == -1). This depends on the whether + * counting is started for all CPUs or on a per workload base where + * the perf event moves from one CPU to another CPU. + * Checking the authorization on any CPU is fine as the hardware + * applies the same authorization settings to all CPUs. + */ + event->hw.config_base = get_authctrsets(); + + /* No authorized counter sets, nothing to count/sample */ + if (!event->hw.config_base) + err = -EINVAL; + + debug_sprintf_event(cf_dbg, 5, "%s err %d config_base %#lx\n", + __func__, err, event->hw.config_base); + return err; +} + +static int cfdiag_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + int err = -ENOENT; + + if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || + event->attr.type != event->pmu->type) + goto out; + + /* Raw events are used to access counters directly, + * hence do not permit excludes. + * This event is useless without PERF_SAMPLE_RAW to return counter set + * values as raw data. + */ + if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || + !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { + err = -EOPNOTSUPP; + goto out; + } + + /* Initialize for using the CPU-measurement counter facility */ + cpumf_hw_inuse(); + event->destroy = hw_perf_event_destroy; + + err = cfdiag_event_init2(event); + if (unlikely(err)) + event->destroy(event); +out: + return err; +} + +/* Create cf_diag/events/CF_DIAG event sysfs file. This counter is used + * to collect the complete counter sets for a scheduled process. Target + * are complete counter sets attached as raw data to the artificial event. + * This results in complete counter sets available when a process is + * scheduled. Contains the delta of every counter while the process was + * running. + */ +CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); + +static struct attribute *cfdiag_events_attr[] = { + CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), + NULL, +}; + +PMU_FORMAT_ATTR(event, "config:0-63"); + +static struct attribute *cfdiag_format_attr[] = { + &format_attr_event.attr, + NULL, +}; + +static struct attribute_group cfdiag_events_group = { + .name = "events", + .attrs = cfdiag_events_attr, +}; +static struct attribute_group cfdiag_format_group = { + .name = "format", + .attrs = cfdiag_format_attr, +}; +static const struct attribute_group *cfdiag_attr_groups[] = { + &cfdiag_events_group, + &cfdiag_format_group, + NULL, +}; + +/* Performance monitoring unit for event CF_DIAG. Since this event + * is also started and stopped via the perf_event_open() system call, use + * the same event enable/disable call back functions. They do not + * have a pointer to the perf_event strcture as first parameter. + * + * The functions XXX_add, XXX_del, XXX_start and XXX_stop are also common. + * Reuse them and distinguish the event (always first parameter) via + * 'config' member. + */ +static struct pmu cf_diag = { + .task_ctx_nr = perf_sw_context, + .event_init = cfdiag_event_init, + .pmu_enable = cpumf_pmu_enable, + .pmu_disable = cpumf_pmu_disable, + .add = cpumf_pmu_add, + .del = cpumf_pmu_del, + .start = cpumf_pmu_start, + .stop = cpumf_pmu_stop, + .read = cfdiag_read, + + .attr_groups = cfdiag_attr_groups +}; + +/* Calculate memory needed to store all counter sets together with header and + * trailer data. This is independent of the counter set authorization which + * can vary depending on the configuration. + */ +static size_t cfdiag_maxsize(struct cpumf_ctr_info *info) +{ + size_t max_size = sizeof(struct cf_trailer_entry); + enum cpumf_ctr_set i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + size_t size = cpum_cf_ctrset_size(i, info); + + if (size) + max_size += size * sizeof(u64) + + sizeof(struct cf_ctrset_entry); + } + return max_size; +} + +/* Get the CPU speed, try sampling facility first and CPU attributes second. */ +static void cfdiag_get_cpu_speed(void) +{ + if (cpum_sf_avail()) { /* Sampling facility first */ + struct hws_qsi_info_block si; + + memset(&si, 0, sizeof(si)); + if (!qsi(&si)) { + cfdiag_cpu_speed = si.cpu_speed; + return; + } + } + + /* Fallback: CPU speed extract static part. Used in case + * CPU Measurement Sampling Facility is turned off. + */ + if (test_facility(34)) { + unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); + + if (mhz != -1UL) + cfdiag_cpu_speed = mhz & 0xffffffff; + } +} + +static int cfset_init(void) +{ + struct cpumf_ctr_info info; + size_t need; + int rc; + + if (qctri(&info)) + return -ENODEV; + + cfdiag_get_cpu_speed(); + /* Make sure the counter set data fits into predefined buffer. */ + need = cfdiag_maxsize(&info); + if (need > sizeof(((struct cpu_cf_events *)0)->start)) { + pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", + need); + return -ENOMEM; + } + + rc = misc_register(&cfset_dev); + if (rc) { + pr_err("Registration of /dev/%s failed rc=%i\n", + cfset_dev.name, rc); + goto out; + } + + rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); + if (rc) { + misc_deregister(&cfset_dev); + pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", + rc); + } +out: + return rc; +} + +device_initcall(cpumf_pmu_init); diff --git a/arch/s390/kernel/perf_cpum_cf_common.c b/arch/s390/kernel/perf_cpum_cf_common.c index 2300fbaac556..30f0242de4a5 100644 --- a/arch/s390/kernel/perf_cpum_cf_common.c +++ b/arch/s390/kernel/perf_cpum_cf_common.c @@ -29,7 +29,11 @@ DEFINE_PER_CPU(struct cpu_cf_events, cpu_cf_events) = { }, .alert = ATOMIC64_INIT(0), .state = 0, + .dev_state = 0, .flags = 0, + .used = 0, + .usedss = 0, + .sets = 0 }; /* Indicator whether the CPU-Measurement Counter Facility Support is ready */ static bool cpum_cf_initalized; @@ -96,25 +100,10 @@ bool kernel_cpumcf_avail(void) } EXPORT_SYMBOL(kernel_cpumcf_avail); - -/* Reserve/release functions for sharing perf hardware */ -static DEFINE_SPINLOCK(cpumcf_owner_lock); -static void *cpumcf_owner; - /* Initialize the CPU-measurement counter facility */ int __kernel_cpumcf_begin(void) { int flags = PMC_INIT; - int err = 0; - - spin_lock(&cpumcf_owner_lock); - if (cpumcf_owner) - err = -EBUSY; - else - cpumcf_owner = __builtin_return_address(0); - spin_unlock(&cpumcf_owner_lock); - if (err) - return err; on_each_cpu(cpum_cf_setup_cpu, &flags, 1); irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT); @@ -144,10 +133,6 @@ void __kernel_cpumcf_end(void) on_each_cpu(cpum_cf_setup_cpu, &flags, 1); irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT); - - spin_lock(&cpumcf_owner_lock); - cpumcf_owner = NULL; - spin_unlock(&cpumcf_owner_lock); } EXPORT_SYMBOL(__kernel_cpumcf_end); @@ -161,11 +146,13 @@ static int cpum_cf_setup(unsigned int cpu, int flags) static int cpum_cf_online_cpu(unsigned int cpu) { - return cpum_cf_setup(cpu, PMC_INIT); + cpum_cf_setup(cpu, PMC_INIT); + return cfset_online_cpu(cpu); } static int cpum_cf_offline_cpu(unsigned int cpu) { + cfset_offline_cpu(cpu); return cpum_cf_setup(cpu, PMC_RELEASE); } diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c deleted file mode 100644 index 08c985c1097c..000000000000 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ /dev/null @@ -1,1148 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Performance event support for s390x - CPU-measurement Counter Sets - * - * Copyright IBM Corp. 2019, 2021 - * Author(s): Hendrik Brueckner <brueckner@linux.ibm.com> - * Thomas Richer <tmricht@linux.ibm.com> - */ -#define KMSG_COMPONENT "cpum_cf_diag" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - -#include <linux/kernel.h> -#include <linux/kernel_stat.h> -#include <linux/percpu.h> -#include <linux/notifier.h> -#include <linux/init.h> -#include <linux/export.h> -#include <linux/slab.h> -#include <linux/processor.h> -#include <linux/miscdevice.h> -#include <linux/mutex.h> - -#include <asm/ctl_reg.h> -#include <asm/irq.h> -#include <asm/cpu_mcf.h> -#include <asm/timex.h> -#include <asm/debug.h> - -#include <asm/hwctrset.h> - -#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ - /* interval in seconds */ -static unsigned int cf_diag_cpu_speed; -static debug_info_t *cf_diag_dbg; - -struct cf_diag_csd { /* Counter set data per CPU */ - size_t used; /* Bytes used in data/start */ - unsigned char start[PAGE_SIZE]; /* Counter set at event start */ - unsigned char data[PAGE_SIZE]; /* Counter set at event delete */ - unsigned int sets; /* # Counter set saved in data */ -}; -static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd); - -/* Counter sets are stored as data stream in a page sized memory buffer and - * exported to user space via raw data attached to the event sample data. - * Each counter set starts with an eight byte header consisting of: - * - a two byte eye catcher (0xfeef) - * - a one byte counter set number - * - a two byte counter set size (indicates the number of counters in this set) - * - a three byte reserved value (must be zero) to make the header the same - * size as a counter value. - * All counter values are eight byte in size. - * - * All counter sets are followed by a 64 byte trailer. - * The trailer consists of a: - * - flag field indicating valid fields when corresponding bit set - * - the counter facility first and second version number - * - the CPU speed if nonzero - * - the time stamp the counter sets have been collected - * - the time of day (TOD) base value - * - the machine type. - * - * The counter sets are saved when the process is prepared to be executed on a - * CPU and saved again when the process is going to be removed from a CPU. - * The difference of both counter sets are calculated and stored in the event - * sample data area. - */ - -struct cf_ctrset_entry { /* CPU-M CF counter set entry (8 byte) */ - unsigned int def:16; /* 0-15 Data Entry Format */ - unsigned int set:16; /* 16-31 Counter set identifier */ - unsigned int ctr:16; /* 32-47 Number of stored counters */ - unsigned int res1:16; /* 48-63 Reserved */ -}; - -struct cf_trailer_entry { /* CPU-M CF_DIAG trailer (64 byte) */ - /* 0 - 7 */ - union { - struct { - unsigned int clock_base:1; /* TOD clock base set */ - unsigned int speed:1; /* CPU speed set */ - /* Measurement alerts */ - unsigned int mtda:1; /* Loss of MT ctr. data alert */ - unsigned int caca:1; /* Counter auth. change alert */ - unsigned int lcda:1; /* Loss of counter data alert */ - }; - unsigned long flags; /* 0-63 All indicators */ - }; - /* 8 - 15 */ - unsigned int cfvn:16; /* 64-79 Ctr First Version */ - unsigned int csvn:16; /* 80-95 Ctr Second Version */ - unsigned int cpu_speed:32; /* 96-127 CPU speed */ - /* 16 - 23 */ - unsigned long timestamp; /* 128-191 Timestamp (TOD) */ - /* 24 - 55 */ - union { - struct { - unsigned long progusage1; - unsigned long progusage2; - unsigned long progusage3; - unsigned long tod_base; - }; - unsigned long progusage[4]; - }; - /* 56 - 63 */ - unsigned int mach_type:16; /* Machine type */ - unsigned int res1:16; /* Reserved */ - unsigned int res2:32; /* Reserved */ -}; - -/* Create the trailer data at the end of a page. */ -static void cf_diag_trailer(struct cf_trailer_entry *te) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cpuid cpuid; - - te->cfvn = cpuhw->info.cfvn; /* Counter version numbers */ - te->csvn = cpuhw->info.csvn; - - get_cpu_id(&cpuid); /* Machine type */ - te->mach_type = cpuid.machine; - te->cpu_speed = cf_diag_cpu_speed; - if (te->cpu_speed) - te->speed = 1; - te->clock_base = 1; /* Save clock base */ - te->tod_base = tod_clock_base.tod; - te->timestamp = get_tod_clock_fast(); -} - -/* - * Change the CPUMF state to active. - * Enable and activate the CPU-counter sets according - * to the per-cpu control state. - */ -static void cf_diag_enable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (cpuhw->flags & PMU_F_ENABLED) - return; - - err = lcctl(cpuhw->state); - if (err) { - pr_err("Enabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags |= PMU_F_ENABLED; -} - -/* - * Change the CPUMF state to inactive. - * Disable and enable (inactive) the CPU-counter sets according - * to the per-cpu control state. - */ -static void cf_diag_disable(struct pmu *pmu) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - u64 inactive; - int err; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s pmu %p cpu %d flags %#x state %#llx\n", - __func__, pmu, smp_processor_id(), cpuhw->flags, - cpuhw->state); - if (!(cpuhw->flags & PMU_F_ENABLED)) - return; - - inactive = cpuhw->state & ~((1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1); - err = lcctl(inactive); - if (err) { - pr_err("Disabling the performance measuring unit " - "failed with rc=%x\n", err); - return; - } - cpuhw->flags &= ~PMU_F_ENABLED; -} - -/* Number of perf events counting hardware events */ -static atomic_t cf_diag_events = ATOMIC_INIT(0); -/* Used to avoid races in calling reserve/release_cpumf_hardware */ -static DEFINE_MUTEX(cf_diag_reserve_mutex); - -/* Release the PMU if event is the last perf event */ -static void cf_diag_perf_event_destroy(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d cf_diag_events %d\n", - __func__, event, smp_processor_id(), - atomic_read(&cf_diag_events)); - if (atomic_dec_return(&cf_diag_events) == 0) - __kernel_cpumcf_end(); -} - -static int get_authctrsets(void) -{ - struct cpu_cf_events *cpuhw; - unsigned long auth = 0; - enum cpumf_ctr_set i; - - cpuhw = &get_cpu_var(cpu_cf_events); - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) - auth |= cpumf_ctr_ctl[i]; - } - put_cpu_var(cpu_cf_events); - return auth; -} - -/* Setup the event. Test for authorized counter sets and only include counter - * sets which are authorized at the time of the setup. Including unauthorized - * counter sets result in specification exception (and panic). - */ -static int __hw_perf_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__, - event, event->cpu); - - event->hw.config = attr->config; - - /* Add all authorized counter sets to config_base. The - * the hardware init function is either called per-cpu or just once - * for all CPUS (event->cpu == -1). This depends on the whether - * counting is started for all CPUs or on a per workload base where - * the perf event moves from one CPU to another CPU. - * Checking the authorization on any CPU is fine as the hardware - * applies the same authorization settings to all CPUs. - */ - event->hw.config_base = get_authctrsets(); - - /* No authorized counter sets, nothing to count/sample */ - if (!event->hw.config_base) { - err = -EINVAL; - goto out; - } - - /* Set sample_period to indicate sampling */ - event->hw.sample_period = attr->sample_period; - local64_set(&event->hw.period_left, event->hw.sample_period); - event->hw.last_period = event->hw.sample_period; -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d config_base %#lx\n", - __func__, err, event->hw.config_base); - return err; -} - -/* Return 0 if the CPU-measurement counter facility is currently free - * and an error otherwise. - */ -static int cf_diag_perf_event_inuse(void) -{ - int err = 0; - - if (!atomic_inc_not_zero(&cf_diag_events)) { - mutex_lock(&cf_diag_reserve_mutex); - if (atomic_read(&cf_diag_events) == 0 && - __kernel_cpumcf_begin()) - err = -EBUSY; - else - err = atomic_inc_return(&cf_diag_events); - mutex_unlock(&cf_diag_reserve_mutex); - } - return err; -} - -static int cf_diag_event_init(struct perf_event *event) -{ - struct perf_event_attr *attr = &event->attr; - int err = -ENOENT; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d config %#llx type:%u " - "sample_type %#llx cf_diag_events %d\n", __func__, - event, event->cpu, attr->config, event->pmu->type, - attr->sample_type, atomic_read(&cf_diag_events)); - - if (event->attr.config != PERF_EVENT_CPUM_CF_DIAG || - event->attr.type != event->pmu->type) - goto out; - - /* Raw events are used to access counters directly, - * hence do not permit excludes. - * This event is usesless without PERF_SAMPLE_RAW to return counter set - * values as raw data. - */ - if (attr->exclude_kernel || attr->exclude_user || attr->exclude_hv || - !(attr->sample_type & (PERF_SAMPLE_CPU | PERF_SAMPLE_RAW))) { - err = -EOPNOTSUPP; - goto out; - } - - /* Initialize for using the CPU-measurement counter facility */ - err = cf_diag_perf_event_inuse(); - if (err < 0) - goto out; - event->destroy = cf_diag_perf_event_destroy; - - err = __hw_perf_event_init(event); - if (unlikely(err)) - event->destroy(event); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_read(struct perf_event *event) -{ - debug_sprintf_event(cf_diag_dbg, 5, "%s event %p\n", __func__, event); -} - -/* Calculate memory needed to store all counter sets together with header and - * trailer data. This is independend of the counter set authorization which - * can vary depending on the configuration. - */ -static size_t cf_diag_ctrset_maxsize(struct cpumf_ctr_info *info) -{ - size_t max_size = sizeof(struct cf_trailer_entry); - enum cpumf_ctr_set i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - size_t size = cpum_cf_ctrset_size(i, info); - - if (size) - max_size += size * sizeof(u64) + - sizeof(struct cf_ctrset_entry); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s max_size %zu\n", __func__, - max_size); - - return max_size; -} - -/* Read a counter set. The counter set number determines which counter set and - * the CPUM-CF first and second version number determine the number of - * available counters in this counter set. - * Each counter set starts with header containing the counter set number and - * the number of 8 byte counters. - * - * The functions returns the number of bytes occupied by this counter set - * including the header. - * If there is no counter in the counter set, this counter set is useless and - * zero is returned on this case. - */ -static size_t cf_diag_getctrset(struct cf_ctrset_entry *ctrdata, int ctrset, - size_t room) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - size_t ctrset_size, need = 0; - int rc = 3; /* Assume write failure */ - - ctrdata->def = CF_DIAG_CTRSET_DEF; - ctrdata->set = ctrset; - ctrdata->res1 = 0; - ctrset_size = cpum_cf_ctrset_size(ctrset, &cpuhw->info); - - if (ctrset_size) { /* Save data */ - need = ctrset_size * sizeof(u64) + sizeof(*ctrdata); - if (need <= room) - rc = ctr_stcctm(ctrset, ctrset_size, - (u64 *)(ctrdata + 1)); - if (rc != 3) - ctrdata->ctr = ctrset_size; - else - need = 0; - } - - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d ctrset_size %zu cfvn %d csvn %d" - " need %zd rc %d\n", - __func__, ctrset, ctrset_size, cpuhw->info.cfvn, - cpuhw->info.csvn, need, rc); - return need; -} - -/* Read out all counter sets and save them in the provided data buffer. - * The last 64 byte host an artificial trailer entry. - */ -static size_t cf_diag_getctr(void *data, size_t sz, unsigned long auth) -{ - struct cf_trailer_entry *trailer; - size_t offset = 0, done; - int i; - - memset(data, 0, sz); - sz -= sizeof(*trailer); /* Always room for trailer */ - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - struct cf_ctrset_entry *ctrdata = data + offset; - - if (!(auth & cpumf_ctr_ctl[i])) - continue; /* Counter set not authorized */ - - done = cf_diag_getctrset(ctrdata, i, sz - offset); - offset += done; - debug_sprintf_event(cf_diag_dbg, 6, - "%s ctrset %d offset %zu done %zu\n", - __func__, i, offset, done); - } - trailer = data + offset; - cf_diag_trailer(trailer); - return offset + sizeof(*trailer); -} - -/* Calculate the difference for each counter in a counter set. */ -static void cf_diag_diffctrset(u64 *pstart, u64 *pstop, int counters) -{ - for (; --counters >= 0; ++pstart, ++pstop) - if (*pstop >= *pstart) - *pstop -= *pstart; - else - *pstop = *pstart - *pstop; -} - -/* Scan the counter sets and calculate the difference of each counter - * in each set. The result is the increment of each counter during the - * period the counter set has been activated. - * - * Return true on success. - */ -static int cf_diag_diffctr(struct cf_diag_csd *csd, unsigned long auth) -{ - struct cf_trailer_entry *trailer_start, *trailer_stop; - struct cf_ctrset_entry *ctrstart, *ctrstop; - size_t offset = 0; - - auth &= (1 << CPUMF_LCCTL_ENABLE_SHIFT) - 1; - do { - ctrstart = (struct cf_ctrset_entry *)(csd->start + offset); - ctrstop = (struct cf_ctrset_entry *)(csd->data + offset); - - if (memcmp(ctrstop, ctrstart, sizeof(*ctrstop))) { - pr_err("cpum_cf_diag counter set compare error " - "in set %i\n", ctrstart->set); - return 0; - } - auth &= ~cpumf_ctr_ctl[ctrstart->set]; - if (ctrstart->def == CF_DIAG_CTRSET_DEF) { - cf_diag_diffctrset((u64 *)(ctrstart + 1), - (u64 *)(ctrstop + 1), ctrstart->ctr); - offset += ctrstart->ctr * sizeof(u64) + - sizeof(*ctrstart); - } - debug_sprintf_event(cf_diag_dbg, 6, - "%s set %d ctr %d offset %zu auth %lx\n", - __func__, ctrstart->set, ctrstart->ctr, - offset, auth); - } while (ctrstart->def && auth); - - /* Save time_stamp from start of event in stop's trailer */ - trailer_start = (struct cf_trailer_entry *)(csd->start + offset); - trailer_stop = (struct cf_trailer_entry *)(csd->data + offset); - trailer_stop->progusage[0] = trailer_start->timestamp; - - return 1; -} - -/* Create perf event sample with the counter sets as raw data. The sample - * is then pushed to the event subsystem and the function checks for - * possible event overflows. If an event overflow occurs, the PMU is - * stopped. - * - * Return non-zero if an event overflow occurred. - */ -static int cf_diag_push_sample(struct perf_event *event, - struct cf_diag_csd *csd) -{ - struct perf_sample_data data; - struct perf_raw_record raw; - struct pt_regs regs; - int overflow; - - /* Setup perf sample */ - perf_sample_data_init(&data, 0, event->hw.last_period); - memset(®s, 0, sizeof(regs)); - memset(&raw, 0, sizeof(raw)); - - if (event->attr.sample_type & PERF_SAMPLE_CPU) - data.cpu_entry.cpu = event->cpu; - if (event->attr.sample_type & PERF_SAMPLE_RAW) { - raw.frag.size = csd->used; - raw.frag.data = csd->data; - raw.size = csd->used; - data.raw = &raw; - } - - overflow = perf_event_overflow(event, &data, ®s); - debug_sprintf_event(cf_diag_dbg, 6, - "%s event %p cpu %d sample_type %#llx raw %d " - "ov %d\n", __func__, event, event->cpu, - event->attr.sample_type, raw.size, overflow); - if (overflow) - event->pmu->stop(event, 0); - - perf_event_update_userpage(event); - return overflow; -} - -static void cf_diag_start(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) - return; - - /* (Re-)enable and activate all counter sets */ - lcctl(0); /* Reset counter sets */ - hwc->state = 0; - ctr_set_multiple_enable(&cpuhw->state, hwc->config_base); - lcctl(cpuhw->state); /* Enable counter sets */ - csd->used = cf_diag_getctr(csd->start, sizeof(csd->start), - event->hw.config_base); - ctr_set_multiple_start(&cpuhw->state, hwc->config_base); - /* Function cf_diag_enable() starts the counter sets. */ -} - -static void cf_diag_stop(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct hw_perf_event *hwc = &event->hw; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x hwc-state %#x\n", - __func__, event, event->cpu, flags, hwc->state); - - /* Deactivate all counter sets */ - ctr_set_multiple_stop(&cpuhw->state, hwc->config_base); - local64_inc(&event->count); - csd->used = cf_diag_getctr(csd->data, sizeof(csd->data), - event->hw.config_base); - if (cf_diag_diffctr(csd, event->hw.config_base)) - cf_diag_push_sample(event, csd); - hwc->state |= PERF_HES_STOPPED; -} - -static int cf_diag_add(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - int err = 0; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x cpuhw %p\n", - __func__, event, event->cpu, flags, cpuhw); - - if (cpuhw->flags & PMU_F_IN_USE) { - err = -EAGAIN; - goto out; - } - - event->hw.state = PERF_HES_UPTODATE | PERF_HES_STOPPED; - - cpuhw->flags |= PMU_F_IN_USE; - if (flags & PERF_EF_START) - cf_diag_start(event, PERF_EF_RELOAD); -out: - debug_sprintf_event(cf_diag_dbg, 5, "%s err %d\n", __func__, err); - return err; -} - -static void cf_diag_del(struct perf_event *event, int flags) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 5, - "%s event %p cpu %d flags %#x\n", - __func__, event, event->cpu, flags); - - cf_diag_stop(event, PERF_EF_UPDATE); - ctr_set_multiple_stop(&cpuhw->state, event->hw.config_base); - ctr_set_multiple_disable(&cpuhw->state, event->hw.config_base); - cpuhw->flags &= ~PMU_F_IN_USE; -} - -/* Default counter set events and format attribute groups */ - -CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); - -static struct attribute *cf_diag_events_attr[] = { - CPUMF_EVENT_PTR(CF_DIAG, CF_DIAG), - NULL, -}; - -PMU_FORMAT_ATTR(event, "config:0-63"); - -static struct attribute *cf_diag_format_attr[] = { - &format_attr_event.attr, - NULL, -}; - -static struct attribute_group cf_diag_events_group = { - .name = "events", - .attrs = cf_diag_events_attr, -}; -static struct attribute_group cf_diag_format_group = { - .name = "format", - .attrs = cf_diag_format_attr, -}; -static const struct attribute_group *cf_diag_attr_groups[] = { - &cf_diag_events_group, - &cf_diag_format_group, - NULL, -}; - -/* Performance monitoring unit for s390x */ -static struct pmu cf_diag = { - .task_ctx_nr = perf_sw_context, - .pmu_enable = cf_diag_enable, - .pmu_disable = cf_diag_disable, - .event_init = cf_diag_event_init, - .add = cf_diag_add, - .del = cf_diag_del, - .start = cf_diag_start, - .stop = cf_diag_stop, - .read = cf_diag_read, - - .attr_groups = cf_diag_attr_groups -}; - -/* Get the CPU speed, try sampling facility first and CPU attributes second. */ -static void cf_diag_get_cpu_speed(void) -{ - if (cpum_sf_avail()) { /* Sampling facility first */ - struct hws_qsi_info_block si; - - memset(&si, 0, sizeof(si)); - if (!qsi(&si)) { - cf_diag_cpu_speed = si.cpu_speed; - return; - } - } - - if (test_facility(34)) { /* CPU speed extract static part */ - unsigned long mhz = __ecag(ECAG_CPU_ATTRIBUTE, 0); - - if (mhz != -1UL) - cf_diag_cpu_speed = mhz & 0xffffffff; - } -} - -/* Code to create device and file I/O operations */ -static atomic_t ctrset_opencnt = ATOMIC_INIT(0); /* Excl. access */ - -static int cf_diag_open(struct inode *inode, struct file *file) -{ - int err = 0; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - if (atomic_xchg(&ctrset_opencnt, 1)) - return -EBUSY; - - /* Avoid concurrent access with perf_event_open() system call */ - mutex_lock(&cf_diag_reserve_mutex); - if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin()) - err = -EBUSY; - mutex_unlock(&cf_diag_reserve_mutex); - if (err) { - atomic_set(&ctrset_opencnt, 0); - return err; - } - file->private_data = NULL; - debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); - /* nonseekable_open() never fails */ - return nonseekable_open(inode, file); -} - -/* Variables for ioctl() interface support */ -static DEFINE_MUTEX(cf_diag_ctrset_mutex); -static struct cf_diag_ctrset { - unsigned long ctrset; /* Bit mask of counter set to read */ - cpumask_t mask; /* CPU mask to read from */ -} cf_diag_ctrset; - -static void cf_diag_ctrset_clear(void) -{ - cpumask_clear(&cf_diag_ctrset.mask); - cf_diag_ctrset.ctrset = 0; -} - -static void cf_diag_release_cpu(void *p) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - - debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__, - smp_processor_id()); - lcctl(0); /* Reset counter sets */ - cpuhw->state = 0; /* Save state in CPU hardware state */ -} - -/* Release function is also called when application gets terminated without - * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. - * Since only one application is allowed to open the device, simple stop all - * CPU counter sets. - */ -static int cf_diag_release(struct inode *inode, struct file *file) -{ - on_each_cpu(cf_diag_release_cpu, NULL, 1); - cf_diag_ctrset_clear(); - atomic_set(&ctrset_opencnt, 0); - __kernel_cpumcf_end(); - debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); - return 0; -} - -struct cf_diag_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ - unsigned int sets; /* Counter set bit mask */ - atomic_t cpus_ack; /* # CPUs successfully executed func */ -}; - -static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask) -{ - struct s390_ctrset_read __user *ctrset_read; - unsigned int cpu, cpus, rc; - void __user *uptr; - - ctrset_read = (struct s390_ctrset_read __user *)arg; - uptr = ctrset_read->data; - for_each_cpu(cpu, mask) { - struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu); - struct s390_ctrset_cpudata __user *ctrset_cpudata; - - ctrset_cpudata = uptr; - debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n", - __func__, cpu, csd->used); - rc = put_user(cpu, &ctrset_cpudata->cpu_nr); - rc |= put_user(csd->sets, &ctrset_cpudata->no_sets); - rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used); - if (rc) - return -EFAULT; - uptr += sizeof(struct s390_ctrset_cpudata) + csd->used; - cond_resched(); - } - cpus = cpumask_weight(mask); - if (put_user(cpus, &ctrset_read->no_cpus)) - return -EFAULT; - debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n", - __func__, uptr - (void __user *)ctrset_read->data); - return 0; -} - -static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, - int ctrset_size, size_t room) -{ - size_t need = 0; - int rc = -1; - - need = sizeof(*p) + sizeof(u64) * ctrset_size; - debug_sprintf_event(cf_diag_dbg, 5, - "%s room %zd need %zd set %#x set_size %d\n", - __func__, room, need, ctrset, ctrset_size); - if (need <= room) { - p->set = cpumf_ctr_ctl[ctrset]; - p->no_cnts = ctrset_size; - rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); - if (rc == 3) /* Nothing stored */ - need = 0; - } - debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__, - need, rc); - return need; -} - -/* Read all counter sets. Since the perf_event_open() system call with - * event cpum_cf_diag/.../ is blocked when this interface is active, reuse - * the perf_event_open() data buffer to store the counter sets. - */ -static void cf_diag_cpu_read(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); - struct cf_diag_call_on_cpu_parm *p = parm; - int set, set_size; - size_t space; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - /* No data saved yet */ - csd->used = 0; - csd->sets = 0; - memset(csd->data, 0, sizeof(csd->data)); - - /* Scan the counter sets */ - for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { - struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used; - - if (!(p->sets & cpumf_ctr_ctl[set])) - continue; /* Counter set not in list */ - set_size = cpum_cf_ctrset_size(set, &cpuhw->info); - space = sizeof(csd->data) - csd->used; - space = cf_diag_cpuset_read(sp, set, set_size, space); - if (space) { - csd->used += space; - csd->sets += 1; - } - debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n", - __func__, sp, space); - } - debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__, - csd->sets, csd->used); -} - -static int cf_diag_all_read(unsigned long arg) -{ - struct cf_diag_call_on_cpu_parm p; - cpumask_var_t mask; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - - p.sets = cf_diag_ctrset.ctrset; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1); - rc = cf_diag_all_copy(arg, mask); - free_cpumask_var(mask); - debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc); - return rc; -} - -/* Stop all counter sets via ioctl interface */ -static void cf_diag_ioctl_off(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_call_on_cpu_parm *p = parm; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - - ctr_set_multiple_disable(&cpuhw->state, p->sets); - ctr_set_multiple_stop(&cpuhw->state, p->sets); - rc = lcctl(cpuhw->state); /* Stop counter sets */ - if (!cpuhw->state) - cpuhw->flags &= ~PMU_F_IN_USE; - debug_sprintf_event(cf_diag_dbg, 5, - "%s rc %d flags %#x state %#llx\n", __func__, - rc, cpuhw->flags, cpuhw->state); -} - -/* Start counter sets on particular CPU */ -static void cf_diag_ioctl_on(void *parm) -{ - struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); - struct cf_diag_call_on_cpu_parm *p = parm; - int rc; - - debug_sprintf_event(cf_diag_dbg, 5, - "%s new %#x flags %#x state %#llx\n", - __func__, p->sets, cpuhw->flags, - cpuhw->state); - - if (!(cpuhw->flags & PMU_F_IN_USE)) - cpuhw->state = 0; - cpuhw->flags |= PMU_F_IN_USE; - rc = lcctl(cpuhw->state); /* Reset unused counter sets */ - ctr_set_multiple_enable(&cpuhw->state, p->sets); - ctr_set_multiple_start(&cpuhw->state, p->sets); - rc |= lcctl(cpuhw->state); /* Start counter sets */ - if (!rc) - atomic_inc(&p->cpus_ack); - debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n", - __func__, rc, cpuhw->state); -} - -static int cf_diag_all_stop(void) -{ - struct cf_diag_call_on_cpu_parm p = { - .sets = cf_diag_ctrset.ctrset, - }; - cpumask_var_t mask; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); - free_cpumask_var(mask); - return 0; -} - -static int cf_diag_all_start(void) -{ - struct cf_diag_call_on_cpu_parm p = { - .sets = cf_diag_ctrset.ctrset, - .cpus_ack = ATOMIC_INIT(0), - }; - cpumask_var_t mask; - int rc = 0; - - if (!alloc_cpumask_var(&mask, GFP_KERNEL)) - return -ENOMEM; - cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); - on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1); - if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { - on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); - rc = -EIO; - } - free_cpumask_var(mask); - return rc; -} - -/* Return the maximum required space for all possible CPUs in case one - * CPU will be onlined during the START, READ, STOP cycles. - * To find out the size of the counter sets, any one CPU will do. They - * all have the same counter sets. - */ -static size_t cf_diag_needspace(unsigned int sets) -{ - struct cpu_cf_events *cpuhw = get_cpu_ptr(&cpu_cf_events); - size_t bytes = 0; - int i; - - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { - if (!(sets & cpumf_ctr_ctl[i])) - continue; - bytes += cpum_cf_ctrset_size(i, &cpuhw->info) * sizeof(u64) + - sizeof(((struct s390_ctrset_setdata *)0)->set) + - sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); - } - bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * - (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + - sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); - debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__, - bytes); - put_cpu_ptr(&cpu_cf_events); - return bytes; -} - -static long cf_diag_ioctl_read(unsigned long arg) -{ - struct s390_ctrset_read read; - int ret = 0; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - if (copy_from_user(&read, (char __user *)arg, sizeof(read))) - return -EFAULT; - ret = cf_diag_all_read(arg); - debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); - return ret; -} - -static long cf_diag_ioctl_stop(void) -{ - int ret; - - debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); - ret = cf_diag_all_stop(); - cf_diag_ctrset_clear(); - debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); - return ret; -} - -static long cf_diag_ioctl_start(unsigned long arg) -{ - struct s390_ctrset_start __user *ustart; - struct s390_ctrset_start start; - void __user *umask; - unsigned int len; - int ret = 0; - size_t need; - - if (cf_diag_ctrset.ctrset) - return -EBUSY; - ustart = (struct s390_ctrset_start __user *)arg; - if (copy_from_user(&start, ustart, sizeof(start))) - return -EFAULT; - if (start.version != S390_HWCTR_START_VERSION) - return -EINVAL; - if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | - cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | - cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | - cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | - cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) - return -EINVAL; /* Invalid counter set */ - if (!start.counter_sets) - return -EINVAL; /* No counter set at all? */ - cpumask_clear(&cf_diag_ctrset.mask); - len = min_t(u64, start.cpumask_len, cpumask_size()); - umask = (void __user *)start.cpumask; - if (copy_from_user(&cf_diag_ctrset.mask, umask, len)) - return -EFAULT; - if (cpumask_empty(&cf_diag_ctrset.mask)) - return -EINVAL; - need = cf_diag_needspace(start.counter_sets); - if (put_user(need, &ustart->data_bytes)) - ret = -EFAULT; - if (ret) - goto out; - cf_diag_ctrset.ctrset = start.counter_sets; - ret = cf_diag_all_start(); -out: - if (ret) - cf_diag_ctrset_clear(); - debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n", - __func__, cf_diag_ctrset.ctrset, need, ret); - return ret; -} - -static long cf_diag_ioctl(struct file *file, unsigned int cmd, - unsigned long arg) -{ - int ret; - - debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__, - cmd, arg); - get_online_cpus(); - mutex_lock(&cf_diag_ctrset_mutex); - switch (cmd) { - case S390_HWCTR_START: - ret = cf_diag_ioctl_start(arg); - break; - case S390_HWCTR_STOP: - ret = cf_diag_ioctl_stop(); - break; - case S390_HWCTR_READ: - ret = cf_diag_ioctl_read(arg); - break; - default: - ret = -ENOTTY; - break; - } - mutex_unlock(&cf_diag_ctrset_mutex); - put_online_cpus(); - debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret); - return ret; -} - -static const struct file_operations cf_diag_fops = { - .owner = THIS_MODULE, - .open = cf_diag_open, - .release = cf_diag_release, - .unlocked_ioctl = cf_diag_ioctl, - .compat_ioctl = cf_diag_ioctl, - .llseek = no_llseek -}; - -static struct miscdevice cf_diag_dev = { - .name = S390_HWCTR_DEVICE, - .minor = MISC_DYNAMIC_MINOR, - .fops = &cf_diag_fops, -}; - -static int cf_diag_online_cpu(unsigned int cpu) -{ - struct cf_diag_call_on_cpu_parm p; - - mutex_lock(&cf_diag_ctrset_mutex); - if (!cf_diag_ctrset.ctrset) - goto out; - p.sets = cf_diag_ctrset.ctrset; - cf_diag_ioctl_on(&p); -out: - mutex_unlock(&cf_diag_ctrset_mutex); - return 0; -} - -static int cf_diag_offline_cpu(unsigned int cpu) -{ - struct cf_diag_call_on_cpu_parm p; - - mutex_lock(&cf_diag_ctrset_mutex); - if (!cf_diag_ctrset.ctrset) - goto out; - p.sets = cf_diag_ctrset.ctrset; - cf_diag_ioctl_off(&p); -out: - mutex_unlock(&cf_diag_ctrset_mutex); - return 0; -} - -/* Initialize the counter set PMU to generate complete counter set data as - * event raw data. This relies on the CPU Measurement Counter Facility device - * already being loaded and initialized. - */ -static int __init cf_diag_init(void) -{ - struct cpumf_ctr_info info; - size_t need; - int rc; - - if (!kernel_cpumcf_avail() || !stccm_avail() || qctri(&info)) - return -ENODEV; - cf_diag_get_cpu_speed(); - - /* Make sure the counter set data fits into predefined buffer. */ - need = cf_diag_ctrset_maxsize(&info); - if (need > sizeof(((struct cf_diag_csd *)0)->start)) { - pr_err("Insufficient memory for PMU(cpum_cf_diag) need=%zu\n", - need); - return -ENOMEM; - } - - rc = misc_register(&cf_diag_dev); - if (rc) { - pr_err("Registration of /dev/" S390_HWCTR_DEVICE - "failed rc=%d\n", rc); - goto out; - } - - /* Setup s390dbf facility */ - cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); - if (!cf_diag_dbg) { - pr_err("Registration of s390dbf(cpum_cf_diag) failed\n"); - rc = -ENOMEM; - goto out_dbf; - } - debug_register_view(cf_diag_dbg, &debug_sprintf_view); - - rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); - if (rc) { - pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", - rc); - goto out_perf; - } - rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE, - "perf/s390/cfd:online", - cf_diag_online_cpu, cf_diag_offline_cpu); - if (!rc) - goto out; - - pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n", - rc); - perf_pmu_unregister(&cf_diag); -out_perf: - debug_unregister_view(cf_diag_dbg, &debug_sprintf_view); - debug_unregister(cf_diag_dbg); -out_dbf: - misc_deregister(&cf_diag_dev); -out: - return rc; -} -device_initcall(cf_diag_init); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 7ae5dde9c54d..350e94d0cac2 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -166,6 +166,12 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, p->thread.acrs[1] = (unsigned int)tls; } } + /* + * s390 stores the svc return address in arch_data when calling + * sigreturn()/restart_syscall() via vdso. 1 means no valid address + * stored. + */ + p->restart_block.arch_data = 1; return 0; } diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 5486d82470b5..ff0f9e838916 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -354,7 +354,7 @@ void __init arch_call_rest_init(void) set_task_stack_end_magic(current); stack += STACK_INIT_OFFSET; S390_lowcore.kernel_stack = stack; - CALL_ON_STACK_NORETURN(rest_init, stack); + call_on_stack_noreturn(rest_init, stack); } static void __init setup_lowcore_dat_off(void) @@ -442,6 +442,7 @@ static void __init setup_lowcore_dat_off(void) lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; set_prefix((u32)(unsigned long) lc); lowcore_ptr[0] = lc; diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 080e7aed181f..78ef53b29958 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -32,6 +32,7 @@ #include <linux/uaccess.h> #include <asm/lowcore.h> #include <asm/switch_to.h> +#include <asm/vdso.h> #include "entry.h" /* @@ -171,7 +172,6 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); return 0; } @@ -334,15 +334,10 @@ static int setup_frame(int sig, struct k_sigaction *ka, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ka->sa.sa_flags & SA_RESTORER) { + if (ka->sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ka->sa.sa_restorer; - } else { - /* Signal frame without vector registers are short ! */ - __u16 __user *svc = (void __user *) frame + frame_size - 2; - if (__put_user(S390_SYSCALL_OPCODE | __NR_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, sigreturn); /* Set up registers for signal handler */ regs->gprs[14] = restorer; @@ -397,14 +392,10 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, /* Set up to return from userspace. If provided, use a stub already in userspace. */ - if (ksig->ka.sa.sa_flags & SA_RESTORER) { + if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = (unsigned long) ksig->ka.sa.sa_restorer; - } else { - __u16 __user *svc = &frame->svc_insn; - if (__put_user(S390_SYSCALL_OPCODE | __NR_rt_sigreturn, svc)) - return -EFAULT; - restorer = (unsigned long) svc; - } + else + restorer = VDSO64_SYMBOL(current, rt_sigreturn); /* Create siginfo on the signal stack */ if (copy_siginfo_to_user(&frame->info, &ksig->info)) @@ -501,7 +492,7 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) } /* No longer in a system call */ clear_pt_regs_flag(regs, PIF_SYSCALL); - clear_pt_regs_flag(regs, PIF_SYSCALL_RESTART); + rseq_signal_deliver(&ksig, regs); if (is_compat_task()) handle_signal32(&ksig, oldset, regs); @@ -517,14 +508,20 @@ void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ - regs->int_code = __NR_restart_syscall; - fallthrough; + regs->gprs[2] = regs->orig_gpr2; + current->restart_block.arch_data = regs->psw.addr; + if (is_compat_task()) + regs->psw.addr = VDSO32_SYMBOL(current, restart_syscall); + else + regs->psw.addr = VDSO64_SYMBOL(current, restart_syscall); + if (test_thread_flag(TIF_SINGLE_STEP)) + clear_thread_flag(TIF_PER_TRAP); + break; case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: - /* Restart system call with magic TIF bit. */ regs->gprs[2] = regs->orig_gpr2; - set_pt_regs_flag(regs, PIF_SYSCALL_RESTART); + regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); if (test_thread_flag(TIF_SINGLE_STEP)) clear_thread_flag(TIF_PER_TRAP); break; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index ff42d3aa0f00..8984711f72ed 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -210,6 +210,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu) lc->br_r1_trampoline = 0x07f1; /* br %r1 */ lc->return_lpswe = gen_lpswe(__LC_RETURN_PSW); lc->return_mcck_lpswe = gen_lpswe(__LC_RETURN_MCCK_PSW); + lc->preempt_count = PREEMPT_DISABLED; if (nmi_alloc_per_cpu(lc)) goto out; lowcore_ptr[cpu] = lc; @@ -300,24 +301,28 @@ static void pcpu_start_fn(struct pcpu *pcpu, void (*func)(void *), void *data) pcpu_sigp_retry(pcpu, SIGP_RESTART, 0); } +typedef void (pcpu_delegate_fn)(void *); + /* * Call function via PSW restart on pcpu and stop the current cpu. */ -static void __pcpu_delegate(void (*func)(void*), void *data) +static void __pcpu_delegate(pcpu_delegate_fn *func, void *data) { func(data); /* should not return */ } static void __no_sanitize_address pcpu_delegate(struct pcpu *pcpu, - void (*func)(void *), + pcpu_delegate_fn *func, void *data, unsigned long stack) { struct lowcore *lc = lowcore_ptr[pcpu - pcpu_devices]; unsigned long source_cpu = stap(); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - if (pcpu->address == source_cpu) - CALL_ON_STACK(__pcpu_delegate, stack, 2, func, data); + if (pcpu->address == source_cpu) { + call_on_stack(2, stack, void, __pcpu_delegate, + pcpu_delegate_fn *, func, void *, data); + } /* Stop target cpu (if func returns this stops the current cpu). */ pcpu_sigp_retry(pcpu, SIGP_STOP, 0); /* Restart func on the target cpu and stop the current cpu. */ @@ -898,7 +903,7 @@ static void __no_sanitize_address smp_start_secondary(void *cpuvoid) S390_lowcore.restart_source = -1UL; __ctl_load(S390_lowcore.cregs_save_area, 0, 15); __load_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT); - CALL_ON_STACK_NORETURN(smp_init_secondary, S390_lowcore.kernel_stack); + call_on_stack_noreturn(smp_init_secondary, S390_lowcore.kernel_stack); } /* Upping and downing of CPUs */ diff --git a/arch/s390/kernel/syscall.c b/arch/s390/kernel/syscall.c index 76f7916cc30f..8fe2d23b64f4 100644 --- a/arch/s390/kernel/syscall.c +++ b/arch/s390/kernel/syscall.c @@ -108,7 +108,7 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -void do_syscall(struct pt_regs *regs) +static void do_syscall(struct pt_regs *regs) { unsigned long nr; @@ -121,6 +121,10 @@ void do_syscall(struct pt_regs *regs) regs->gprs[2] = nr; + if (nr == __NR_restart_syscall && !(current->restart_block.arch_data & 1)) { + regs->psw.addr = current->restart_block.arch_data; + current->restart_block.arch_data = 1; + } nr = syscall_enter_from_user_mode_work(regs, nr); /* @@ -130,13 +134,16 @@ void do_syscall(struct pt_regs *regs) * work, the ptrace code sets PIF_SYSCALL_RET_SET, which is checked here * and if set, the syscall will be skipped. */ - if (!test_pt_regs_flag(regs, PIF_SYSCALL_RET_SET)) { - regs->gprs[2] = -ENOSYS; - if (likely(nr < NR_syscalls)) - regs->gprs[2] = current->thread.sys_call_table[nr](regs); - } else { - clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET); - } + + if (unlikely(test_and_clear_pt_regs_flag(regs, PIF_SYSCALL_RET_SET))) + goto out; + regs->gprs[2] = -ENOSYS; + if (likely(nr >= NR_syscalls)) + goto out; + do { + regs->gprs[2] = current->thread.sys_call_table[nr](regs); + } while (test_and_clear_pt_regs_flag(regs, PIF_EXECVE_PGSTE_RESTART)); +out: syscall_exit_to_user_mode_work(regs); } @@ -154,13 +161,8 @@ void noinstr __do_syscall(struct pt_regs *regs, int per_trap) if (per_trap) set_thread_flag(TIF_PER_TRAP); - for (;;) { - regs->flags = 0; - set_pt_regs_flag(regs, PIF_SYSCALL); - do_syscall(regs); - if (!test_pt_regs_flag(regs, PIF_SYSCALL_RESTART)) - break; - local_irq_enable(); - } + regs->flags = 0; + set_pt_regs_flag(regs, PIF_SYSCALL); + do_syscall(regs); exit_to_user_mode(); } diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 019c5748b607..76947275fe8b 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -277,6 +277,8 @@ static void __init test_monitor_call(void) { int val = 1; + if (!IS_ENABLED(CONFIG_BUG)) + return; asm volatile( " mc 0,0\n" "0: xgr %0,%0\n" @@ -299,10 +301,9 @@ static void (*pgm_check_table[128])(struct pt_regs *regs); void noinstr __do_pgm_check(struct pt_regs *regs) { unsigned long last_break = S390_lowcore.breaking_event_addr; - unsigned int trapnr, syscall_redirect = 0; + unsigned int trapnr; irqentry_state_t state; - add_random_kstack_offset(); regs->int_code = *(u32 *)&S390_lowcore.pgm_ilc; regs->int_parm_long = S390_lowcore.trans_exc_code; @@ -344,18 +345,9 @@ void noinstr __do_pgm_check(struct pt_regs *regs) trapnr = regs->int_code & PGM_INT_CODE_MASK; if (trapnr) pgm_check_table[trapnr](regs); - syscall_redirect = user_mode(regs) && test_pt_regs_flag(regs, PIF_SYSCALL); out: local_irq_disable(); irqentry_exit(regs, state); - - if (syscall_redirect) { - enter_from_user_mode(regs); - local_irq_enable(); - regs->orig_gpr2 = regs->gprs[2]; - do_syscall(regs); - exit_to_user_mode(); - } } /* diff --git a/arch/s390/kernel/uv.c b/arch/s390/kernel/uv.c index 6be2167943bb..aeb0a15bcbb7 100644 --- a/arch/s390/kernel/uv.c +++ b/arch/s390/kernel/uv.c @@ -358,6 +358,15 @@ static ssize_t uv_query_facilities(struct kobject *kobj, static struct kobj_attribute uv_query_facilities_attr = __ATTR(facilities, 0444, uv_query_facilities, NULL); +static ssize_t uv_query_feature_indications(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "%lx\n", uv_info.uv_feature_indications); +} + +static struct kobj_attribute uv_query_feature_indications_attr = + __ATTR(feature_indications, 0444, uv_query_feature_indications, NULL); + static ssize_t uv_query_max_guest_cpus(struct kobject *kobj, struct kobj_attribute *attr, char *page) { @@ -390,6 +399,7 @@ static struct kobj_attribute uv_query_max_guest_addr_attr = static struct attribute *uv_query_attrs[] = { &uv_query_facilities_attr.attr, + &uv_query_feature_indications_attr.attr, &uv_query_max_guest_cpus_attr.attr, &uv_query_max_guest_vms_attr.attr, &uv_query_max_guest_addr_attr.attr, diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 8c4e07d533c8..99694260cac9 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -20,7 +20,7 @@ #include <asm/vdso.h> extern char vdso64_start[], vdso64_end[]; -static unsigned int vdso_pages; +extern char vdso32_start[], vdso32_end[]; static struct vm_special_mapping vvar_mapping; @@ -37,18 +37,6 @@ enum vvar_pages { VVAR_NR_PAGES, }; -unsigned int __read_mostly vdso_enabled = 1; - -static int __init vdso_setup(char *str) -{ - bool enabled; - - if (!kstrtobool(str, &enabled)) - vdso_enabled = enabled; - return 1; -} -__setup("vdso=", vdso_setup); - #ifdef CONFIG_TIME_NS struct vdso_data *arch_get_vdso_data(void *vvar_page) { @@ -155,7 +143,12 @@ static struct vm_special_mapping vvar_mapping = { .fault = vvar_fault, }; -static struct vm_special_mapping vdso_mapping = { +static struct vm_special_mapping vdso64_mapping = { + .name = "[vdso]", + .mremap = vdso_mremap, +}; + +static struct vm_special_mapping vdso32_mapping = { .name = "[vdso]", .mremap = vdso_mremap, }; @@ -171,16 +164,22 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) { unsigned long vdso_text_len, vdso_mapping_len; unsigned long vvar_start, vdso_text_start; + struct vm_special_mapping *vdso_mapping; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int rc; BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); - if (!vdso_enabled || is_compat_task()) - return 0; if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_text_len = vdso_pages << PAGE_SHIFT; + + if (is_compat_task()) { + vdso_text_len = vdso32_end - vdso32_start; + vdso_mapping = &vdso32_mapping; + } else { + vdso_text_len = vdso64_end - vdso64_start; + vdso_mapping = &vdso64_mapping; + } vdso_mapping_len = vdso_text_len + VVAR_NR_PAGES * PAGE_SIZE; vvar_start = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); rc = vvar_start; @@ -198,7 +197,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) vma = _install_special_mapping(mm, vdso_text_start, vdso_text_len, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &vdso_mapping); + vdso_mapping); if (IS_ERR(vma)) { do_munmap(mm, vvar_start, PAGE_SIZE, NULL); rc = PTR_ERR(vma); @@ -211,21 +210,25 @@ out: return rc; } -static int __init vdso_init(void) +static struct page ** __init vdso_setup_pages(void *start, void *end) { - struct page **pages; + int pages = (end - start) >> PAGE_SHIFT; + struct page **pagelist; int i; - vdso_pages = (vdso64_end - vdso64_start) >> PAGE_SHIFT; - pages = kcalloc(vdso_pages + 1, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - vdso_enabled = 0; - return -ENOMEM; - } - for (i = 0; i < vdso_pages; i++) - pages[i] = virt_to_page(vdso64_start + i * PAGE_SIZE); - pages[vdso_pages] = NULL; - vdso_mapping.pages = pages; + pagelist = kcalloc(pages + 1, sizeof(struct page *), GFP_KERNEL); + if (!pagelist) + panic("%s: Cannot allocate page list for VDSO", __func__); + for (i = 0; i < pages; i++) + pagelist[i] = virt_to_page(start + i * PAGE_SIZE); + return pagelist; +} + +static int __init vdso_init(void) +{ + vdso64_mapping.pages = vdso_setup_pages(vdso64_start, vdso64_end); + if (IS_ENABLED(CONFIG_COMPAT)) + vdso32_mapping.pages = vdso_setup_pages(vdso32_start, vdso32_end); return 0; } arch_initcall(vdso_init); diff --git a/arch/s390/kernel/vdso32/.gitignore b/arch/s390/kernel/vdso32/.gitignore new file mode 100644 index 000000000000..5167384843b9 --- /dev/null +++ b/arch/s390/kernel/vdso32/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +vdso32.lds diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile new file mode 100644 index 000000000000..b2349a3f4fa3 --- /dev/null +++ b/arch/s390/kernel/vdso32/Makefile @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: GPL-2.0 +# List of files in the vdso + +KCOV_INSTRUMENT := n +ARCH_REL_TYPE_ABS := R_390_COPY|R_390_GLOB_DAT|R_390_JMP_SLOT|R_390_RELATIVE +ARCH_REL_TYPE_ABS += R_390_GOT|R_390_PLT + +include $(srctree)/lib/vdso/Makefile +obj-vdso32 = vdso_user_wrapper-32.o note-32.o + +# Build rules + +targets := $(obj-vdso32) vdso32.so vdso32.so.dbg +obj-vdso32 := $(addprefix $(obj)/, $(obj-vdso32)) + +KBUILD_AFLAGS += -DBUILD_VDSO +KBUILD_CFLAGS += -DBUILD_VDSO -DDISABLE_BRANCH_PROFILING + +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 += -m31 -s + +KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) +KBUILD_CFLAGS_32 += -m31 -fPIC -shared -fno-common -fno-builtin + +LDFLAGS_vdso32.so.dbg += -fPIC -shared -nostdlib -soname=linux-vdso32.so.1 \ + --hash-style=both --build-id=sha1 -melf_s390 -T + +$(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) + +obj-y += vdso32_wrapper.o +CPPFLAGS_vdso32.lds += -P -C -U$(ARCH) + +# Disable gcov profiling, ubsan and kasan for VDSO code +GCOV_PROFILE := n +UBSAN_SANITIZE := n +KASAN_SANITIZE := n + +# Force dependency (incbin is bad) +$(obj)/vdso32_wrapper.o : $(obj)/vdso32.so + +$(obj)/vdso32.so.dbg: $(src)/vdso32.lds $(obj-vdso32) FORCE + $(call if_changed,ld) + +# strip rule for the .so file +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +$(obj-vdso32): %-32.o: %.S FORCE + $(call if_changed_dep,vdso32as) + +# actual build commands +quiet_cmd_vdso32as = VDSO32A $@ + cmd_vdso32as = $(CC) $(a_flags) -c -o $@ $< +quiet_cmd_vdso32cc = VDSO32C $@ + cmd_vdso32cc = $(CC) $(c_flags) -c -o $@ $< + +# install commands for the unstripped file +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ + +vdso32.so: $(obj)/vdso32.so.dbg + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +vdso_install: vdso32.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso32-offsets.h: $(obj)/vdso32.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso32/gen_vdso_offsets.sh b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh new file mode 100755 index 000000000000..9c4f951e227d --- /dev/null +++ b/arch/s390/kernel/vdso32/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_compat_\(.*\)/\#define vdso32_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso32/note.S b/arch/s390/kernel/vdso32/note.S new file mode 100644 index 000000000000..db19d0680a0a --- /dev/null +++ b/arch/s390/kernel/vdso32/note.S @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/s390/kernel/vdso32/vdso32.lds.S b/arch/s390/kernel/vdso32/vdso32.lds.S new file mode 100644 index 000000000000..bff50b6acd6d --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32.lds.S @@ -0,0 +1,141 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * This is the infamous ld script for the 64 bits vdso + * library + */ + +#include <asm/page.h> +#include <asm/vdso.h> + +OUTPUT_FORMAT("elf32-s390", "elf32-s390", "elf32-s390") +OUTPUT_ARCH(s390:31-bit) +ENTRY(_start) + +SECTIONS +{ + PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif + . = VDSO_LBASE + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + . = ALIGN(16); + .text : { + *(.text .stub .text.* .gnu.linkonce.t.*) + } :text + PROVIDE(__etext = .); + PROVIDE(_etext = .); + PROVIDE(etext = .); + + /* + * Other stuff is appended to the text segment: + */ + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } + .rodata1 : { *(.rodata1) } + + .dynamic : { *(.dynamic) } :text :dynamic + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + .gcc_except_table : { *(.gcc_except_table .gcc_except_table.*) } + + .rela.dyn ALIGN(8) : { *(.rela.dyn) } + .got ALIGN(8) : { *(.got .toc) } + + _end = .; + PROVIDE(end = .); + + /* + * Stabs debugging sections are here too. + */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + + /* + * DWARF debug sections. + * Symbols in the DWARF debugging sections are relative to the + * beginning of the section so we begin them at 0. + */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + + /DISCARD/ : { + *(.note.GNU-stack) + *(.branch_lt) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + } +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FILEHDR PHDRS FLAGS(5); /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} + +/* + * This controls what symbols we export from the DSO. + */ +VERSION +{ + VDSO_VERSION_STRING { + global: + /* + * Has to be there for the kernel to find + */ + __kernel_compat_restart_syscall; + __kernel_compat_rt_sigreturn; + __kernel_compat_sigreturn; + local: *; + }; +} diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S new file mode 100644 index 000000000000..de2fb930471a --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/init.h> +#include <linux/linkage.h> +#include <asm/page.h> + + __PAGE_ALIGNED_DATA + + .globl vdso32_start, vdso32_end + .balign PAGE_SIZE +vdso32_start: + .incbin "arch/s390/kernel/vdso32/vdso32.so" + .balign PAGE_SIZE +vdso32_end: + + .previous diff --git a/arch/s390/kernel/vdso32/vdso_user_wrapper.S b/arch/s390/kernel/vdso32/vdso_user_wrapper.S new file mode 100644 index 000000000000..3f42f27f978c --- /dev/null +++ b/arch/s390/kernel/vdso32/vdso_user_wrapper.S @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <asm/unistd.h> +#include <asm/dwarf.h> + +.macro vdso_syscall func,syscall + .globl __kernel_compat_\func + .type __kernel_compat_\func,@function + .align 8 +__kernel_compat_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_compat_\func,.-__kernel_compat_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index a6e0fb6b91d6..2a2092ce19f1 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -74,3 +74,11 @@ vdso64.so: $(obj)/vdso64.so.dbg $(call cmd,vdso_install) vdso_install: vdso64.so + +# Generate VDSO offsets using helper script +gen-vdsosym := $(srctree)/$(src)/gen_vdso_offsets.sh +quiet_cmd_vdsosym = VDSOSYM $@ + cmd_vdsosym = $(NM) $< | $(gen-vdsosym) | LC_ALL=C sort > $@ + +include/generated/vdso64-offsets.h: $(obj)/vdso64.so.dbg FORCE + $(call if_changed,vdsosym) diff --git a/arch/s390/kernel/vdso64/gen_vdso_offsets.sh b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh new file mode 100755 index 000000000000..37f05cb38dad --- /dev/null +++ b/arch/s390/kernel/vdso64/gen_vdso_offsets.sh @@ -0,0 +1,15 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +# +# Match symbols in the DSO that look like VDSO_*; produce a header file +# of constant offsets into the shared object. +# +# Doing this inside the Makefile will break the $(filter-out) function, +# causing Kbuild to rebuild the vdso-offsets header file every time. +# +# Inspired by arm64 version. +# + +LC_ALL=C +sed -n 's/\([0-9a-f]*\) . __kernel_\(.*\)/\#define vdso64_offset_\2\t0x\1/p' diff --git a/arch/s390/kernel/vdso64/vdso64.lds.S b/arch/s390/kernel/vdso64/vdso64.lds.S index 518f1ea405f4..d4fb336d747b 100644 --- a/arch/s390/kernel/vdso64/vdso64.lds.S +++ b/arch/s390/kernel/vdso64/vdso64.lds.S @@ -17,7 +17,7 @@ SECTIONS #ifdef CONFIG_TIME_NS PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); #endif - . = VDSO64_LBASE + SIZEOF_HEADERS; + . = VDSO_LBASE + SIZEOF_HEADERS; .hash : { *(.hash) } :text .gnu.hash : { *(.gnu.hash) } @@ -137,6 +137,9 @@ VERSION __kernel_clock_gettime; __kernel_clock_getres; __kernel_getcpu; + __kernel_restart_syscall; + __kernel_rt_sigreturn; + __kernel_sigreturn; local: *; }; } diff --git a/arch/s390/kernel/vdso64/vdso_user_wrapper.S b/arch/s390/kernel/vdso64/vdso_user_wrapper.S index f773505c7e63..97f0c0a669a5 100644 --- a/arch/s390/kernel/vdso64/vdso_user_wrapper.S +++ b/arch/s390/kernel/vdso64/vdso_user_wrapper.S @@ -37,3 +37,20 @@ vdso_func gettimeofday vdso_func clock_getres vdso_func clock_gettime vdso_func getcpu + +.macro vdso_syscall func,syscall + .globl __kernel_\func + .type __kernel_\func,@function + .align 8 +__kernel_\func: + CFI_STARTPROC + svc \syscall + /* Make sure we notice when a syscall returns, which shouldn't happen */ + .word 0 + CFI_ENDPROC + .size __kernel_\func,.-__kernel_\func +.endm + +vdso_syscall restart_syscall,__NR_restart_syscall +vdso_syscall sigreturn,__NR_sigreturn +vdso_syscall rt_sigreturn,__NR_rt_sigreturn diff --git a/arch/s390/lib/string.c b/arch/s390/lib/string.c index ec5b76bde4d8..cfcdf76d6a95 100644 --- a/arch/s390/lib/string.c +++ b/arch/s390/lib/string.c @@ -162,7 +162,7 @@ char *strcat(char *dest, const char *src) " jo 0b\n" "1: mvst %[dummy],%[src]\n" " jo 1b\n" - : [dummy] "=&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src) + : [dummy] "+&a" (dummy), [dest] "+&a" (dest), [src] "+&a" (src) : : "cc", "memory", "0"); return ret; diff --git a/arch/s390/lib/test_unwind.c b/arch/s390/lib/test_unwind.c index 2f32802f79ce..ecf327d743a0 100644 --- a/arch/s390/lib/test_unwind.c +++ b/arch/s390/lib/test_unwind.c @@ -120,7 +120,7 @@ static struct unwindme *unwindme; #define UWM_REGS 0x2 /* Pass regs to test_unwind(). */ #define UWM_SP 0x4 /* Pass sp to test_unwind(). */ #define UWM_CALLER 0x8 /* Unwind starting from caller. */ -#define UWM_SWITCH_STACK 0x10 /* Use CALL_ON_STACK. */ +#define UWM_SWITCH_STACK 0x10 /* Use call_on_stack. */ #define UWM_IRQ 0x20 /* Unwind from irq context. */ #define UWM_PGM 0x40 /* Unwind from program check handler. */ @@ -211,7 +211,8 @@ static noinline int unwindme_func2(struct unwindme *u) if (u->flags & UWM_SWITCH_STACK) { local_irq_save(flags); local_mcck_disable(); - rc = CALL_ON_STACK(unwindme_func3, S390_lowcore.nodat_stack, 1, u); + rc = call_on_stack(1, S390_lowcore.nodat_stack, + int, unwindme_func3, struct unwindme *, u); local_mcck_enable(); local_irq_restore(flags); return rc; diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 67606d932825..7ec8b1fa0f08 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -224,7 +224,7 @@ static inline unsigned long copy_in_user_mvcos(void __user *to, const void __use EX_TABLE(0b,3b) : "+a" (size), "+a" (to), "+a" (from), "+a" (tmp1), "=a" (tmp2) : [spec] "d" (0x810081UL) - : "cc", "memory"); + : "cc", "memory", "0"); return size; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 8ae3dc5783fd..e33c43b38afe 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -285,26 +285,6 @@ static noinline void do_sigbus(struct pt_regs *regs) (void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK)); } -static noinline int signal_return(struct pt_regs *regs) -{ - u16 instruction; - int rc; - - rc = __get_user(instruction, (u16 __user *) regs->psw.addr); - if (rc) - return rc; - if (instruction == 0x0a77) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x00040077; - return 0; - } else if (instruction == 0x0aad) { - set_pt_regs_flag(regs, PIF_SYSCALL); - regs->int_code = 0x000400ad; - return 0; - } - return -EACCES; -} - static noinline void do_fault_error(struct pt_regs *regs, int access, vm_fault_t fault) { @@ -312,9 +292,6 @@ static noinline void do_fault_error(struct pt_regs *regs, int access, switch (fault) { case VM_FAULT_BADACCESS: - if (access == VM_EXEC && signal_return(regs) == 0) - break; - fallthrough; case VM_FAULT_BADMAP: /* Bad memory access. Check if it is kernel or user space. */ if (user_mode(regs)) { @@ -792,6 +769,32 @@ void do_secure_storage_access(struct pt_regs *regs) struct page *page; int rc; + /* + * bit 61 tells us if the address is valid, if it's not we + * have a major problem and should stop the kernel or send a + * SIGSEGV to the process. Unfortunately bit 61 is not + * reliable without the misc UV feature so we need to check + * for that as well. + */ + if (test_bit_inv(BIT_UV_FEAT_MISC, &uv_info.uv_feature_indications) && + !test_bit_inv(61, ®s->int_parm_long)) { + /* + * When this happens, userspace did something that it + * was not supposed to do, e.g. branching into secure + * memory. Trigger a segmentation fault. + */ + if (user_mode(regs)) { + send_sig(SIGSEGV, current, 0); + return; + } + + /* + * The kernel should never run into this case and we + * have no way out of this situation. + */ + panic("Unexpected PGM 0x3d with TEID bit 61=0"); + } + switch (get_fault_type(regs)) { case USER_FAULT: mm = current->mm; diff --git a/arch/s390/mm/maccess.c b/arch/s390/mm/maccess.c index 1f1f906344ff..a0f54bd5e98a 100644 --- a/arch/s390/mm/maccess.c +++ b/arch/s390/mm/maccess.c @@ -125,12 +125,18 @@ static unsigned long __no_sanitize_address _memcpy_real(unsigned long dest, */ int memcpy_real(void *dest, void *src, size_t count) { + unsigned long _dest = (unsigned long)dest; + unsigned long _src = (unsigned long)src; + unsigned long _count = (unsigned long)count; int rc; if (S390_lowcore.nodat_stack != 0) { preempt_disable(); - rc = CALL_ON_STACK(_memcpy_real, S390_lowcore.nodat_stack, 3, - dest, src, count); + rc = call_on_stack(3, S390_lowcore.nodat_stack, + unsigned long, _memcpy_real, + unsigned long, _dest, + unsigned long, _src, + unsigned long, _count); preempt_enable(); return rc; } @@ -139,8 +145,7 @@ int memcpy_real(void *dest, void *src, size_t count) * not set up yet. Just call _memcpy_real on the early boot * stack */ - return _memcpy_real((unsigned long) dest,(unsigned long) src, - (unsigned long) count); + return _memcpy_real(_dest, _src, _count); } /* |