diff options
Diffstat (limited to 'arch/s390')
82 files changed, 3511 insertions, 1355 deletions
diff --git a/arch/s390/Kbuild b/arch/s390/Kbuild index 2938934c6518..e256592eb66e 100644 --- a/arch/s390/Kbuild +++ b/arch/s390/Kbuild @@ -6,3 +6,4 @@ obj-$(CONFIG_S390_HYPFS_FS) += hypfs/ obj-$(CONFIG_APPLDATA_BASE) += appldata/ obj-y += net/ obj-$(CONFIG_PCI) += pci/ +obj-$(CONFIG_NUMA) += numa/ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b06dc3839268..4827870f7a6d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -99,18 +99,22 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_WANTS_PROT_NUMA_PROT_NONE select ARCH_WANT_IPC_PARSE_VERSION select BUILDTIME_EXTABLE_SORT select CLONE_BACKWARDS2 select DYNAMIC_FTRACE if FUNCTION_TRACER select GENERIC_CLOCKEVENTS + select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_DEVICES if !SMP select GENERIC_FIND_FIRST_BIT select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL + select HAVE_ARCH_EARLY_PFN_TO_NID select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK @@ -153,6 +157,7 @@ config S390 select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + config SCHED_OMIT_FRAME_POINTER def_bool y @@ -385,6 +390,76 @@ config HOTPLUG_CPU config SCHED_SMT def_bool n +# Some NUMA nodes have memory ranges that span +# other nodes. Even though a pfn is valid and +# between a node's start and end pfns, it may not +# reside on that node. See memmap_init_zone() +# for details. <- They meant memory holes! +config NODES_SPAN_OTHER_NODES + def_bool NUMA + +config NUMA + bool "NUMA support" + depends on SMP && 64BIT && SCHED_TOPOLOGY + default n + help + Enable NUMA support + + This option adds NUMA support to the kernel. + + An operation mode can be selected by appending + numa=<method> to the kernel command line. + + The default behaviour is identical to appending numa=plain to + the command line. This will create just one node with all + available memory and all CPUs in it. + +config NODES_SHIFT + int "Maximum NUMA nodes (as a power of 2)" + range 1 10 + depends on NUMA + default "4" + help + Specify the maximum number of NUMA nodes available on the target + system. Increases memory reserved to accommodate various tables. + +menu "Select NUMA modes" + depends on NUMA + +config NUMA_EMU + bool "NUMA emulation" + default y + help + Numa emulation mode will split the available system memory into + equal chunks which then are distributed over the configured number + of nodes in a round-robin manner. + + The number of fake nodes is limited by the number of available memory + chunks (i.e. memory size / fake size) and the number of supported + nodes in the kernel. + + The CPUs are assigned to the nodes in a way that partially respects + the original machine topology (if supported by the machine). + Fair distribution of the CPUs is not guaranteed. + +config EMU_SIZE + hex "NUMA emulation memory chunk size" + default 0x10000000 + range 0x400000 0x100000000 + depends on NUMA_EMU + help + Select the default size by which the memory is chopped and then + assigned to emulated NUMA nodes. + + This can be overridden by specifying + + emu_size=<n> + + on the kernel command line where also suffixes K, M, G, and T are + supported. + +endmenu + config SCHED_MC def_bool n diff --git a/arch/s390/Makefile b/arch/s390/Makefile index 667b1bca5681..e8d4423e4f85 100644 --- a/arch/s390/Makefile +++ b/arch/s390/Makefile @@ -33,6 +33,8 @@ mflags-$(CONFIG_MARCH_Z196) := -march=z196 mflags-$(CONFIG_MARCH_ZEC12) := -march=zEC12 mflags-$(CONFIG_MARCH_Z13) := -march=z13 +export CC_FLAGS_MARCH := $(mflags-y) + aflags-y += $(mflags-y) cflags-y += $(mflags-y) diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 940cbddd9237..0c98f1508542 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -13,6 +13,7 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_RCU_FAST_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +CONFIG_NUMA_BALANCING=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y @@ -50,6 +51,7 @@ CONFIG_LIVEPATCH=y CONFIG_MARCH_Z196=y CONFIG_TUNE_ZEC12=y CONFIG_NR_CPUS=256 +CONFIG_NUMA=y CONFIG_PREEMPT=y CONFIG_HZ_100=y CONFIG_MEMORY_HOTPLUG=y diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index d793fec91797..82083e1fbdc4 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -13,6 +13,7 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_RCU_FAST_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +CONFIG_NUMA_BALANCING=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y @@ -49,6 +50,7 @@ CONFIG_DEFAULT_DEADLINE=y CONFIG_MARCH_Z196=y CONFIG_TUNE_ZEC12=y CONFIG_NR_CPUS=256 +CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y diff --git a/arch/s390/configs/performance_defconfig b/arch/s390/configs/performance_defconfig index 38a77e9c8aa6..c05c9e0821e3 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -13,6 +13,8 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_RCU_FAST_NO_HZ=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y +CONFIG_NUMA_BALANCING=y +# CONFIG_NUMA_BALANCING_DEFAULT_ENABLED is not set CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y @@ -48,6 +50,7 @@ CONFIG_LIVEPATCH=y CONFIG_MARCH_Z196=y CONFIG_TUNE_ZEC12=y CONFIG_NR_CPUS=512 +CONFIG_NUMA=y CONFIG_HZ_100=y CONFIG_MEMORY_HOTPLUG=y CONFIG_MEMORY_HOTREMOVE=y diff --git a/arch/s390/crypto/aes_s390.c b/arch/s390/crypto/aes_s390.c index 5566ce80abdb..0b9b95f3c703 100644 --- a/arch/s390/crypto/aes_s390.c +++ b/arch/s390/crypto/aes_s390.c @@ -24,6 +24,7 @@ #include <crypto/algapi.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/cpufeature.h> #include <linux/init.h> #include <linux/spinlock.h> #include "crypt_s390.h" @@ -976,7 +977,7 @@ static void __exit aes_s390_fini(void) crypto_unregister_alg(&aes_alg); } -module_init(aes_s390_init); +module_cpu_feature_match(MSA, aes_s390_init); module_exit(aes_s390_fini); MODULE_ALIAS_CRYPTO("aes-all"); diff --git a/arch/s390/crypto/des_s390.c b/arch/s390/crypto/des_s390.c index 9e05cc453a40..fba1c10a2dd0 100644 --- a/arch/s390/crypto/des_s390.c +++ b/arch/s390/crypto/des_s390.c @@ -16,6 +16,7 @@ #include <linux/init.h> #include <linux/module.h> +#include <linux/cpufeature.h> #include <linux/crypto.h> #include <crypto/algapi.h> #include <crypto/des.h> @@ -616,7 +617,7 @@ static void __exit des_s390_exit(void) crypto_unregister_alg(&des_alg); } -module_init(des_s390_init); +module_cpu_feature_match(MSA, des_s390_init); module_exit(des_s390_exit); MODULE_ALIAS_CRYPTO("des"); diff --git a/arch/s390/crypto/ghash_s390.c b/arch/s390/crypto/ghash_s390.c index b258110da952..26e14efd30a7 100644 --- a/arch/s390/crypto/ghash_s390.c +++ b/arch/s390/crypto/ghash_s390.c @@ -9,6 +9,7 @@ #include <crypto/internal/hash.h> #include <linux/module.h> +#include <linux/cpufeature.h> #include "crypt_s390.h" @@ -158,7 +159,7 @@ static void __exit ghash_mod_exit(void) crypto_unregister_shash(&ghash_alg); } -module_init(ghash_mod_init); +module_cpu_feature_match(MSA, ghash_mod_init); module_exit(ghash_mod_exit); MODULE_ALIAS_CRYPTO("ghash"); diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 9d5192c94963..b8045b97f4fb 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -17,6 +17,7 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/mutex.h> +#include <linux/cpufeature.h> #include <linux/random.h> #include <linux/slab.h> #include <asm/debug.h> @@ -914,6 +915,5 @@ static void __exit prng_exit(void) } } - -module_init(prng_init); +module_cpu_feature_match(MSA, prng_init); module_exit(prng_exit); diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 5b2bee323694..9208eadae9f0 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -26,6 +26,7 @@ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/cpufeature.h> #include <crypto/sha.h> #include "crypt_s390.h" @@ -100,7 +101,7 @@ static void __exit sha1_s390_fini(void) crypto_unregister_shash(&alg); } -module_init(sha1_s390_init); +module_cpu_feature_match(MSA, sha1_s390_init); module_exit(sha1_s390_fini); MODULE_ALIAS_CRYPTO("sha1"); diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index b74ff158108c..667888f5c964 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -16,6 +16,7 @@ #include <crypto/internal/hash.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/cpufeature.h> #include <crypto/sha.h> #include "crypt_s390.h" @@ -140,7 +141,7 @@ static void __exit sha256_s390_fini(void) crypto_unregister_shash(&sha256_alg); } -module_init(sha256_s390_init); +module_cpu_feature_match(MSA, sha256_s390_init); module_exit(sha256_s390_fini); MODULE_ALIAS_CRYPTO("sha256"); diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index 0c36989ba182..2ba66b1518f0 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -18,6 +18,7 @@ #include <linux/init.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/cpufeature.h> #include "sha.h" #include "crypt_s390.h" @@ -148,7 +149,7 @@ static void __exit fini(void) crypto_unregister_shash(&sha384_alg); } -module_init(init); +module_cpu_feature_match(MSA, init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/arch/s390/include/asm/cpufeature.h b/arch/s390/include/asm/cpufeature.h new file mode 100644 index 000000000000..fa7e69b7c299 --- /dev/null +++ b/arch/s390/include/asm/cpufeature.h @@ -0,0 +1,29 @@ +/* + * Module interface for CPU features + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#ifndef __ASM_S390_CPUFEATURE_H +#define __ASM_S390_CPUFEATURE_H + +#include <asm/elf.h> + +/* Hardware features on Linux on z Systems are indicated by facility bits that + * are mapped to the so-called machine flags. Particular machine flags are + * then used to define ELF hardware capabilities; most notably hardware flags + * that are essential for user space / glibc. + * + * Restrict the set of exposed CPU features to ELF hardware capabilities for + * now. Additional machine flags can be indicated by values larger than + * MAX_ELF_HWCAP_FEATURES. + */ +#define MAX_ELF_HWCAP_FEATURES (8 * sizeof(elf_hwcap)) +#define MAX_CPU_FEATURES MAX_ELF_HWCAP_FEATURES + +#define cpu_feature(feat) ilog2(HWCAP_S390_ ## feat) + +int cpu_have_feature(unsigned int nr); + +#endif /* __ASM_S390_CPUFEATURE_H */ diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h index d7697ab802f6..17a373576868 100644 --- a/arch/s390/include/asm/ctl_reg.h +++ b/arch/s390/include/asm/ctl_reg.h @@ -46,6 +46,8 @@ static inline void __ctl_clear_bit(unsigned int cr, unsigned int bit) __ctl_load(reg, cr, cr); } +void __ctl_set_vx(void); + void smp_ctl_set_bit(int cr, int bit); void smp_ctl_clear_bit(int cr, int bit); diff --git a/arch/s390/include/asm/etr.h b/arch/s390/include/asm/etr.h index 629b79a93165..f7e5c36688c3 100644 --- a/arch/s390/include/asm/etr.h +++ b/arch/s390/include/asm/etr.h @@ -214,6 +214,9 @@ static inline int etr_ptff(void *ptff_block, unsigned int func) void etr_switch_to_local(void); void etr_sync_check(void); +/* notifier for syncs */ +extern struct atomic_notifier_head s390_epoch_delta_notifier; + /* STP interruption parameter */ struct stp_irq_parm { unsigned int _pad0 : 14; diff --git a/arch/s390/include/asm/fpu-internal.h b/arch/s390/include/asm/fpu-internal.h new file mode 100644 index 000000000000..55dc2c0fb40a --- /dev/null +++ b/arch/s390/include/asm/fpu-internal.h @@ -0,0 +1,110 @@ +/* + * General floating pointer and vector register helpers + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#ifndef _ASM_S390_FPU_INTERNAL_H +#define _ASM_S390_FPU_INTERNAL_H + +#define FPU_USE_VX 1 /* Vector extension is active */ + +#ifndef __ASSEMBLY__ + +#include <linux/errno.h> +#include <linux/string.h> +#include <asm/linkage.h> +#include <asm/ctl_reg.h> +#include <asm/sigcontext.h> + +struct fpu { + __u32 fpc; /* Floating-point control */ + __u32 flags; + union { + void *regs; + freg_t *fprs; /* Floating-point register save area */ + __vector128 *vxrs; /* Vector register save area */ + }; +}; + +void save_fpu_regs(void); + +#define is_vx_fpu(fpu) (!!((fpu)->flags & FPU_USE_VX)) +#define is_vx_task(tsk) (!!((tsk)->thread.fpu.flags & FPU_USE_VX)) + +/* VX array structure for address operand constraints in inline assemblies */ +struct vx_array { __vector128 _[__NUM_VXRS]; }; + +static inline int test_fp_ctl(u32 fpc) +{ + u32 orig_fpc; + int rc; + + asm volatile( + " efpc %1\n" + " sfpc %2\n" + "0: sfpc %1\n" + " la %0,0\n" + "1:\n" + EX_TABLE(0b,1b) + : "=d" (rc), "=d" (orig_fpc) + : "d" (fpc), "0" (-EINVAL)); + return rc; +} + +static inline void save_vx_regs_safe(__vector128 *vxrs) +{ + unsigned long cr0, flags; + + flags = arch_local_irq_save(); + __ctl_store(cr0, 0, 0); + __ctl_set_bit(0, 17); + __ctl_set_bit(0, 18); + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */ + : "=Q" (*(struct vx_array *) vxrs) : : "1"); + __ctl_load(cr0, 0, 0); + arch_local_irq_restore(flags); +} + +static inline void convert_vx_to_fp(freg_t *fprs, __vector128 *vxrs) +{ + int i; + + for (i = 0; i < __NUM_FPRS; i++) + fprs[i] = *(freg_t *)(vxrs + i); +} + +static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs) +{ + int i; + + for (i = 0; i < __NUM_FPRS; i++) + *(freg_t *)(vxrs + i) = fprs[i]; +} + +static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu) +{ + fpregs->pad = 0; + if (is_vx_fpu(fpu)) + convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs); + else + memcpy((freg_t *)&fpregs->fprs, fpu->fprs, + sizeof(fpregs->fprs)); +} + +static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu) +{ + if (is_vx_fpu(fpu)) + convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs); + else + memcpy(fpu->fprs, (freg_t *)&fpregs->fprs, + sizeof(fpregs->fprs)); +} + +#endif + +#endif /* _ASM_S390_FPU_INTERNAL_H */ diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index cb5fdf3a78fc..437e9af96688 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -57,6 +57,8 @@ static inline void ioport_unmap(void __iomem *p) */ #define pci_iomap pci_iomap #define pci_iounmap pci_iounmap +#define pci_iomap_wc pci_iomap +#define pci_iomap_wc_range pci_iomap_range #define memcpy_fromio(dst, src, count) zpci_memcpy_fromio(dst, src, count) #define memcpy_toio(dst, src, count) zpci_memcpy_toio(dst, src, count) diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 3024acbe1f9d..3d012e071647 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -22,6 +22,7 @@ #include <linux/kvm.h> #include <asm/debug.h> #include <asm/cpu.h> +#include <asm/fpu-internal.h> #include <asm/isc.h> #define KVM_MAX_VCPUS 64 @@ -258,6 +259,9 @@ struct kvm_vcpu_stat { u32 diagnose_10; u32 diagnose_44; u32 diagnose_9c; + u32 diagnose_258; + u32 diagnose_308; + u32 diagnose_500; }; #define PGM_OPERATION 0x01 @@ -498,10 +502,9 @@ struct kvm_guestdbg_info_arch { struct kvm_vcpu_arch { struct kvm_s390_sie_block *sie_block; - s390_fp_regs host_fpregs; unsigned int host_acrs[NUM_ACRS]; - s390_fp_regs guest_fpregs; - struct kvm_s390_vregs *host_vregs; + struct fpu host_fpregs; + struct fpu guest_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; struct kvm_s390_pgm_info pgm; @@ -630,7 +633,6 @@ extern char sie_exit; static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_check_processor_compat(void *rtn) {} -static inline void kvm_arch_exit(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} diff --git a/arch/s390/include/asm/linkage.h b/arch/s390/include/asm/linkage.h index fc8a8284778e..27da78cf416d 100644 --- a/arch/s390/include/asm/linkage.h +++ b/arch/s390/include/asm/linkage.h @@ -6,4 +6,26 @@ #define __ALIGN .align 4, 0x07 #define __ALIGN_STR __stringify(__ALIGN) +#ifndef __ASSEMBLY__ + +/* + * Helper macro for exception table entries + */ +#define EX_TABLE(_fault, _target) \ + ".section __ex_table,\"a\"\n" \ + ".align 4\n" \ + ".long (" #_fault ") - .\n" \ + ".long (" #_target ") - .\n" \ + ".previous\n" + +#else /* __ASSEMBLY__ */ + +#define EX_TABLE(_fault, _target) \ + .section __ex_table,"a" ; \ + .align 4 ; \ + .long (_fault) - . ; \ + .long (_target) - . ; \ + .previous + +#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/s390/include/asm/mmzone.h b/arch/s390/include/asm/mmzone.h new file mode 100644 index 000000000000..a9e834e60b84 --- /dev/null +++ b/arch/s390/include/asm/mmzone.h @@ -0,0 +1,16 @@ +/* + * NUMA support for s390 + * + * Copyright IBM Corp. 2015 + */ + +#ifndef _ASM_S390_MMZONE_H +#define _ASM_S390_MMZONE_H + +#ifdef CONFIG_NUMA + +extern struct pglist_data *node_data[]; +#define NODE_DATA(nid) (node_data[nid]) + +#endif /* CONFIG_NUMA */ +#endif /* _ASM_S390_MMZONE_H */ diff --git a/arch/s390/include/asm/numa.h b/arch/s390/include/asm/numa.h new file mode 100644 index 000000000000..2a0efc63b9e5 --- /dev/null +++ b/arch/s390/include/asm/numa.h @@ -0,0 +1,35 @@ +/* + * NUMA support for s390 + * + * Declare the NUMA core code structures and functions. + * + * Copyright IBM Corp. 2015 + */ + +#ifndef _ASM_S390_NUMA_H +#define _ASM_S390_NUMA_H + +#ifdef CONFIG_NUMA + +#include <linux/numa.h> +#include <linux/cpumask.h> + +void numa_setup(void); +int numa_pfn_to_nid(unsigned long pfn); +int __node_distance(int a, int b); +void numa_update_cpu_topology(void); + +extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +extern int numa_debug_enabled; + +#else + +static inline void numa_setup(void) { } +static inline void numa_update_cpu_topology(void) { } +static inline int numa_pfn_to_nid(unsigned long pfn) +{ + return 0; +} + +#endif /* CONFIG_NUMA */ +#endif /* _ASM_S390_NUMA_H */ diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index a648338c434a..34d960353a08 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -170,7 +170,11 @@ static inline void zpci_exit_slot(struct zpci_dev *zdev) {} #endif /* CONFIG_HOTPLUG_PCI_S390 */ /* Helpers */ -struct zpci_dev *get_zdev(struct pci_dev *); +static inline struct zpci_dev *to_zpci(struct pci_dev *pdev) +{ + return pdev->sysdata; +} + struct zpci_dev *get_zdev_by_fid(u32); /* DMA */ @@ -188,4 +192,20 @@ void zpci_debug_init_device(struct zpci_dev *); void zpci_debug_exit_device(struct zpci_dev *); void zpci_debug_info(struct zpci_dev *, struct seq_file *); +#ifdef CONFIG_NUMA + +/* Returns the node based on PCI bus */ +static inline int __pcibus_to_node(const struct pci_bus *bus) +{ + return NUMA_NO_NODE; +} + +static inline const struct cpumask * +cpumask_of_pcibus(const struct pci_bus *bus) +{ + return cpu_online_mask; +} + +#endif /* CONFIG_NUMA */ + #endif diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index f66d82798a6a..bdb2f51124ed 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -576,6 +576,19 @@ static inline int pte_same(pte_t a, pte_t b) return pte_val(a) == pte_val(b); } +#ifdef CONFIG_NUMA_BALANCING +static inline int pte_protnone(pte_t pte) +{ + return pte_present(pte) && !(pte_val(pte) & _PAGE_READ); +} + +static inline int pmd_protnone(pmd_t pmd) +{ + /* pmd_large(pmd) implies pmd_present(pmd) */ + return pmd_large(pmd) && !(pmd_val(pmd) & _SEGMENT_ENTRY_READ); +} +#endif + static inline pgste_t pgste_get_lock(pte_t *ptep) { unsigned long new = 0; diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index dedb6218544b..085fb0d3c54e 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -14,10 +14,12 @@ #define CIF_MCCK_PENDING 0 /* machine check handling is pending */ #define CIF_ASCE 1 /* user asce needs fixup / uaccess */ #define CIF_NOHZ_DELAY 2 /* delay HZ disable for a tick */ +#define CIF_FPU 3 /* restore vector registers */ #define _CIF_MCCK_PENDING (1<<CIF_MCCK_PENDING) #define _CIF_ASCE (1<<CIF_ASCE) #define _CIF_NOHZ_DELAY (1<<CIF_NOHZ_DELAY) +#define _CIF_FPU (1<<CIF_FPU) #ifndef __ASSEMBLY__ @@ -28,6 +30,7 @@ #include <asm/ptrace.h> #include <asm/setup.h> #include <asm/runtime_instr.h> +#include <asm/fpu-internal.h> static inline void set_cpu_flag(int flag) { @@ -85,7 +88,7 @@ typedef struct { * Thread structure */ struct thread_struct { - s390_fp_regs fp_regs; + struct fpu fpu; /* FP and VX register save area */ unsigned int acrs[NUM_ACRS]; unsigned long ksp; /* kernel stack pointer */ mm_segment_t mm_segment; @@ -101,7 +104,6 @@ struct thread_struct { struct runtime_instr_cb *ri_cb; int ri_signum; unsigned char trap_tdb[256]; /* Transaction abort diagnose block */ - __vector128 *vxrs; /* Vector register save area */ }; /* Flag to disable transactions. */ @@ -231,6 +233,17 @@ static inline void __load_psw_mask (unsigned long mask) } /* + * Extract current PSW mask + */ +static inline unsigned long __extract_psw(void) +{ + unsigned int reg1, reg2; + + asm volatile("epsw %0,%1" : "=d" (reg1), "=a" (reg2)); + return (((unsigned long) reg1) << 32) | ((unsigned long) reg2); +} + +/* * Rewind PSW instruction address by specified number of bytes. */ static inline unsigned long __rewind_psw(psw_t psw, unsigned long ilc) @@ -336,25 +349,6 @@ extern void memcpy_absolute(void *, void *, size_t); memcpy_absolute(&(dest), &__tmp, sizeof(__tmp)); \ } -/* - * Helper macro for exception table entries - */ -#define EX_TABLE(_fault, _target) \ - ".section __ex_table,\"a\"\n" \ - ".align 4\n" \ - ".long (" #_fault ") - .\n" \ - ".long (" #_target ") - .\n" \ - ".previous\n" - -#else /* __ASSEMBLY__ */ - -#define EX_TABLE(_fault, _target) \ - .section __ex_table,"a" ; \ - .align 4 ; \ - .long (_fault) - . ; \ - .long (_target) - . ; \ - .previous - #endif /* __ASSEMBLY__ */ #endif /* __ASM_S390_PROCESSOR_H */ diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index f6ff06077631..821dde5f425d 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -79,6 +79,6 @@ int sclp_pci_configure(u32 fid); int sclp_pci_deconfigure(u32 fid); int memcpy_hsa(void *dest, unsigned long src, size_t count, int mode); void sclp_early_detect(void); -long _sclp_print_early(const char *); +int _sclp_print_early(const char *); #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h index d62e7a69605f..dcadfde32265 100644 --- a/arch/s390/include/asm/switch_to.h +++ b/arch/s390/include/asm/switch_to.h @@ -8,139 +8,12 @@ #define __ASM_SWITCH_TO_H #include <linux/thread_info.h> +#include <asm/fpu-internal.h> #include <asm/ptrace.h> extern struct task_struct *__switch_to(void *, void *); extern void update_cr_regs(struct task_struct *task); -static inline int test_fp_ctl(u32 fpc) -{ - u32 orig_fpc; - int rc; - - asm volatile( - " efpc %1\n" - " sfpc %2\n" - "0: sfpc %1\n" - " la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc), "=d" (orig_fpc) - : "d" (fpc), "0" (-EINVAL)); - return rc; -} - -static inline void save_fp_ctl(u32 *fpc) -{ - asm volatile( - " stfpc %0\n" - : "+Q" (*fpc)); -} - -static inline int restore_fp_ctl(u32 *fpc) -{ - int rc; - - asm volatile( - " lfpc %1\n" - "0: la %0,0\n" - "1:\n" - EX_TABLE(0b,1b) - : "=d" (rc) : "Q" (*fpc), "0" (-EINVAL)); - return rc; -} - -static inline void save_fp_regs(freg_t *fprs) -{ - asm volatile("std 0,%0" : "=Q" (fprs[0])); - asm volatile("std 2,%0" : "=Q" (fprs[2])); - asm volatile("std 4,%0" : "=Q" (fprs[4])); - asm volatile("std 6,%0" : "=Q" (fprs[6])); - asm volatile("std 1,%0" : "=Q" (fprs[1])); - asm volatile("std 3,%0" : "=Q" (fprs[3])); - asm volatile("std 5,%0" : "=Q" (fprs[5])); - asm volatile("std 7,%0" : "=Q" (fprs[7])); - asm volatile("std 8,%0" : "=Q" (fprs[8])); - asm volatile("std 9,%0" : "=Q" (fprs[9])); - asm volatile("std 10,%0" : "=Q" (fprs[10])); - asm volatile("std 11,%0" : "=Q" (fprs[11])); - asm volatile("std 12,%0" : "=Q" (fprs[12])); - asm volatile("std 13,%0" : "=Q" (fprs[13])); - asm volatile("std 14,%0" : "=Q" (fprs[14])); - asm volatile("std 15,%0" : "=Q" (fprs[15])); -} - -static inline void restore_fp_regs(freg_t *fprs) -{ - asm volatile("ld 0,%0" : : "Q" (fprs[0])); - asm volatile("ld 2,%0" : : "Q" (fprs[2])); - asm volatile("ld 4,%0" : : "Q" (fprs[4])); - asm volatile("ld 6,%0" : : "Q" (fprs[6])); - asm volatile("ld 1,%0" : : "Q" (fprs[1])); - asm volatile("ld 3,%0" : : "Q" (fprs[3])); - asm volatile("ld 5,%0" : : "Q" (fprs[5])); - asm volatile("ld 7,%0" : : "Q" (fprs[7])); - asm volatile("ld 8,%0" : : "Q" (fprs[8])); - asm volatile("ld 9,%0" : : "Q" (fprs[9])); - asm volatile("ld 10,%0" : : "Q" (fprs[10])); - asm volatile("ld 11,%0" : : "Q" (fprs[11])); - asm volatile("ld 12,%0" : : "Q" (fprs[12])); - asm volatile("ld 13,%0" : : "Q" (fprs[13])); - asm volatile("ld 14,%0" : : "Q" (fprs[14])); - asm volatile("ld 15,%0" : : "Q" (fprs[15])); -} - -static inline void save_vx_regs(__vector128 *vxrs) -{ - typedef struct { __vector128 _[__NUM_VXRS]; } addrtype; - - asm volatile( - " la 1,%0\n" - " .word 0xe70f,0x1000,0x003e\n" /* vstm 0,15,0(1) */ - " .word 0xe70f,0x1100,0x0c3e\n" /* vstm 16,31,256(1) */ - : "=Q" (*(addrtype *) vxrs) : : "1"); -} - -static inline void save_vx_regs_safe(__vector128 *vxrs) -{ - unsigned long cr0, flags; - - flags = arch_local_irq_save(); - __ctl_store(cr0, 0, 0); - __ctl_set_bit(0, 17); - __ctl_set_bit(0, 18); - save_vx_regs(vxrs); - __ctl_load(cr0, 0, 0); - arch_local_irq_restore(flags); -} - -static inline void restore_vx_regs(__vector128 *vxrs) -{ - typedef struct { __vector128 _[__NUM_VXRS]; } addrtype; - - asm volatile( - " la 1,%0\n" - " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ - " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ - : : "Q" (*(addrtype *) vxrs) : "1"); -} - -static inline void save_fp_vx_regs(struct task_struct *task) -{ - if (task->thread.vxrs) - save_vx_regs(task->thread.vxrs); - else - save_fp_regs(task->thread.fp_regs.fprs); -} - -static inline void restore_fp_vx_regs(struct task_struct *task) -{ - if (task->thread.vxrs) - restore_vx_regs(task->thread.vxrs); - else - restore_fp_regs(task->thread.fp_regs.fprs); -} - static inline void save_access_regs(unsigned int *acrs) { typedef struct { int _[NUM_ACRS]; } acrstype; @@ -157,15 +30,13 @@ static inline void restore_access_regs(unsigned int *acrs) #define switch_to(prev,next,last) do { \ if (prev->mm) { \ - save_fp_ctl(&prev->thread.fp_regs.fpc); \ - save_fp_vx_regs(prev); \ + save_fpu_regs(); \ save_access_regs(&prev->thread.acrs[0]); \ save_ri_cb(prev->thread.ri_cb); \ } \ if (next->mm) { \ update_cr_regs(next); \ - restore_fp_ctl(&next->thread.fp_regs.fpc); \ - restore_fp_vx_regs(next); \ + set_cpu_flag(CIF_FPU); \ restore_access_regs(&next->thread.acrs[0]); \ restore_ri_cb(next->thread.ri_cb, prev->thread.ri_cb); \ } \ diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index 4990f6c66288..27ebde643933 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -2,6 +2,7 @@ #define _ASM_S390_TOPOLOGY_H #include <linux/cpumask.h> +#include <asm/numa.h> struct sysinfo_15_1_x; struct cpu; @@ -13,6 +14,7 @@ struct cpu_topology_s390 { unsigned short core_id; unsigned short socket_id; unsigned short book_id; + unsigned short node_id; cpumask_t thread_mask; cpumask_t core_mask; cpumask_t book_mask; @@ -52,6 +54,43 @@ static inline void topology_expect_change(void) { } #define POLARIZATION_VM (2) #define POLARIZATION_VH (3) +#define SD_BOOK_INIT SD_CPU_INIT + +#ifdef CONFIG_NUMA + +#define cpu_to_node cpu_to_node +static inline int cpu_to_node(int cpu) +{ + return per_cpu(cpu_topology, cpu).node_id; +} + +/* Returns a pointer to the cpumask of CPUs on node 'node'. */ +#define cpumask_of_node cpumask_of_node +static inline const struct cpumask *cpumask_of_node(int node) +{ + return node_to_cpumask_map[node]; +} + +/* + * Returns the number of the node containing node 'node'. This + * architecture is flat, so it is a pretty simple function! + */ +#define parent_node(node) (node) + +#define pcibus_to_node(bus) __pcibus_to_node(bus) + +#define node_distance(a, b) __node_distance(a, b) + +#else /* !CONFIG_NUMA */ + +#define numa_node_id numa_node_id +static inline int numa_node_id(void) +{ + return 0; +} + +#endif /* CONFIG_NUMA */ + #include <asm-generic/topology.h> #endif /* _ASM_S390_TOPOLOGY_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index 91f56b1d8156..525cef73b085 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -11,16 +11,24 @@ #define __IGNORE_time -/* Ignore NUMA system calls. Not wired up on s390. */ -#define __IGNORE_mbind -#define __IGNORE_get_mempolicy -#define __IGNORE_set_mempolicy -#define __IGNORE_migrate_pages -#define __IGNORE_move_pages - -/* Ignore system calls that are also reachable via sys_socket */ +/* Ignore system calls that are also reachable via sys_socketcall */ #define __IGNORE_recvmmsg #define __IGNORE_sendmmsg +#define __IGNORE_socket +#define __IGNORE_socketpair +#define __IGNORE_bind +#define __IGNORE_connect +#define __IGNORE_listen +#define __IGNORE_accept4 +#define __IGNORE_getsockopt +#define __IGNORE_setsockopt +#define __IGNORE_getsockname +#define __IGNORE_getpeername +#define __IGNORE_sendto +#define __IGNORE_sendmsg +#define __IGNORE_recvfrom +#define __IGNORE_recvmsg +#define __IGNORE_shutdown #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM diff --git a/arch/s390/include/asm/vx-insn.h b/arch/s390/include/asm/vx-insn.h new file mode 100644 index 000000000000..4a3135620f5e --- /dev/null +++ b/arch/s390/include/asm/vx-insn.h @@ -0,0 +1,480 @@ +/* + * Support for Vector Instructions + * + * Assembler macros to generate .byte/.word code for particular + * vector instructions that are supported by recent binutils (>= 2.26) only. + * + * Copyright IBM Corp. 2015 + * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> + */ + +#ifndef __ASM_S390_VX_INSN_H +#define __ASM_S390_VX_INSN_H + +#ifdef __ASSEMBLY__ + + +/* Macros to generate vector instruction byte code */ + +#define REG_NUM_INVALID 255 + +/* GR_NUM - Retrieve general-purpose register number + * + * @opd: Operand to store register number + * @r64: String designation register in the format "%rN" + */ +.macro GR_NUM opd gr + \opd = REG_NUM_INVALID + .ifc \gr,%r0 + \opd = 0 + .endif + .ifc \gr,%r1 + \opd = 1 + .endif + .ifc \gr,%r2 + \opd = 2 + .endif + .ifc \gr,%r3 + \opd = 3 + .endif + .ifc \gr,%r4 + \opd = 4 + .endif + .ifc \gr,%r5 + \opd = 5 + .endif + .ifc \gr,%r6 + \opd = 6 + .endif + .ifc \gr,%r7 + \opd = 7 + .endif + .ifc \gr,%r8 + \opd = 8 + .endif + .ifc \gr,%r9 + \opd = 9 + .endif + .ifc \gr,%r10 + \opd = 10 + .endif + .ifc \gr,%r11 + \opd = 11 + .endif + .ifc \gr,%r12 + \opd = 12 + .endif + .ifc \gr,%r13 + \opd = 13 + .endif + .ifc \gr,%r14 + \opd = 14 + .endif + .ifc \gr,%r15 + \opd = 15 + .endif + .if \opd == REG_NUM_INVALID + .error "Invalid general-purpose register designation: \gr" + .endif +.endm + +/* VX_R() - Macro to encode the VX_NUM into the instruction */ +#define VX_R(v) (v & 0x0F) + +/* VX_NUM - Retrieve vector register number + * + * @opd: Operand to store register number + * @vxr: String designation register in the format "%vN" + * + * The vector register number is used for as input number to the + * instruction and, as well as, to compute the RXB field of the + * instruction. To encode the particular vector register number, + * use the VX_R(v) macro to extract the instruction opcode. + */ +.macro VX_NUM opd vxr + \opd = REG_NUM_INVALID + .ifc \vxr,%v0 + \opd = 0 + .endif + .ifc \vxr,%v1 + \opd = 1 + .endif + .ifc \vxr,%v2 + \opd = 2 + .endif + .ifc \vxr,%v3 + \opd = 3 + .endif + .ifc \vxr,%v4 + \opd = 4 + .endif + .ifc \vxr,%v5 + \opd = 5 + .endif + .ifc \vxr,%v6 + \opd = 6 + .endif + .ifc \vxr,%v7 + \opd = 7 + .endif + .ifc \vxr,%v8 + \opd = 8 + .endif + .ifc \vxr,%v9 + \opd = 9 + .endif + .ifc \vxr,%v10 + \opd = 10 + .endif + .ifc \vxr,%v11 + \opd = 11 + .endif + .ifc \vxr,%v12 + \opd = 12 + .endif + .ifc \vxr,%v13 + \opd = 13 + .endif + .ifc \vxr,%v14 + \opd = 14 + .endif + .ifc \vxr,%v15 + \opd = 15 + .endif + .ifc \vxr,%v16 + \opd = 16 + .endif + .ifc \vxr,%v17 + \opd = 17 + .endif + .ifc \vxr,%v18 + \opd = 18 + .endif + .ifc \vxr,%v19 + \opd = 19 + .endif + .ifc \vxr,%v20 + \opd = 20 + .endif + .ifc \vxr,%v21 + \opd = 21 + .endif + .ifc \vxr,%v22 + \opd = 22 + .endif + .ifc \vxr,%v23 + \opd = 23 + .endif + .ifc \vxr,%v24 + \opd = 24 + .endif + .ifc \vxr,%v25 + \opd = 25 + .endif + .ifc \vxr,%v26 + \opd = 26 + .endif + .ifc \vxr,%v27 + \opd = 27 + .endif + .ifc \vxr,%v28 + \opd = 28 + .endif + .ifc \vxr,%v29 + \opd = 29 + .endif + .ifc \vxr,%v30 + \opd = 30 + .endif + .ifc \vxr,%v31 + \opd = 31 + .endif + .if \opd == REG_NUM_INVALID + .error "Invalid vector register designation: \vxr" + .endif +.endm + +/* RXB - Compute most significant bit used vector registers + * + * @rxb: Operand to store computed RXB value + * @v1: First vector register designated operand + * @v2: Second vector register designated operand + * @v3: Third vector register designated operand + * @v4: Fourth vector register designated operand + */ +.macro RXB rxb v1 v2=0 v3=0 v4=0 + \rxb = 0 + .if \v1 & 0x10 + \rxb = \rxb | 0x08 + .endif + .if \v2 & 0x10 + \rxb = \rxb | 0x04 + .endif + .if \v3 & 0x10 + \rxb = \rxb | 0x02 + .endif + .if \v4 & 0x10 + \rxb = \rxb | 0x01 + .endif +.endm + +/* MRXB - Generate Element Size Control and RXB value + * + * @m: Element size control + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXB m v1 v2=0 v3=0 v4=0 + rxb = 0 + RXB rxb, \v1, \v2, \v3, \v4 + .byte (\m << 4) | rxb +.endm + +/* MRXBOPC - Generate Element Size Control, RXB, and final Opcode fields + * + * @m: Element size control + * @opc: Opcode + * @v1: First vector register designated operand (for RXB) + * @v2: Second vector register designated operand (for RXB) + * @v3: Third vector register designated operand (for RXB) + * @v4: Fourth vector register designated operand (for RXB) + */ +.macro MRXBOPC m opc v1 v2=0 v3=0 v4=0 + MRXB \m, \v1, \v2, \v3, \v4 + .byte \opc +.endm + +/* Vector support instructions */ + +/* VECTOR GENERATE BYTE MASK */ +.macro VGBM vr imm2 + VX_NUM v1, \vr + .word (0xE700 | (VX_R(v1) << 4)) + .word \imm2 + MRXBOPC 0, 0x44, v1 +.endm +.macro VZERO vxr + VGBM \vxr, 0 +.endm +.macro VONE vxr + VGBM \vxr, 0xFFFF +.endm + +/* VECTOR LOAD VR ELEMENT FROM GR */ +.macro VLVG v, gr, disp, m + VX_NUM v1, \v + GR_NUM b2, "%r0" + GR_NUM r3, \gr + .word 0xE700 | (VX_R(v1) << 4) | r3 + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x22, v1 +.endm +.macro VLVGB v, gr, index, base + VLVG \v, \gr, \index, \base, 0 +.endm +.macro VLVGH v, gr, index + VLVG \v, \gr, \index, 1 +.endm +.macro VLVGF v, gr, index + VLVG \v, \gr, \index, 2 +.endm +.macro VLVGG v, gr, index + VLVG \v, \gr, \index, 3 +.endm + +/* VECTOR LOAD */ +.macro VL v, disp, index="%r0", base + VX_NUM v1, \v + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | (VX_R(v1) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x06, v1 +.endm + +/* VECTOR LOAD ELEMENT */ +.macro VLEx vr1, disp, index="%r0", base, m3, opc + VX_NUM v1, \vr1 + GR_NUM x2, \index + GR_NUM b2, \base + .word 0xE700 | (VX_R(v1) << 4) | x2 + .word (b2 << 12) | (\disp) + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEB vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x00 +.endm +.macro VLEH vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x01 +.endm +.macro VLEF vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x03 +.endm +.macro VLEG vr1, disp, index="%r0", base, m3 + VLEx \vr1, \disp, \index, \base, \m3, 0x02 +.endm + +/* VECTOR LOAD ELEMENT IMMEDIATE */ +.macro VLEIx vr1, imm2, m3, opc + VX_NUM v1, \vr1 + .word 0xE700 | (VX_R(v1) << 4) + .word \imm2 + MRXBOPC \m3, \opc, v1 +.endm +.macro VLEIB vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x40 +.endm +.macro VLEIH vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x41 +.endm +.macro VLEIF vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x43 +.endm +.macro VLEIG vr1, imm2, index + VLEIx \vr1, \imm2, \index, 0x42 +.endm + +/* VECTOR LOAD GR FROM VR ELEMENT */ +.macro VLGV gr, vr, disp, base="%r0", m + GR_NUM r1, \gr + GR_NUM b2, \base + VX_NUM v3, \vr + .word 0xE700 | (r1 << 4) | VX_R(v3) + .word (b2 << 12) | (\disp) + MRXBOPC \m, 0x21, v3 +.endm +.macro VLGVB gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 0 +.endm +.macro VLGVH gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 1 +.endm +.macro VLGVF gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 2 +.endm +.macro VLGVG gr, vr, disp, base="%r0" + VLGV \gr, \vr, \disp, \base, 3 +.endm + +/* VECTOR LOAD MULTIPLE */ +.macro VLM vfrom, vto, disp, base + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base /* Base register */ + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v3) + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x36, v1, v3 +.endm + +/* VECTOR STORE MULTIPLE */ +.macro VSTM vfrom, vto, disp, base + VX_NUM v1, \vfrom + VX_NUM v3, \vto + GR_NUM b2, \base /* Base register */ + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v3) + .word (b2 << 12) | (\disp) + MRXBOPC 0, 0x3E, v1, v3 +.endm + +/* VECTOR PERMUTE */ +.macro VPERM vr1, vr2, vr3, vr4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) + .word (VX_R(v3) << 12) + MRXBOPC VX_R(v4), 0x8C, v1, v2, v3, v4 +.endm + +/* VECTOR UNPACK LOGICAL LOW */ +.macro VUPLL vr1, vr2, m3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) + .word 0x0000 + MRXBOPC \m3, 0xD4, v1, v2 +.endm +.macro VUPLLB vr1, vr2 + VUPLL \vr1, \vr2, 0 +.endm +.macro VUPLLH vr1, vr2 + VUPLL \vr1, \vr2, 1 +.endm +.macro VUPLLF vr1, vr2 + VUPLL \vr1, \vr2, 2 +.endm + + +/* Vector integer instructions */ + +/* VECTOR EXCLUSIVE OR */ +.macro VX vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) + .word (VX_R(v3) << 12) + MRXBOPC 0, 0x6D, v1, v2, v3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM */ +.macro VGFM vr1, vr2, vr3, m4 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) + .word (VX_R(v3) << 12) + MRXBOPC \m4, 0xB4, v1, v2, v3 +.endm +.macro VGFMB vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 0 +.endm +.macro VGFMH vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 1 +.endm +.macro VGFMF vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 2 +.endm +.macro VGFMG vr1, vr2, vr3 + VGFM \vr1, \vr2, \vr3, 3 +.endm + +/* VECTOR GALOIS FIELD MULTIPLY SUM AND ACCUMULATE */ +.macro VGFMA vr1, vr2, vr3, vr4, m5 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + VX_NUM v4, \vr4 + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) + .word (VX_R(v3) << 12) | (\m5 << 8) + MRXBOPC VX_R(v4), 0xBC, v1, v2, v3, v4 +.endm +.macro VGFMAB vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 0 +.endm +.macro VGFMAH vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 1 +.endm +.macro VGFMAF vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 2 +.endm +.macro VGFMAG vr1, vr2, vr3, vr4 + VGFMA \vr1, \vr2, \vr3, \vr4, 3 +.endm + +/* VECTOR SHIFT RIGHT LOGICAL BY BYTE */ +.macro VSRLB vr1, vr2, vr3 + VX_NUM v1, \vr1 + VX_NUM v2, \vr2 + VX_NUM v3, \vr3 + .word 0xE700 | (VX_R(v1) << 4) | VX_R(v2) + .word (VX_R(v3) << 12) + MRXBOPC 0, 0x7D, v1, v2, v3 +.endm + + +#endif /* __ASSEMBLY__ */ +#endif /* __ASM_S390_VX_INSN_H */ diff --git a/arch/s390/include/uapi/asm/unistd.h b/arch/s390/include/uapi/asm/unistd.h index 67878af257a0..59d2bb4e2d0c 100644 --- a/arch/s390/include/uapi/asm/unistd.h +++ b/arch/s390/include/uapi/asm/unistd.h @@ -204,9 +204,9 @@ #define __NR_statfs64 265 #define __NR_fstatfs64 266 #define __NR_remap_file_pages 267 -/* Number 268 is reserved for new sys_mbind */ -/* Number 269 is reserved for new sys_get_mempolicy */ -/* Number 270 is reserved for new sys_set_mempolicy */ +#define __NR_mbind 268 +#define __NR_get_mempolicy 269 +#define __NR_set_mempolicy 270 #define __NR_mq_open 271 #define __NR_mq_unlink 272 #define __NR_mq_timedsend 273 @@ -223,7 +223,7 @@ #define __NR_inotify_init 284 #define __NR_inotify_add_watch 285 #define __NR_inotify_rm_watch 286 -/* Number 287 is reserved for new sys_migrate_pages */ +#define __NR_migrate_pages 287 #define __NR_openat 288 #define __NR_mkdirat 289 #define __NR_mknodat 290 @@ -245,7 +245,7 @@ #define __NR_sync_file_range 307 #define __NR_tee 308 #define __NR_vmsplice 309 -/* Number 310 is reserved for new sys_move_pages */ +#define __NR_move_pages 310 #define __NR_getcpu 311 #define __NR_epoll_pwait 312 #define __NR_utimes 313 diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index ffb87617a36c..b756c6348ac6 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -28,6 +28,17 @@ CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' CFLAGS_sysinfo.o += -w +# +# Use -march=z900 for sclp.c to be able to print an error message if +# the kernel is started on a machine which is too old +# +CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE) +ifneq ($(CC_FLAGS_MARCH),-march=z900) +CFLAGS_REMOVE_sclp.o += $(CC_FLAGS_MARCH) +CFLAGS_sclp.o += -march=z900 +endif +GCOV_PROFILE_sclp.o := n + obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o sclp.o vdso.o diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index a2da259d9327..48c9af7a7683 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -28,6 +28,9 @@ int main(void) DEFINE(__TASK_pid, offsetof(struct task_struct, pid)); BLANK(); DEFINE(__THREAD_ksp, offsetof(struct thread_struct, ksp)); + DEFINE(__THREAD_FPU_fpc, offsetof(struct thread_struct, fpu.fpc)); + DEFINE(__THREAD_FPU_flags, offsetof(struct thread_struct, fpu.flags)); + DEFINE(__THREAD_FPU_regs, offsetof(struct thread_struct, fpu.regs)); DEFINE(__THREAD_per_cause, offsetof(struct thread_struct, per_event.cause)); DEFINE(__THREAD_per_address, offsetof(struct thread_struct, per_event.address)); DEFINE(__THREAD_per_paid, offsetof(struct thread_struct, per_event.paid)); diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index fe8d6924efaa..eb4664238613 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -153,33 +153,14 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) /* Store registers needed to create the signal frame */ static void store_sigregs(void) { - int i; - save_access_regs(current->thread.acrs); - save_fp_ctl(¤t->thread.fp_regs.fpc); - if (current->thread.vxrs) { - save_vx_regs(current->thread.vxrs); - for (i = 0; i < __NUM_FPRS; i++) - current->thread.fp_regs.fprs[i] = - *(freg_t *)(current->thread.vxrs + i); - } else - save_fp_regs(current->thread.fp_regs.fprs); + save_fpu_regs(); } /* Load registers after signal return */ static void load_sigregs(void) { - int i; - restore_access_regs(current->thread.acrs); - /* restore_fp_ctl is done in restore_sigregs */ - if (current->thread.vxrs) { - for (i = 0; i < __NUM_FPRS; i++) - *(freg_t *)(current->thread.vxrs + i) = - current->thread.fp_regs.fprs[i]; - restore_vx_regs(current->thread.vxrs); - } else - restore_fp_regs(current->thread.fp_regs.fprs); } static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) @@ -196,8 +177,7 @@ static int save_sigregs32(struct pt_regs *regs, _sigregs32 __user *sregs) user_sregs.regs.gprs[i] = (__u32) regs->gprs[i]; memcpy(&user_sregs.regs.acrs, current->thread.acrs, sizeof(user_sregs.regs.acrs)); - memcpy(&user_sregs.fpregs, ¤t->thread.fp_regs, - sizeof(user_sregs.fpregs)); + fpregs_store((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs32))) return -EFAULT; return 0; @@ -217,8 +197,8 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW32_MASK_RI)) return -EINVAL; - /* Loading the floating-point-control word can fail. Do that first. */ - if (restore_fp_ctl(&user_sregs.fpregs.fpc)) + /* Test the floating-point-control word. */ + if (test_fp_ctl(user_sregs.fpregs.fpc)) return -EINVAL; /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ @@ -235,9 +215,7 @@ static int restore_sigregs32(struct pt_regs *regs,_sigregs32 __user *sregs) regs->gprs[i] = (__u64) user_sregs.regs.gprs[i]; memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, sizeof(current->thread.acrs)); - - memcpy(¤t->thread.fp_regs, &user_sregs.fpregs, - sizeof(current->thread.fp_regs)); + fpregs_load((_s390_fp_regs *) &user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ return 0; @@ -258,13 +236,13 @@ static int save_sigregs_ext32(struct pt_regs *regs, return -EFAULT; /* Save vector registers to signal stack */ - if (current->thread.vxrs) { + if (is_vx_task(current)) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.vxrs + i) + 1); + vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, - current->thread.vxrs + __NUM_VXRS_LOW, + current->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(sregs_ext->vxrs_high))) return -EFAULT; } @@ -286,15 +264,15 @@ static int restore_sigregs_ext32(struct pt_regs *regs, *(__u32 *)®s->gprs[i] = gprs_high[i]; /* Restore vector registers from signal stack */ - if (current->thread.vxrs) { + if (is_vx_task(current)) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.vxrs + __NUM_VXRS_LOW, + __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, &sregs_ext->vxrs_high, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.vxrs + i) + 1) = vxrs[i]; + *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; } return 0; } @@ -308,6 +286,7 @@ COMPAT_SYSCALL_DEFINE0(sigreturn) if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE32)) goto badframe; set_current_blocked(&set); + save_fpu_regs(); if (restore_sigregs32(regs, &frame->sregs)) goto badframe; if (restore_sigregs_ext32(regs, &frame->sregs_ext)) @@ -330,6 +309,7 @@ COMPAT_SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; + save_fpu_regs(); if (restore_sigregs32(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_sigregs_ext32(regs, &frame->uc.uc_mcontext_ext)) @@ -472,7 +452,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, */ uc_flags = UC_GPRS_HIGH; if (MACHINE_HAS_VX) { - if (current->thread.vxrs) + if (is_vx_task(current)) uc_flags |= UC_VXRS; } else frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) + diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 84062e7a77da..247b7aae4c6d 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -20,6 +20,8 @@ #include <asm/page.h> #include <asm/sigp.h> #include <asm/irq.h> +#include <asm/fpu-internal.h> +#include <asm/vx-insn.h> __PT_R0 = __PT_GPRS __PT_R1 = __PT_GPRS + 8 @@ -46,10 +48,10 @@ _TIF_WORK = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ _TIF_UPROBE) _TIF_TRACE = (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ _TIF_SYSCALL_TRACEPOINT) -_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE) +_CIF_WORK = (_CIF_MCCK_PENDING | _CIF_ASCE | _CIF_FPU) _PIF_WORK = (_PIF_PER_TRAP) -#define BASED(name) name-system_call(%r13) +#define BASED(name) name-cleanup_critical(%r13) .macro TRACE_IRQS_ON #ifdef CONFIG_TRACE_IRQFLAGS @@ -73,38 +75,6 @@ _PIF_WORK = (_PIF_PER_TRAP) #endif .endm - .macro LPP newpp -#if IS_ENABLED(CONFIG_KVM) - tm __LC_MACHINE_FLAGS+6,0x20 # MACHINE_FLAG_LPP - jz .+8 - .insn s,0xb2800000,\newpp -#endif - .endm - - .macro HANDLE_SIE_INTERCEPT scratch,reason -#if IS_ENABLED(CONFIG_KVM) - tmhh %r8,0x0001 # interrupting from user ? - jnz .+62 - lgr \scratch,%r9 - slg \scratch,BASED(.Lsie_critical) - clg \scratch,BASED(.Lsie_critical_length) - .if \reason==1 - # Some program interrupts are suppressing (e.g. protection). - # We must also check the instruction after SIE in that case. - # do_protection_exception will rewind to .Lrewind_pad - jh .+42 - .else - jhe .+42 - .endif - lg %r14,__SF_EMPTY(%r15) # get control block pointer - LPP __SF_EMPTY+16(%r15) # set host id - ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - larl %r9,sie_exit # skip forward to sie_exit - mvi __SF_EMPTY+31(%r15),\reason # set exit reason -#endif - .endm - .macro CHECK_STACK stacksize,savearea #ifdef CONFIG_CHECK_STACK tml %r15,\stacksize - CONFIG_STACK_GUARD @@ -113,7 +83,7 @@ _PIF_WORK = (_PIF_PER_TRAP) #endif .endm - .macro SWITCH_ASYNC savearea,stack,shift + .macro SWITCH_ASYNC savearea,timer tmhh %r8,0x0001 # interrupting from user ? jnz 1f lgr %r14,%r9 @@ -124,26 +94,28 @@ _PIF_WORK = (_PIF_PER_TRAP) brasl %r14,cleanup_critical tmhh %r8,0x0001 # retest problem state after cleanup jnz 1f -0: lg %r14,\stack # are we already on the target stack? +0: lg %r14,__LC_ASYNC_STACK # are we already on the async stack? slgr %r14,%r15 - srag %r14,%r14,\shift - jnz 1f - CHECK_STACK 1<<\shift,\savearea + srag %r14,%r14,STACK_SHIFT + jnz 2f + CHECK_STACK 1<<STACK_SHIFT,\savearea aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 2f -1: lg %r15,\stack # load target stack -2: la %r11,STACK_FRAME_OVERHEAD(%r15) + j 3f +1: LAST_BREAK %r14 + UPDATE_VTIME %r14,%r15,\timer +2: lg %r15,__LC_ASYNC_STACK # load async stack +3: la %r11,STACK_FRAME_OVERHEAD(%r15) .endm - .macro UPDATE_VTIME scratch,enter_timer - lg \scratch,__LC_EXIT_TIMER - slg \scratch,\enter_timer - alg \scratch,__LC_USER_TIMER - stg \scratch,__LC_USER_TIMER - lg \scratch,__LC_LAST_UPDATE_TIMER - slg \scratch,__LC_EXIT_TIMER - alg \scratch,__LC_SYSTEM_TIMER - stg \scratch,__LC_SYSTEM_TIMER + .macro UPDATE_VTIME w1,w2,enter_timer + lg \w1,__LC_EXIT_TIMER + lg \w2,__LC_LAST_UPDATE_TIMER + slg \w1,\enter_timer + slg \w2,__LC_EXIT_TIMER + alg \w1,__LC_USER_TIMER + alg \w2,__LC_SYSTEM_TIMER + stg \w1,__LC_USER_TIMER + stg \w2,__LC_SYSTEM_TIMER mvc __LC_LAST_UPDATE_TIMER(8),\enter_timer .endm @@ -197,6 +169,69 @@ ENTRY(__switch_to) br %r14 .L__critical_start: + +#if IS_ENABLED(CONFIG_KVM) +/* + * sie64a calling convention: + * %r2 pointer to sie control block + * %r3 guest register save area + */ +ENTRY(sie64a) + stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers + stg %r2,__SF_EMPTY(%r15) # save control block pointer + stg %r3,__SF_EMPTY+8(%r15) # save guest register save area + xc __SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason + tm __LC_CPU_FLAGS+7,_CIF_FPU # load guest fp/vx registers ? + jno .Lsie_load_guest_gprs + brasl %r14,load_fpu_regs # load guest fp/vx regs +.Lsie_load_guest_gprs: + lmg %r0,%r13,0(%r3) # load guest gprs 0-13 + lg %r14,__LC_GMAP # get gmap pointer + ltgr %r14,%r14 + jz .Lsie_gmap + lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce +.Lsie_gmap: + lg %r14,__SF_EMPTY(%r15) # get control block pointer + oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now + tm __SIE_PROG20+3(%r14),3 # last exit... + jnz .Lsie_skip + tm __LC_CPU_FLAGS+7,_CIF_FPU + jo .Lsie_skip # exit if fp/vx regs changed + tm __LC_MACHINE_FLAGS+6,0x20 # MACHINE_FLAG_LPP + jz .Lsie_enter + .insn s,0xb2800000,__LC_CURRENT_PID # set guest id to pid +.Lsie_enter: + sie 0(%r14) + tm __LC_MACHINE_FLAGS+6,0x20 # MACHINE_FLAG_LPP + jz .Lsie_skip + .insn s,0xb2800000,__SF_EMPTY+16(%r15)# set host id +.Lsie_skip: + ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce +.Lsie_done: +# some program checks are suppressing. C code (e.g. do_protection_exception) +# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other +# instructions between sie64a and .Lsie_done should not cause program +# interrupts. So lets use a nop (47 00 00 00) as a landing pad. +# See also .Lcleanup_sie +.Lrewind_pad: + nop 0 + .globl sie_exit +sie_exit: + lg %r14,__SF_EMPTY+8(%r15) # load guest register save area + stmg %r0,%r13,0(%r14) # save guest gprs 0-13 + lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers + lg %r2,__SF_EMPTY+24(%r15) # return exit reason code + br %r14 +.Lsie_fault: + lghi %r14,-EFAULT + stg %r14,__SF_EMPTY+24(%r15) # set exit reason code + j sie_exit + + EX_TABLE(.Lrewind_pad,.Lsie_fault) + EX_TABLE(sie_exit,.Lsie_fault) +#endif + /* * SVC interrupt handler routine. System calls are synchronous events and * are executed with interrupts enabled. @@ -212,9 +247,9 @@ ENTRY(system_call) .Lsysc_per: lg %r15,__LC_KERNEL_STACK la %r11,STACK_FRAME_OVERHEAD(%r15) # pointer to pt_regs -.Lsysc_vtime: - UPDATE_VTIME %r13,__LC_SYNC_ENTER_TIMER LAST_BREAK %r13 +.Lsysc_vtime: + UPDATE_VTIME %r10,%r13,__LC_SYNC_ENTER_TIMER stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC mvc __PT_PSW(16,%r11),__LC_SVC_OLD_PSW @@ -244,8 +279,6 @@ ENTRY(system_call) .Lsysc_return: LOCKDEP_SYS_EXIT .Lsysc_tif: - tm __PT_PSW+1(%r11),0x01 # returning to user ? - jno .Lsysc_restore tm __PT_FLAGS+7(%r11),_PIF_WORK jnz .Lsysc_work tm __TI_flags+7(%r12),_TIF_WORK @@ -280,6 +313,8 @@ ENTRY(system_call) jo .Lsysc_sigpending tm __TI_flags+7(%r12),_TIF_NOTIFY_RESUME jo .Lsysc_notify_resume + tm __LC_CPU_FLAGS+7,_CIF_FPU + jo .Lsysc_vxrs tm __LC_CPU_FLAGS+7,_CIF_ASCE jo .Lsysc_uaccess j .Lsysc_return # beware of critical section cleanup @@ -307,6 +342,13 @@ ENTRY(system_call) j .Lsysc_return # +# CIF_FPU is set, restore floating-point controls and floating-point registers. +# +.Lsysc_vxrs: + larl %r14,.Lsysc_return + jg load_fpu_regs + +# # _TIF_SIGPENDING is set, call do_signal # .Lsysc_sigpending: @@ -405,28 +447,35 @@ ENTRY(pgm_check_handler) stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK lg %r12,__LC_THREAD_INFO - larl %r13,system_call + larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW - HANDLE_SIE_INTERCEPT %r14,1 tmhh %r8,0x0001 # test problem state bit - jnz 1f # -> fault in user space - tmhh %r8,0x4000 # PER bit set in old PSW ? - jnz 0f # -> enabled, can't be a double fault + jnz 2f # -> fault in user space +#if IS_ENABLED(CONFIG_KVM) + # cleanup critical section for sie64a + lgr %r14,%r9 + slg %r14,BASED(.Lsie_critical_start) + clg %r14,BASED(.Lsie_critical_length) + jhe 0f + brasl %r14,.Lcleanup_sie +#endif +0: tmhh %r8,0x4000 # PER bit set in old PSW ? + jnz 1f # -> enabled, can't be a double fault tm __LC_PGM_ILC+3,0x80 # check for per exception jnz .Lpgm_svcper # -> single stepped svc -0: CHECK_STACK STACK_SIZE,__LC_SAVE_AREA_SYNC +1: CHECK_STACK STACK_SIZE,__LC_SAVE_AREA_SYNC aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) - j 2f -1: UPDATE_VTIME %r14,__LC_SYNC_ENTER_TIMER - LAST_BREAK %r14 + j 3f +2: LAST_BREAK %r14 + UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER lg %r15,__LC_KERNEL_STACK lg %r14,__TI_task(%r12) aghi %r14,__TASK_thread # pointer to thread_struct lghi %r13,__LC_PGM_TDB tm __LC_PGM_ILC+2,0x02 # check for transaction abort - jz 2f + jz 3f mvc __THREAD_trap_tdb(256,%r14),0(%r13) -2: la %r11,STACK_FRAME_OVERHEAD(%r15) +3: la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC stmg %r8,%r9,__PT_PSW(%r11) @@ -435,24 +484,28 @@ ENTRY(pgm_check_handler) xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) stg %r10,__PT_ARGS(%r11) tm __LC_PGM_ILC+3,0x80 # check for per exception - jz 0f + jz 4f tmhh %r8,0x0001 # kernel per event ? jz .Lpgm_kprobe oi __PT_FLAGS+7(%r11),_PIF_PER_TRAP mvc __THREAD_per_address(8,%r14),__LC_PER_ADDRESS mvc __THREAD_per_cause(2,%r14),__LC_PER_CODE mvc __THREAD_per_paid(1,%r14),__LC_PER_ACCESS_ID -0: REENABLE_IRQS +4: REENABLE_IRQS xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) larl %r1,pgm_check_table llgh %r10,__PT_INT_CODE+2(%r11) nill %r10,0x007f sll %r10,2 - je .Lsysc_return + je .Lpgm_return lgf %r1,0(%r10,%r1) # load address of handler routine lgr %r2,%r11 # pass pointer to pt_regs basr %r14,%r1 # branch to interrupt-handler - j .Lsysc_return +.Lpgm_return: + LOCKDEP_SYS_EXIT + tm __PT_PSW+1(%r11),0x01 # returning to user ? + jno .Lsysc_restore + j .Lsysc_tif # # PER event in supervisor state, must be kprobes @@ -462,7 +515,7 @@ ENTRY(pgm_check_handler) xc __SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15) lgr %r2,%r11 # pass pointer to pt_regs brasl %r14,do_per_trap - j .Lsysc_return + j .Lpgm_return # # single stepped system call @@ -483,15 +536,9 @@ ENTRY(io_int_handler) stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r10,__LC_LAST_BREAK lg %r12,__LC_THREAD_INFO - larl %r13,system_call + larl %r13,cleanup_critical lmg %r8,%r9,__LC_IO_OLD_PSW - HANDLE_SIE_INTERCEPT %r14,2 - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT - tmhh %r8,0x0001 # interrupting from user? - jz .Lio_skip - UPDATE_VTIME %r14,__LC_ASYNC_ENTER_TIMER - LAST_BREAK %r14 -.Lio_skip: + SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC stmg %r8,%r9,__PT_PSW(%r11) @@ -587,6 +634,8 @@ ENTRY(io_int_handler) jo .Lio_sigpending tm __TI_flags+7(%r12),_TIF_NOTIFY_RESUME jo .Lio_notify_resume + tm __LC_CPU_FLAGS+7,_CIF_FPU + jo .Lio_vxrs tm __LC_CPU_FLAGS+7,_CIF_ASCE jo .Lio_uaccess j .Lio_return # beware of critical section cleanup @@ -609,6 +658,13 @@ ENTRY(io_int_handler) j .Lio_return # +# CIF_FPU is set, restore floating-point controls and floating-point registers. +# +.Lio_vxrs: + larl %r14,.Lio_return + jg load_fpu_regs + +# # _TIF_NEED_RESCHED is set, call schedule # .Lio_reschedule: @@ -652,15 +708,9 @@ ENTRY(ext_int_handler) stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r10,__LC_LAST_BREAK lg %r12,__LC_THREAD_INFO - larl %r13,system_call + larl %r13,cleanup_critical lmg %r8,%r9,__LC_EXT_OLD_PSW - HANDLE_SIE_INTERCEPT %r14,3 - SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_STACK,STACK_SHIFT - tmhh %r8,0x0001 # interrupting from user ? - jz .Lext_skip - UPDATE_VTIME %r14,__LC_ASYNC_ENTER_TIMER - LAST_BREAK %r14 -.Lext_skip: + SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_ASYNC stmg %r8,%r9,__PT_PSW(%r11) @@ -690,6 +740,122 @@ ENTRY(psw_idle) br %r14 .Lpsw_idle_end: +/* Store floating-point controls and floating-point or vector extension + * registers instead. A critical section cleanup assures that the registers + * are stored even if interrupted for some other work. The register %r2 + * designates a struct fpu to store register contents. If the specified + * structure does not contain a register save area, the register store is + * omitted (see also comments in arch_dup_task_struct()). + * + * The CIF_FPU flag is set in any case. The CIF_FPU triggers a lazy restore + * of the register contents at system call or io return. + */ +ENTRY(save_fpu_regs) + lg %r2,__LC_CURRENT + aghi %r2,__TASK_thread + tm __LC_CPU_FLAGS+7,_CIF_FPU + bor %r14 + stfpc __THREAD_FPU_fpc(%r2) +.Lsave_fpu_regs_fpc_end: + lg %r3,__THREAD_FPU_regs(%r2) + ltgr %r3,%r3 + jz .Lsave_fpu_regs_done # no save area -> set CIF_FPU + tm __THREAD_FPU_flags+3(%r2),FPU_USE_VX + jz .Lsave_fpu_regs_fp # no -> store FP regs +.Lsave_fpu_regs_vx_low: + VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) +.Lsave_fpu_regs_vx_high: + VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) + j .Lsave_fpu_regs_done # -> set CIF_FPU flag +.Lsave_fpu_regs_fp: + std 0,0(%r3) + std 1,8(%r3) + std 2,16(%r3) + std 3,24(%r3) + std 4,32(%r3) + std 5,40(%r3) + std 6,48(%r3) + std 7,56(%r3) + std 8,64(%r3) + std 9,72(%r3) + std 10,80(%r3) + std 11,88(%r3) + std 12,96(%r3) + std 13,104(%r3) + std 14,112(%r3) + std 15,120(%r3) +.Lsave_fpu_regs_done: + oi __LC_CPU_FLAGS+7,_CIF_FPU + br %r14 +.Lsave_fpu_regs_end: + +/* Load floating-point controls and floating-point or vector extension + * registers. A critical section cleanup assures that the register contents + * are loaded even if interrupted for some other work. Depending on the saved + * FP/VX state, the vector-enablement control, CR0.46, is either set or cleared. + * + * There are special calling conventions to fit into sysc and io return work: + * %r15: <kernel stack> + * The function requires: + * %r4 and __SF_EMPTY+32(%r15) + */ +load_fpu_regs: + lg %r4,__LC_CURRENT + aghi %r4,__TASK_thread + tm __LC_CPU_FLAGS+7,_CIF_FPU + bnor %r14 + lfpc __THREAD_FPU_fpc(%r4) + stctg %c0,%c0,__SF_EMPTY+32(%r15) # store CR0 + tm __THREAD_FPU_flags+3(%r4),FPU_USE_VX # VX-enabled task ? + lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area + jz .Lload_fpu_regs_fp_ctl # -> no VX, load FP regs +.Lload_fpu_regs_vx_ctl: + tm __SF_EMPTY+32+5(%r15),2 # test VX control + jo .Lload_fpu_regs_vx + oi __SF_EMPTY+32+5(%r15),2 # set VX control + lctlg %c0,%c0,__SF_EMPTY+32(%r15) +.Lload_fpu_regs_vx: + VLM %v0,%v15,0,%r4 +.Lload_fpu_regs_vx_high: + VLM %v16,%v31,256,%r4 + j .Lload_fpu_regs_done +.Lload_fpu_regs_fp_ctl: + tm __SF_EMPTY+32+5(%r15),2 # test VX control + jz .Lload_fpu_regs_fp + ni __SF_EMPTY+32+5(%r15),253 # clear VX control + lctlg %c0,%c0,__SF_EMPTY+32(%r15) +.Lload_fpu_regs_fp: + ld 0,0(%r4) + ld 1,8(%r4) + ld 2,16(%r4) + ld 3,24(%r4) + ld 4,32(%r4) + ld 5,40(%r4) + ld 6,48(%r4) + ld 7,56(%r4) + ld 8,64(%r4) + ld 9,72(%r4) + ld 10,80(%r4) + ld 11,88(%r4) + ld 12,96(%r4) + ld 13,104(%r4) + ld 14,112(%r4) + ld 15,120(%r4) +.Lload_fpu_regs_done: + ni __LC_CPU_FLAGS+7,255-_CIF_FPU + br %r14 +.Lload_fpu_regs_end: + +/* Test and set the vector enablement control in CR0.46 */ +ENTRY(__ctl_set_vx) + stctg %c0,%c0,__SF_EMPTY(%r15) + tm __SF_EMPTY+5(%r15),2 + bor %r14 + oi __SF_EMPTY+5(%r15),2 + lctlg %c0,%c0,__SF_EMPTY(%r15) + br %r14 +.L__ctl_set_vx_end: + .L__critical_end: /* @@ -702,9 +868,8 @@ ENTRY(mcck_int_handler) lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs lg %r10,__LC_LAST_BREAK lg %r12,__LC_THREAD_INFO - larl %r13,system_call + larl %r13,cleanup_critical lmg %r8,%r9,__LC_MCK_OLD_PSW - HANDLE_SIE_INTERCEPT %r14,4 tm __LC_MCCK_CODE,0x80 # system damage? jo .Lmcck_panic # yes -> rest of mcck code invalid lghi %r14,__LC_CPU_TIMER_SAVE_AREA @@ -725,11 +890,7 @@ ENTRY(mcck_int_handler) mvc __LC_MCCK_ENTER_TIMER(8),0(%r14) 3: tm __LC_MCCK_CODE+2,0x09 # mwp + ia of old psw valid? jno .Lmcck_panic # no -> skip cleanup critical - SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_PANIC_STACK,PAGE_SHIFT - tm %r8,0x0001 # interrupting from user ? - jz .Lmcck_skip - UPDATE_VTIME %r14,__LC_MCCK_ENTER_TIMER - LAST_BREAK %r14 + SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER .Lmcck_skip: lghi %r14,__LC_GPREGS_SAVE_AREA+64 stmg %r0,%r7,__PT_R0(%r11) @@ -764,12 +925,8 @@ ENTRY(mcck_int_handler) lpswe __LC_RETURN_MCCK_PSW .Lmcck_panic: - lg %r14,__LC_PANIC_STACK - slgr %r14,%r15 - srag %r14,%r14,PAGE_SHIFT - jz 0f lg %r15,__LC_PANIC_STACK -0: aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) j .Lmcck_skip # @@ -819,20 +976,13 @@ stack_overflow: jg kernel_stack_overflow #endif - .align 8 -.Lcleanup_table: - .quad system_call - .quad .Lsysc_do_svc - .quad .Lsysc_tif - .quad .Lsysc_restore - .quad .Lsysc_done - .quad .Lio_tif - .quad .Lio_restore - .quad .Lio_done - .quad psw_idle - .quad .Lpsw_idle_end - cleanup_critical: +#if IS_ENABLED(CONFIG_KVM) + clg %r9,BASED(.Lcleanup_table_sie) # .Lsie_gmap + jl 0f + clg %r9,BASED(.Lcleanup_table_sie+8)# .Lsie_done + jl .Lcleanup_sie +#endif clg %r9,BASED(.Lcleanup_table) # system_call jl 0f clg %r9,BASED(.Lcleanup_table+8) # .Lsysc_do_svc @@ -853,8 +1003,54 @@ cleanup_critical: jl 0f clg %r9,BASED(.Lcleanup_table+72) # .Lpsw_idle_end jl .Lcleanup_idle + clg %r9,BASED(.Lcleanup_table+80) # save_fpu_regs + jl 0f + clg %r9,BASED(.Lcleanup_table+88) # .Lsave_fpu_regs_end + jl .Lcleanup_save_fpu_regs + clg %r9,BASED(.Lcleanup_table+96) # load_fpu_regs + jl 0f + clg %r9,BASED(.Lcleanup_table+104) # .Lload_fpu_regs_end + jl .Lcleanup_load_fpu_regs + clg %r9,BASED(.Lcleanup_table+112) # __ctl_set_vx + jl 0f + clg %r9,BASED(.Lcleanup_table+120) # .L__ctl_set_vx_end + jl .Lcleanup___ctl_set_vx 0: br %r14 + .align 8 +.Lcleanup_table: + .quad system_call + .quad .Lsysc_do_svc + .quad .Lsysc_tif + .quad .Lsysc_restore + .quad .Lsysc_done + .quad .Lio_tif + .quad .Lio_restore + .quad .Lio_done + .quad psw_idle + .quad .Lpsw_idle_end + .quad save_fpu_regs + .quad .Lsave_fpu_regs_end + .quad load_fpu_regs + .quad .Lload_fpu_regs_end + .quad __ctl_set_vx + .quad .L__ctl_set_vx_end + +#if IS_ENABLED(CONFIG_KVM) +.Lcleanup_table_sie: + .quad .Lsie_gmap + .quad .Lsie_done + +.Lcleanup_sie: + lg %r9,__SF_EMPTY(%r15) # get control block pointer + tm __LC_MACHINE_FLAGS+6,0x20 # MACHINE_FLAG_LPP + jz 0f + .insn s,0xb2800000,__SF_EMPTY+16(%r15)# set host id +0: ni __SIE_PROG0C+3(%r9),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit + br %r14 +#endif .Lcleanup_system_call: # check if stpt has been executed @@ -915,7 +1111,7 @@ cleanup_critical: .quad system_call .quad .Lsysc_stmg .quad .Lsysc_per - .quad .Lsysc_vtime+18 + .quad .Lsysc_vtime+36 .quad .Lsysc_vtime+42 .Lcleanup_sysc_tif: @@ -981,6 +1177,145 @@ cleanup_critical: .Lcleanup_idle_insn: .quad .Lpsw_idle_lpsw +.Lcleanup_save_fpu_regs: + tm __LC_CPU_FLAGS+7,_CIF_FPU + bor %r14 + clg %r9,BASED(.Lcleanup_save_fpu_regs_done) + jhe 5f + clg %r9,BASED(.Lcleanup_save_fpu_regs_fp) + jhe 4f + clg %r9,BASED(.Lcleanup_save_fpu_regs_vx_high) + jhe 3f + clg %r9,BASED(.Lcleanup_save_fpu_regs_vx_low) + jhe 2f + clg %r9,BASED(.Lcleanup_save_fpu_fpc_end) + jhe 1f + lg %r2,__LC_CURRENT +0: # Store floating-point controls + stfpc __THREAD_FPU_fpc(%r2) +1: # Load register save area and check if VX is active + lg %r3,__THREAD_FPU_regs(%r2) + ltgr %r3,%r3 + jz 5f # no save area -> set CIF_FPU + tm __THREAD_FPU_flags+3(%r2),FPU_USE_VX + jz 4f # no VX -> store FP regs +2: # Store vector registers (V0-V15) + VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) +3: # Store vector registers (V16-V31) + VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) + j 5f # -> done, set CIF_FPU flag +4: # Store floating-point registers + std 0,0(%r3) + std 1,8(%r3) + std 2,16(%r3) + std 3,24(%r3) + std 4,32(%r3) + std 5,40(%r3) + std 6,48(%r3) + std 7,56(%r3) + std 8,64(%r3) + std 9,72(%r3) + std 10,80(%r3) + std 11,88(%r3) + std 12,96(%r3) + std 13,104(%r3) + std 14,112(%r3) + std 15,120(%r3) +5: # Set CIF_FPU flag + oi __LC_CPU_FLAGS+7,_CIF_FPU + lg %r9,48(%r11) # return from save_fpu_regs + br %r14 +.Lcleanup_save_fpu_fpc_end: + .quad .Lsave_fpu_regs_fpc_end +.Lcleanup_save_fpu_regs_vx_low: + .quad .Lsave_fpu_regs_vx_low +.Lcleanup_save_fpu_regs_vx_high: + .quad .Lsave_fpu_regs_vx_high +.Lcleanup_save_fpu_regs_fp: + .quad .Lsave_fpu_regs_fp +.Lcleanup_save_fpu_regs_done: + .quad .Lsave_fpu_regs_done + +.Lcleanup_load_fpu_regs: + tm __LC_CPU_FLAGS+7,_CIF_FPU + bnor %r14 + clg %r9,BASED(.Lcleanup_load_fpu_regs_done) + jhe 1f + clg %r9,BASED(.Lcleanup_load_fpu_regs_fp) + jhe 2f + clg %r9,BASED(.Lcleanup_load_fpu_regs_fp_ctl) + jhe 3f + clg %r9,BASED(.Lcleanup_load_fpu_regs_vx_high) + jhe 4f + clg %r9,BASED(.Lcleanup_load_fpu_regs_vx) + jhe 5f + clg %r9,BASED(.Lcleanup_load_fpu_regs_vx_ctl) + jhe 6f + lg %r4,__LC_CURRENT + lfpc __THREAD_FPU_fpc(%r4) + tm __THREAD_FPU_flags+3(%r4),FPU_USE_VX # VX-enabled task ? + lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area + jz 3f # -> no VX, load FP regs +6: # Set VX-enablement control + stctg %c0,%c0,__SF_EMPTY+32(%r15) # store CR0 + tm __SF_EMPTY+32+5(%r15),2 # test VX control + jo 5f + oi __SF_EMPTY+32+5(%r15),2 # set VX control + lctlg %c0,%c0,__SF_EMPTY+32(%r15) +5: # Load V0 ..V15 registers + VLM %v0,%v15,0,%r4 +4: # Load V16..V31 registers + VLM %v16,%v31,256,%r4 + j 1f +3: # Clear VX-enablement control for FP + stctg %c0,%c0,__SF_EMPTY+32(%r15) # store CR0 + tm __SF_EMPTY+32+5(%r15),2 # test VX control + jz 2f + ni __SF_EMPTY+32+5(%r15),253 # clear VX control + lctlg %c0,%c0,__SF_EMPTY+32(%r15) +2: # Load floating-point registers + ld 0,0(%r4) + ld 1,8(%r4) + ld 2,16(%r4) + ld 3,24(%r4) + ld 4,32(%r4) + ld 5,40(%r4) + ld 6,48(%r4) + ld 7,56(%r4) + ld 8,64(%r4) + ld 9,72(%r4) + ld 10,80(%r4) + ld 11,88(%r4) + ld 12,96(%r4) + ld 13,104(%r4) + ld 14,112(%r4) + ld 15,120(%r4) +1: # Clear CIF_FPU bit + ni __LC_CPU_FLAGS+7,255-_CIF_FPU + lg %r9,48(%r11) # return from load_fpu_regs + br %r14 +.Lcleanup_load_fpu_regs_vx_ctl: + .quad .Lload_fpu_regs_vx_ctl +.Lcleanup_load_fpu_regs_vx: + .quad .Lload_fpu_regs_vx +.Lcleanup_load_fpu_regs_vx_high: + .quad .Lload_fpu_regs_vx_high +.Lcleanup_load_fpu_regs_fp_ctl: + .quad .Lload_fpu_regs_fp_ctl +.Lcleanup_load_fpu_regs_fp: + .quad .Lload_fpu_regs_fp +.Lcleanup_load_fpu_regs_done: + .quad .Lload_fpu_regs_done + +.Lcleanup___ctl_set_vx: + stctg %c0,%c0,__SF_EMPTY(%r15) + tm __SF_EMPTY+5(%r15),2 + bor %r14 + oi __SF_EMPTY+5(%r15),2 + lctlg %c0,%c0,__SF_EMPTY(%r15) + lg %r9,48(%r11) # return from __ctl_set_vx + br %r14 + /* * Integer constants */ @@ -989,62 +1324,11 @@ cleanup_critical: .quad .L__critical_start .Lcritical_length: .quad .L__critical_end - .L__critical_start - - #if IS_ENABLED(CONFIG_KVM) -/* - * sie64a calling convention: - * %r2 pointer to sie control block - * %r3 guest register save area - */ -ENTRY(sie64a) - stmg %r6,%r14,__SF_GPRS(%r15) # save kernel registers - stg %r2,__SF_EMPTY(%r15) # save control block pointer - stg %r3,__SF_EMPTY+8(%r15) # save guest register save area - xc __SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason - lmg %r0,%r13,0(%r3) # load guest gprs 0-13 - lg %r14,__LC_GMAP # get gmap pointer - ltgr %r14,%r14 - jz .Lsie_gmap - lctlg %c1,%c1,__GMAP_ASCE(%r14) # load primary asce -.Lsie_gmap: - lg %r14,__SF_EMPTY(%r15) # get control block pointer - oi __SIE_PROG0C+3(%r14),1 # we are going into SIE now - tm __SIE_PROG20+3(%r14),3 # last exit... - jnz .Lsie_done - LPP __SF_EMPTY(%r15) # set guest id - sie 0(%r14) -.Lsie_done: - LPP __SF_EMPTY+16(%r15) # set host id - ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE - lctlg %c1,%c1,__LC_USER_ASCE # load primary asce -# some program checks are suppressing. C code (e.g. do_protection_exception) -# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other -# instructions between sie64a and .Lsie_done should not cause program -# interrupts. So lets use a nop (47 00 00 00) as a landing pad. -# See also HANDLE_SIE_INTERCEPT -.Lrewind_pad: - nop 0 - .globl sie_exit -sie_exit: - lg %r14,__SF_EMPTY+8(%r15) # load guest register save area - stmg %r0,%r13,0(%r14) # save guest gprs 0-13 - lmg %r6,%r14,__SF_GPRS(%r15) # restore kernel registers - lg %r2,__SF_EMPTY+24(%r15) # return exit reason code - br %r14 -.Lsie_fault: - lghi %r14,-EFAULT - stg %r14,__SF_EMPTY+24(%r15) # set exit reason code - j sie_exit - - .align 8 -.Lsie_critical: +.Lsie_critical_start: .quad .Lsie_gmap .Lsie_critical_length: .quad .Lsie_done - .Lsie_gmap - - EX_TABLE(.Lrewind_pad,.Lsie_fault) - EX_TABLE(sie_exit,.Lsie_fault) #endif .section .rodata, "a" diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index 59b7c6470567..1255c6c5353e 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -370,6 +370,7 @@ ENTRY(startup_kdump) xc 0x200(256),0x200 # partially clear lowcore xc 0x300(256),0x300 xc 0xe00(256),0xe00 + lctlg %c0,%c15,0x200(%r0) # initialize control registers stck __LC_LAST_UPDATE_CLOCK spt 6f-.LPG0(%r13) mvc __LC_LAST_UPDATE_TIMER(8),6f-.LPG0(%r13) @@ -413,9 +414,9 @@ ENTRY(startup_kdump) # followed by the facility words. #if defined(CONFIG_MARCH_Z13) - .long 3, 0xc100eff2, 0xf46ce800, 0x00400000 + .long 2, 0xc100eff2, 0xf46cc800 #elif defined(CONFIG_MARCH_ZEC12) - .long 3, 0xc100eff2, 0xf46ce800, 0x00400000 + .long 2, 0xc100eff2, 0xf46cc800 #elif defined(CONFIG_MARCH_Z196) .long 2, 0xc100eff2, 0xf46c0000 #elif defined(CONFIG_MARCH_Z10) diff --git a/arch/s390/kernel/jump_label.c b/arch/s390/kernel/jump_label.c index a83d2248fea9..083b05f5f5ab 100644 --- a/arch/s390/kernel/jump_label.c +++ b/arch/s390/kernel/jump_label.c @@ -44,12 +44,9 @@ static void jump_label_bug(struct jump_entry *entry, struct insn *expected, unsigned char *ipn = (unsigned char *)new; pr_emerg("Jump label code mismatch at %pS [%p]\n", ipc, ipc); - pr_emerg("Found: %02x %02x %02x %02x %02x %02x\n", - ipc[0], ipc[1], ipc[2], ipc[3], ipc[4], ipc[5]); - pr_emerg("Expected: %02x %02x %02x %02x %02x %02x\n", - ipe[0], ipe[1], ipe[2], ipe[3], ipe[4], ipe[5]); - pr_emerg("New: %02x %02x %02x %02x %02x %02x\n", - ipn[0], ipn[1], ipn[2], ipn[3], ipn[4], ipn[5]); + pr_emerg("Found: %6ph\n", ipc); + pr_emerg("Expected: %6ph\n", ipe); + pr_emerg("New: %6ph\n", ipn); panic("Corrupted kernel text"); } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 56b550893593..0ae6f8e74840 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -21,6 +21,7 @@ #include <asm/nmi.h> #include <asm/crw.h> #include <asm/switch_to.h> +#include <asm/fpu-internal.h> #include <asm/ctl_reg.h> struct mcck_struct { @@ -164,8 +165,12 @@ static int notrace s390_revalidate_registers(struct mci *mci) cr0.val = S390_lowcore.cregs_save_area[0]; cr0.afp = cr0.vx = 1; __ctl_load(cr0.val, 0, 0); - restore_vx_regs((__vector128 *) - &S390_lowcore.vector_save_area); + asm volatile( + " la 1,%0\n" + " .word 0xe70f,0x1000,0x0036\n" /* vlm 0,15,0(1) */ + " .word 0xe70f,0x1100,0x0c36\n" /* vlm 16,31,256(1) */ + : : "Q" (*(struct vx_array *) + &S390_lowcore.vector_save_area) : "1"); __ctl_load(S390_lowcore.cregs_save_area[0], 0, 0); } /* Revalidate access registers */ @@ -358,4 +363,4 @@ static int __init machine_check_init(void) ctl_set_bit(14, 24); /* enable warning MCH */ return 0; } -arch_initcall(machine_check_init); +early_initcall(machine_check_init); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index afe05bfb7e00..b973972f6ba5 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1019,12 +1019,9 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) break; } - /* The host-program-parameter (hpp) contains the sie control - * block that is set by sie64a() in entry64.S. Check if hpp - * refers to a valid control block and set sde_regs flags - * accordingly. This would allow to use hpp values for other - * purposes too. - * For now, simply use a non-zero value as guest indicator. + /* The host-program-parameter (hpp) contains the pid of + * the CPU thread as set by sie64a() in entry.S. + * If non-zero assume a guest sample. */ if (sfr->basic.hpp) sde_regs->in_guest = 1; diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 8f587d871b9f..f2dac9f0799d 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -81,8 +81,38 @@ void release_thread(struct task_struct *dead_task) void arch_release_task_struct(struct task_struct *tsk) { - if (tsk->thread.vxrs) - kfree(tsk->thread.vxrs); + /* Free either the floating-point or the vector register save area */ + kfree(tsk->thread.fpu.regs); +} + +int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) +{ + *dst = *src; + + /* Set up a new floating-point register save area */ + dst->thread.fpu.fpc = 0; + dst->thread.fpu.flags = 0; /* Always start with VX disabled */ + dst->thread.fpu.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS, + GFP_KERNEL|__GFP_REPEAT); + if (!dst->thread.fpu.fprs) + return -ENOMEM; + + /* + * Save the floating-point or vector register state of the current + * task. The state is not saved for early kernel threads, for example, + * the init_task, which do not have an allocated save area. + * The CIF_FPU flag is set in any case to lazy clear or restore a saved + * state when switching to a different task or returning to user space. + */ + save_fpu_regs(); + dst->thread.fpu.fpc = current->thread.fpu.fpc; + if (is_vx_task(current)) + convert_vx_to_fp(dst->thread.fpu.fprs, + current->thread.fpu.vxrs); + else + memcpy(dst->thread.fpu.fprs, current->thread.fpu.fprs, + sizeof(freg_t) * __NUM_FPRS); + return 0; } int copy_thread(unsigned long clone_flags, unsigned long new_stackp, @@ -142,11 +172,6 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, p->thread.ri_signum = 0; frame->childregs.psw.mask &= ~PSW_MASK_RI; - /* Save the fpu registers to new thread structure. */ - save_fp_ctl(&p->thread.fp_regs.fpc); - save_fp_regs(p->thread.fp_regs.fprs); - p->thread.fp_regs.pad = 0; - p->thread.vxrs = NULL; /* Set a new TLS ? */ if (clone_flags & CLONE_SETTLS) { unsigned long tls = frame->childregs.gprs[6]; @@ -162,7 +187,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, asmlinkage void execve_tail(void) { - current->thread.fp_regs.fpc = 0; + current->thread.fpu.fpc = 0; asm volatile("sfpc %0" : : "d" (0)); } @@ -171,8 +196,15 @@ asmlinkage void execve_tail(void) */ int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs) { - save_fp_ctl(&fpregs->fpc); - save_fp_regs(fpregs->fprs); + save_fpu_regs(); + fpregs->fpc = current->thread.fpu.fpc; + fpregs->pad = 0; + if (is_vx_task(current)) + convert_vx_to_fp((freg_t *)&fpregs->fprs, + current->thread.fpu.vxrs); + else + memcpy(&fpregs->fprs, current->thread.fpu.fprs, + sizeof(fpregs->fprs)); return 1; } EXPORT_SYMBOL(dump_fpu); diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index dc488e13b7e3..e6e077ae3990 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -41,6 +41,15 @@ void cpu_init(void) } /* + * cpu_have_feature - Test CPU features on module initialization + */ +int cpu_have_feature(unsigned int num) +{ + return elf_hwcap & (1UL << num); +} +EXPORT_SYMBOL(cpu_have_feature); + +/* * show_cpuinfo - Get information on one CPU for use by procfs. */ static int show_cpuinfo(struct seq_file *m, void *v) diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index d363c9c322a1..8b1c8e33f184 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -45,39 +45,27 @@ void update_cr_regs(struct task_struct *task) struct per_regs old, new; /* Take care of the enable/disable of transactional execution. */ - if (MACHINE_HAS_TE || MACHINE_HAS_VX) { + if (MACHINE_HAS_TE) { unsigned long cr, cr_new; __ctl_store(cr, 0, 0); - cr_new = cr; - if (MACHINE_HAS_TE) { - /* Set or clear transaction execution TXC bit 8. */ - cr_new |= (1UL << 55); - if (task->thread.per_flags & PER_FLAG_NO_TE) - cr_new &= ~(1UL << 55); - } - if (MACHINE_HAS_VX) { - /* Enable/disable of vector extension */ - cr_new &= ~(1UL << 17); - if (task->thread.vxrs) - cr_new |= (1UL << 17); - } + /* Set or clear transaction execution TXC bit 8. */ + cr_new = cr | (1UL << 55); + if (task->thread.per_flags & PER_FLAG_NO_TE) + cr_new &= ~(1UL << 55); if (cr_new != cr) __ctl_load(cr_new, 0, 0); - if (MACHINE_HAS_TE) { - /* Set/clear transaction execution TDC bits 62/63. */ - __ctl_store(cr, 2, 2); - cr_new = cr & ~3UL; - if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { - if (task->thread.per_flags & - PER_FLAG_TE_ABORT_RAND_TEND) - cr_new |= 1UL; - else - cr_new |= 2UL; - } - if (cr_new != cr) - __ctl_load(cr_new, 2, 2); + /* Set or clear transaction execution TDC bits 62 and 63. */ + __ctl_store(cr, 2, 2); + cr_new = cr & ~3UL; + if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND) { + if (task->thread.per_flags & PER_FLAG_TE_ABORT_RAND_TEND) + cr_new |= 1UL; + else + cr_new |= 2UL; } + if (cr_new != cr) + __ctl_load(cr_new, 2, 2); } /* Copy user specified PER registers */ new.control = thread->per_user.control; @@ -242,21 +230,21 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr) /* * floating point control reg. is in the thread structure */ - tmp = child->thread.fp_regs.fpc; + tmp = child->thread.fpu.fpc; tmp <<= BITS_PER_LONG - 32; } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* - * floating point regs. are either in child->thread.fp_regs - * or the child->thread.vxrs array + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array */ offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (child->thread.vxrs) + if (is_vx_task(child)) tmp = *(addr_t *) - ((addr_t) child->thread.vxrs + 2*offset); + ((addr_t) child->thread.fpu.vxrs + 2*offset); else tmp = *(addr_t *) - ((addr_t) &child->thread.fp_regs.fprs + offset); + ((addr_t) &child->thread.fpu.fprs + offset); } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { /* @@ -387,20 +375,20 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) if ((unsigned int) data != 0 || test_fp_ctl(data >> (BITS_PER_LONG - 32))) return -EINVAL; - child->thread.fp_regs.fpc = data >> (BITS_PER_LONG - 32); + child->thread.fpu.fpc = data >> (BITS_PER_LONG - 32); } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* - * floating point regs. are either in child->thread.fp_regs - * or the child->thread.vxrs array + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array */ offset = addr - (addr_t) &dummy->regs.fp_regs.fprs; - if (child->thread.vxrs) + if (is_vx_task(child)) *(addr_t *)((addr_t) - child->thread.vxrs + 2*offset) = data; + child->thread.fpu.vxrs + 2*offset) = data; else *(addr_t *)((addr_t) - &child->thread.fp_regs.fprs + offset) = data; + &child->thread.fpu.fprs + offset) = data; } else if (addr < (addr_t) (&dummy->regs.per_info + 1)) { /* @@ -621,20 +609,20 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) /* * floating point control reg. is in the thread structure */ - tmp = child->thread.fp_regs.fpc; + tmp = child->thread.fpu.fpc; } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* - * floating point regs. are either in child->thread.fp_regs - * or the child->thread.vxrs array + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array */ offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; - if (child->thread.vxrs) + if (is_vx_task(child)) tmp = *(__u32 *) - ((addr_t) child->thread.vxrs + 2*offset); + ((addr_t) child->thread.fpu.vxrs + 2*offset); else tmp = *(__u32 *) - ((addr_t) &child->thread.fp_regs.fprs + offset); + ((addr_t) &child->thread.fpu.fprs + offset); } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { /* @@ -746,20 +734,20 @@ static int __poke_user_compat(struct task_struct *child, */ if (test_fp_ctl(tmp)) return -EINVAL; - child->thread.fp_regs.fpc = data; + child->thread.fpu.fpc = data; } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* - * floating point regs. are either in child->thread.fp_regs - * or the child->thread.vxrs array + * floating point regs. are either in child->thread.fpu + * or the child->thread.fpu.vxrs array */ offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs; - if (child->thread.vxrs) + if (is_vx_task(child)) *(__u32 *)((addr_t) - child->thread.vxrs + 2*offset) = tmp; + child->thread.fpu.vxrs + 2*offset) = tmp; else *(__u32 *)((addr_t) - &child->thread.fp_regs.fprs + offset) = tmp; + &child->thread.fpu.fprs + offset) = tmp; } else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) { /* @@ -952,18 +940,16 @@ static int s390_fpregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - if (target == current) { - save_fp_ctl(&target->thread.fp_regs.fpc); - save_fp_regs(target->thread.fp_regs.fprs); - } else if (target->thread.vxrs) { - int i; + _s390_fp_regs fp_regs; + + if (target == current) + save_fpu_regs(); + + fp_regs.fpc = target->thread.fpu.fpc; + fpregs_store(&fp_regs, &target->thread.fpu); - for (i = 0; i < __NUM_VXRS_LOW; i++) - target->thread.fp_regs.fprs[i] = - *(freg_t *)(target->thread.vxrs + i); - } return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fp_regs, 0, -1); + &fp_regs, 0, -1); } static int s390_fpregs_set(struct task_struct *target, @@ -972,41 +958,33 @@ static int s390_fpregs_set(struct task_struct *target, const void __user *ubuf) { int rc = 0; + freg_t fprs[__NUM_FPRS]; - if (target == current) { - save_fp_ctl(&target->thread.fp_regs.fpc); - save_fp_regs(target->thread.fp_regs.fprs); - } + if (target == current) + save_fpu_regs(); /* If setting FPC, must validate it first. */ if (count > 0 && pos < offsetof(s390_fp_regs, fprs)) { - u32 ufpc[2] = { target->thread.fp_regs.fpc, 0 }; + u32 ufpc[2] = { target->thread.fpu.fpc, 0 }; rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &ufpc, 0, offsetof(s390_fp_regs, fprs)); if (rc) return rc; if (ufpc[1] != 0 || test_fp_ctl(ufpc[0])) return -EINVAL; - target->thread.fp_regs.fpc = ufpc[0]; + target->thread.fpu.fpc = ufpc[0]; } if (rc == 0 && count > 0) rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - target->thread.fp_regs.fprs, - offsetof(s390_fp_regs, fprs), -1); - - if (rc == 0) { - if (target == current) { - restore_fp_ctl(&target->thread.fp_regs.fpc); - restore_fp_regs(target->thread.fp_regs.fprs); - } else if (target->thread.vxrs) { - int i; - - for (i = 0; i < __NUM_VXRS_LOW; i++) - *(freg_t *)(target->thread.vxrs + i) = - target->thread.fp_regs.fprs[i]; - } - } + fprs, offsetof(s390_fp_regs, fprs), -1); + if (rc) + return rc; + + if (is_vx_task(target)) + convert_fp_to_vx(target->thread.fpu.vxrs, fprs); + else + memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs)); return rc; } @@ -1069,11 +1047,11 @@ static int s390_vxrs_low_get(struct task_struct *target, if (!MACHINE_HAS_VX) return -ENODEV; - if (target->thread.vxrs) { + if (is_vx_task(target)) { if (target == current) - save_vx_regs(target->thread.vxrs); + save_fpu_regs(); for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(target->thread.vxrs + i) + 1); + vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1); } else memset(vxrs, 0, sizeof(vxrs)); return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); @@ -1089,20 +1067,17 @@ static int s390_vxrs_low_set(struct task_struct *target, if (!MACHINE_HAS_VX) return -ENODEV; - if (!target->thread.vxrs) { + if (!is_vx_task(target)) { rc = alloc_vector_registers(target); if (rc) return rc; } else if (target == current) - save_vx_regs(target->thread.vxrs); + save_fpu_regs(); rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1); - if (rc == 0) { + if (rc == 0) for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(target->thread.vxrs + i) + 1) = vxrs[i]; - if (target == current) - restore_vx_regs(target->thread.vxrs); - } + *((__u64 *)(target->thread.fpu.vxrs + i) + 1) = vxrs[i]; return rc; } @@ -1116,10 +1091,10 @@ static int s390_vxrs_high_get(struct task_struct *target, if (!MACHINE_HAS_VX) return -ENODEV; - if (target->thread.vxrs) { + if (is_vx_task(target)) { if (target == current) - save_vx_regs(target->thread.vxrs); - memcpy(vxrs, target->thread.vxrs + __NUM_VXRS_LOW, + save_fpu_regs(); + memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs)); } else memset(vxrs, 0, sizeof(vxrs)); @@ -1135,18 +1110,15 @@ static int s390_vxrs_high_set(struct task_struct *target, if (!MACHINE_HAS_VX) return -ENODEV; - if (!target->thread.vxrs) { + if (!is_vx_task(target)) { rc = alloc_vector_registers(target); if (rc) return rc; } else if (target == current) - save_vx_regs(target->thread.vxrs); + save_fpu_regs(); rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - target->thread.vxrs + __NUM_VXRS_LOW, 0, -1); - if (rc == 0 && target == current) - restore_vx_regs(target->thread.vxrs); - + target->thread.fpu.vxrs + __NUM_VXRS_LOW, 0, -1); return rc; } diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c index 9f60467938d1..5090d3dad10b 100644 --- a/arch/s390/kernel/s390_ksyms.c +++ b/arch/s390/kernel/s390_ksyms.c @@ -1,5 +1,6 @@ #include <linux/module.h> #include <linux/kvm_host.h> +#include <asm/fpu-internal.h> #include <asm/ftrace.h> #ifdef CONFIG_FUNCTION_TRACER @@ -8,6 +9,8 @@ EXPORT_SYMBOL(_mcount); #if IS_ENABLED(CONFIG_KVM) EXPORT_SYMBOL(sie64a); EXPORT_SYMBOL(sie_exit); +EXPORT_SYMBOL(save_fpu_regs); +EXPORT_SYMBOL(__ctl_set_vx); #endif EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); diff --git a/arch/s390/kernel/sclp.S b/arch/s390/kernel/sclp.S deleted file mode 100644 index ada0c07fe1a8..000000000000 --- a/arch/s390/kernel/sclp.S +++ /dev/null @@ -1,355 +0,0 @@ -/* - * Mini SCLP driver. - * - * Copyright IBM Corp. 2004, 2009 - * - * Author(s): Peter Oberparleiter <Peter.Oberparleiter@de.ibm.com>, - * Heiko Carstens <heiko.carstens@de.ibm.com>, - * - */ - -#include <linux/linkage.h> -#include <asm/irq.h> - -LC_EXT_NEW_PSW = 0x58 # addr of ext int handler -LC_EXT_NEW_PSW_64 = 0x1b0 # addr of ext int handler 64 bit -LC_EXT_INT_PARAM = 0x80 # addr of ext int parameter -LC_EXT_INT_CODE = 0x86 # addr of ext int code -LC_AR_MODE_ID = 0xa3 - -# -# Subroutine which waits synchronously until either an external interruption -# or a timeout occurs. -# -# Parameters: -# R2 = 0 for no timeout, non-zero for timeout in (approximated) seconds -# -# Returns: -# R2 = 0 on interrupt, 2 on timeout -# R3 = external interruption parameter if R2=0 -# - -_sclp_wait_int: - stm %r6,%r15,24(%r15) # save registers - basr %r13,0 # get base register -.LbaseS1: - ahi %r15,-96 # create stack frame - la %r8,LC_EXT_NEW_PSW # register int handler - la %r9,.LextpswS1-.LbaseS1(%r13) - tm LC_AR_MODE_ID,1 - jno .Lesa1 - la %r8,LC_EXT_NEW_PSW_64 # register int handler 64 bit - la %r9,.LextpswS1_64-.LbaseS1(%r13) -.Lesa1: - mvc .LoldpswS1-.LbaseS1(16,%r13),0(%r8) - mvc 0(16,%r8),0(%r9) - epsw %r6,%r7 # set current addressing mode - nill %r6,0x1 # in new psw (31 or 64 bit mode) - nilh %r7,0x8000 - stm %r6,%r7,0(%r8) - lhi %r6,0x0200 # cr mask for ext int (cr0.54) - ltr %r2,%r2 - jz .LsetctS1 - ahi %r6,0x0800 # cr mask for clock int (cr0.52) - stck .LtimeS1-.LbaseS1(%r13) # initiate timeout - al %r2,.LtimeS1-.LbaseS1(%r13) - st %r2,.LtimeS1-.LbaseS1(%r13) - sckc .LtimeS1-.LbaseS1(%r13) - -.LsetctS1: - stctl %c0,%c0,.LctlS1-.LbaseS1(%r13) # enable required interrupts - l %r0,.LctlS1-.LbaseS1(%r13) - lhi %r1,~(0x200 | 0x800) # clear old values - nr %r1,%r0 - or %r1,%r6 # set new value - st %r1,.LctlS1-.LbaseS1(%r13) - lctl %c0,%c0,.LctlS1-.LbaseS1(%r13) - st %r0,.LctlS1-.LbaseS1(%r13) - lhi %r2,2 # return code for timeout -.LloopS1: - lpsw .LwaitpswS1-.LbaseS1(%r13) # wait until interrupt -.LwaitS1: - lh %r7,LC_EXT_INT_CODE - chi %r7,EXT_IRQ_CLK_COMP # timeout? - je .LtimeoutS1 - chi %r7,EXT_IRQ_SERVICE_SIG # service int? - jne .LloopS1 - sr %r2,%r2 - l %r3,LC_EXT_INT_PARAM -.LtimeoutS1: - lctl %c0,%c0,.LctlS1-.LbaseS1(%r13) # restore interrupt setting - # restore old handler - mvc 0(16,%r8),.LoldpswS1-.LbaseS1(%r13) - lm %r6,%r15,120(%r15) # restore registers - br %r14 # return to caller - - .align 8 -.LoldpswS1: - .long 0, 0, 0, 0 # old ext int PSW -.LextpswS1: - .long 0x00080000, 0x80000000+.LwaitS1 # PSW to handle ext int -.LextpswS1_64: - .quad 0, .LwaitS1 # PSW to handle ext int, 64 bit -.LwaitpswS1: - .long 0x010a0000, 0x00000000+.LloopS1 # PSW to wait for ext int -.LtimeS1: - .quad 0 # current time -.LctlS1: - .long 0 # CT0 contents - -# -# Subroutine to synchronously issue a service call. -# -# Parameters: -# R2 = command word -# R3 = sccb address -# -# Returns: -# R2 = 0 on success, 1 on failure -# R3 = sccb response code if R2 = 0 -# - -_sclp_servc: - stm %r6,%r15,24(%r15) # save registers - ahi %r15,-96 # create stack frame - lr %r6,%r2 # save command word - lr %r7,%r3 # save sccb address -.LretryS2: - lhi %r2,1 # error return code - .insn rre,0xb2200000,%r6,%r7 # servc - brc 1,.LendS2 # exit if not operational - brc 8,.LnotbusyS2 # go on if not busy - sr %r2,%r2 # wait until no longer busy - bras %r14,_sclp_wait_int - j .LretryS2 # retry -.LnotbusyS2: - sr %r2,%r2 # wait until result - bras %r14,_sclp_wait_int - sr %r2,%r2 - lh %r3,6(%r7) -.LendS2: - lm %r6,%r15,120(%r15) # restore registers - br %r14 - -# -# Subroutine to set up the SCLP interface. -# -# Parameters: -# R2 = 0 to activate, non-zero to deactivate -# -# Returns: -# R2 = 0 on success, non-zero on failure -# - -_sclp_setup: - stm %r6,%r15,24(%r15) # save registers - ahi %r15,-96 # create stack frame - basr %r13,0 # get base register -.LbaseS3: - l %r6,.LsccbS0-.LbaseS3(%r13) # prepare init mask sccb - mvc 0(.LinitendS3-.LinitsccbS3,%r6),.LinitsccbS3-.LbaseS3(%r13) - ltr %r2,%r2 # initialization? - jz .LdoinitS3 # go ahead - # clear masks - xc .LinitmaskS3-.LinitsccbS3(8,%r6),.LinitmaskS3-.LinitsccbS3(%r6) -.LdoinitS3: - l %r2,.LwritemaskS3-.LbaseS3(%r13)# get command word - lr %r3,%r6 # get sccb address - bras %r14,_sclp_servc # issue service call - ltr %r2,%r2 # servc successful? - jnz .LerrorS3 - chi %r3,0x20 # write mask successful? - jne .LerrorS3 - # check masks - la %r2,.LinitmaskS3-.LinitsccbS3(%r6) - l %r1,0(%r2) # receive mask ok? - n %r1,12(%r2) - cl %r1,0(%r2) - jne .LerrorS3 - l %r1,4(%r2) # send mask ok? - n %r1,8(%r2) - cl %r1,4(%r2) - sr %r2,%r2 - je .LendS3 -.LerrorS3: - lhi %r2,1 # error return code -.LendS3: - lm %r6,%r15,120(%r15) # restore registers - br %r14 -.LwritemaskS3: - .long 0x00780005 # SCLP command for write mask -.LinitsccbS3: - .word .LinitendS3-.LinitsccbS3 - .byte 0,0,0,0 - .word 0 - .word 0 - .word 4 -.LinitmaskS3: - .long 0x80000000 - .long 0x40000000 - .long 0 - .long 0 -.LinitendS3: - -# -# Subroutine which prints a given text to the SCLP console. -# -# Parameters: -# R2 = address of nil-terminated ASCII text -# -# Returns: -# R2 = 0 on success, 1 on failure -# - -_sclp_print: - stm %r6,%r15,24(%r15) # save registers - ahi %r15,-96 # create stack frame - basr %r13,0 # get base register -.LbaseS4: - l %r8,.LsccbS0-.LbaseS4(%r13) # prepare write data sccb - mvc 0(.LmtoS4-.LwritesccbS4,%r8),.LwritesccbS4-.LbaseS4(%r13) - la %r7,.LmtoS4-.LwritesccbS4(%r8) # current mto addr - sr %r0,%r0 - l %r10,.Lascebc-.LbaseS4(%r13) # address of translation table -.LinitmtoS4: - # initialize mto - mvc 0(.LmtoendS4-.LmtoS4,%r7),.LmtoS4-.LbaseS4(%r13) - lhi %r6,.LmtoendS4-.LmtoS4 # current mto length -.LloopS4: - ic %r0,0(%r2) # get character - ahi %r2,1 - ltr %r0,%r0 # end of string? - jz .LfinalizemtoS4 - chi %r0,0x0a # end of line (NL)? - jz .LfinalizemtoS4 - stc %r0,0(%r6,%r7) # copy to mto - la %r11,0(%r6,%r7) - tr 0(1,%r11),0(%r10) # translate to EBCDIC - ahi %r6,1 - j .LloopS4 -.LfinalizemtoS4: - sth %r6,0(%r7) # update mto length - lh %r9,.LmdbS4-.LwritesccbS4(%r8) # update mdb length - ar %r9,%r6 - sth %r9,.LmdbS4-.LwritesccbS4(%r8) - lh %r9,.LevbufS4-.LwritesccbS4(%r8)# update evbuf length - ar %r9,%r6 - sth %r9,.LevbufS4-.LwritesccbS4(%r8) - lh %r9,0(%r8) # update sccb length - ar %r9,%r6 - sth %r9,0(%r8) - ar %r7,%r6 # update current mto address - ltr %r0,%r0 # more characters? - jnz .LinitmtoS4 - l %r2,.LwritedataS4-.LbaseS4(%r13)# write data - lr %r3,%r8 - bras %r14,_sclp_servc - ltr %r2,%r2 # servc successful? - jnz .LendS4 - chi %r3,0x20 # write data successful? - je .LendS4 - lhi %r2,1 # error return code -.LendS4: - lm %r6,%r15,120(%r15) # restore registers - br %r14 - -# -# Function which prints a given text to the SCLP console. -# -# Parameters: -# R2 = address of nil-terminated ASCII text -# -# Returns: -# R2 = 0 on success, 1 on failure -# - -ENTRY(_sclp_print_early) - stm %r6,%r15,24(%r15) # save registers - ahi %r15,-96 # create stack frame - tm LC_AR_MODE_ID,1 - jno .Lesa2 - ahi %r15,-80 - stmh %r6,%r15,96(%r15) # store upper register halves - basr %r13,0 - lmh %r0,%r15,.Lzeroes-.(%r13) # clear upper register halves -.Lesa2: - lr %r10,%r2 # save string pointer - lhi %r2,0 - bras %r14,_sclp_setup # enable console - ltr %r2,%r2 - jnz .LendS5 - lr %r2,%r10 - bras %r14,_sclp_print # print string - ltr %r2,%r2 - jnz .LendS5 - lhi %r2,1 - bras %r14,_sclp_setup # disable console -.LendS5: - tm LC_AR_MODE_ID,1 - jno .Lesa3 - lgfr %r2,%r2 # sign extend return value - lmh %r6,%r15,96(%r15) # restore upper register halves - ahi %r15,80 -.Lesa3: - lm %r6,%r15,120(%r15) # restore registers - br %r14 -.Lzeroes: - .fill 64,4,0 - -.LwritedataS4: - .long 0x00760005 # SCLP command for write data -.LwritesccbS4: - # sccb - .word .LmtoS4-.LwritesccbS4 - .byte 0 - .byte 0,0,0 - .word 0 - - # evbuf -.LevbufS4: - .word .LmtoS4-.LevbufS4 - .byte 0x02 - .byte 0 - .word 0 - -.LmdbS4: - # mdb - .word .LmtoS4-.LmdbS4 - .word 1 - .long 0xd4c4c240 - .long 1 - - # go -.LgoS4: - .word .LmtoS4-.LgoS4 - .word 1 - .long 0 - .byte 0,0,0,0,0,0,0,0 - .byte 0,0,0 - .byte 0 - .byte 0,0,0,0,0,0,0 - .byte 0 - .word 0 - .byte 0,0,0,0,0,0,0,0,0,0 - .byte 0,0,0,0,0,0,0,0 - .byte 0,0,0,0,0,0,0,0 - -.LmtoS4: - .word .LmtoendS4-.LmtoS4 - .word 4 - .word 0x1000 - .byte 0 - .byte 0,0,0 -.LmtoendS4: - - # Global constants -.LsccbS0: - .long _sclp_work_area -.Lascebc: - .long _ascebc - -.section .data,"aw",@progbits - .balign 4096 -_sclp_work_area: - .fill 4096 -.previous diff --git a/arch/s390/kernel/sclp.c b/arch/s390/kernel/sclp.c new file mode 100644 index 000000000000..fa0bdff1d413 --- /dev/null +++ b/arch/s390/kernel/sclp.c @@ -0,0 +1,160 @@ +/* + * Copyright IBM Corp. 2015 + * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com> + */ +#include <linux/kernel.h> +#include <asm/ebcdic.h> +#include <asm/irq.h> +#include <asm/lowcore.h> +#include <asm/processor.h> +#include <asm/sclp.h> + +static char _sclp_work_area[4096] __aligned(PAGE_SIZE); + +static void _sclp_wait_int(void) +{ + unsigned long cr0, cr0_new, psw_mask, addr; + psw_t psw_ext_save, psw_wait; + + __ctl_store(cr0, 0, 0); + cr0_new = cr0 | 0x200; + __ctl_load(cr0_new, 0, 0); + + psw_ext_save = S390_lowcore.external_new_psw; + psw_mask = __extract_psw() & (PSW_MASK_EA | PSW_MASK_BA); + S390_lowcore.external_new_psw.mask = psw_mask; + psw_wait.mask = psw_mask | PSW_MASK_EXT | PSW_MASK_WAIT; + S390_lowcore.ext_int_code = 0; + + do { + asm volatile( + " larl %[addr],0f\n" + " stg %[addr],%[psw_wait_addr]\n" + " stg %[addr],%[psw_ext_addr]\n" + " lpswe %[psw_wait]\n" + "0:\n" + : [addr] "=&d" (addr), + [psw_wait_addr] "=Q" (psw_wait.addr), + [psw_ext_addr] "=Q" (S390_lowcore.external_new_psw.addr) + : [psw_wait] "Q" (psw_wait) + : "cc", "memory"); + } while (S390_lowcore.ext_int_code != EXT_IRQ_SERVICE_SIG); + + __ctl_load(cr0, 0, 0); + S390_lowcore.external_new_psw = psw_ext_save; +} + +static int _sclp_servc(unsigned int cmd, char *sccb) +{ + unsigned int cc; + + do { + asm volatile( + " .insn rre,0xb2200000,%1,%2\n" + " ipm %0\n" + : "=d" (cc) : "d" (cmd), "a" (sccb) + : "cc", "memory"); + cc >>= 28; + if (cc == 3) + return -EINVAL; + _sclp_wait_int(); + } while (cc != 0); + return (*(unsigned short *)(sccb + 6) == 0x20) ? 0 : -EIO; +} + +static int _sclp_setup(int disable) +{ + static unsigned char init_sccb[] = { + 0x00, 0x1c, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x04, + 0x80, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + }; + unsigned int *masks; + int rc; + + memcpy(_sclp_work_area, init_sccb, 28); + masks = (unsigned int *)(_sclp_work_area + 12); + if (disable) + memset(masks, 0, 16); + /* SCLP write mask */ + rc = _sclp_servc(0x00780005, _sclp_work_area); + if (rc) + return rc; + if ((masks[0] & masks[3]) != masks[0] || + (masks[1] & masks[2]) != masks[1]) + return -EIO; + return 0; +} + +static int _sclp_print(const char *str) +{ + static unsigned char write_head[] = { + /* sccb header */ + 0x00, 0x52, /* 0 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 2 */ + /* evbuf */ + 0x00, 0x4a, /* 8 */ + 0x02, 0x00, 0x00, 0x00, /* 10 */ + /* mdb */ + 0x00, 0x44, /* 14 */ + 0x00, 0x01, /* 16 */ + 0xd4, 0xc4, 0xc2, 0x40, /* 18 */ + 0x00, 0x00, 0x00, 0x01, /* 22 */ + /* go */ + 0x00, 0x38, /* 26 */ + 0x00, 0x01, /* 28 */ + 0x00, 0x00, 0x00, 0x00, /* 30 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 34 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 42 */ + 0x00, 0x00, 0x00, 0x00, /* 50 */ + 0x00, 0x00, /* 54 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 56 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 64 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 72 */ + 0x00, 0x00, /* 80 */ + }; + static unsigned char write_mto[] = { + /* mto */ + 0x00, 0x0a, /* 0 */ + 0x00, 0x04, /* 2 */ + 0x10, 0x00, /* 4 */ + 0x00, 0x00, 0x00, 0x00 /* 6 */ + }; + unsigned char *ptr, ch; + unsigned int count; + + memcpy(_sclp_work_area, write_head, sizeof(write_head)); + ptr = _sclp_work_area + sizeof(write_head); + do { + memcpy(ptr, write_mto, sizeof(write_mto)); + for (count = sizeof(write_mto); (ch = *str++) != 0; count++) { + if (ch == 0x0a) + break; + ptr[count] = _ascebc[ch]; + } + /* Update length fields in mto, mdb, evbuf and sccb */ + *(unsigned short *) ptr = count; + *(unsigned short *)(_sclp_work_area + 14) += count; + *(unsigned short *)(_sclp_work_area + 8) += count; + *(unsigned short *)(_sclp_work_area + 0) += count; + ptr += count; + } while (ch != 0); + + /* SCLP write data */ + return _sclp_servc(0x00760005, _sclp_work_area); +} + +int _sclp_print_early(const char *str) +{ + int rc; + + rc = _sclp_setup(0); + if (rc) + return rc; + rc = _sclp_print(str); + if (rc) + return rc; + return _sclp_setup(1); +} diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index ca070d260af2..ce0cbd6ba7ca 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -62,6 +62,7 @@ #include <asm/os_info.h> #include <asm/sclp.h> #include <asm/sysinfo.h> +#include <asm/numa.h> #include "entry.h" /* @@ -76,7 +77,7 @@ EXPORT_SYMBOL(console_devno); unsigned int console_irq = -1; EXPORT_SYMBOL(console_irq); -unsigned long elf_hwcap = 0; +unsigned long elf_hwcap __read_mostly = 0; char elf_platform[ELF_PLATFORM_SIZE]; int __initdata memory_end_set; @@ -688,7 +689,7 @@ static void __init setup_memory(void) /* * Setup hardware capabilities. */ -static void __init setup_hwcaps(void) +static int __init setup_hwcaps(void) { static const int stfl_bits[6] = { 0, 2, 7, 17, 19, 21 }; struct cpuid cpu_id; @@ -754,9 +755,11 @@ static void __init setup_hwcaps(void) elf_hwcap |= HWCAP_S390_TE; /* - * Vector extension HWCAP_S390_VXRS is bit 11. + * Vector extension HWCAP_S390_VXRS is bit 11. The Vector extension + * can be disabled with the "novx" parameter. Use MACHINE_HAS_VX + * instead of facility bit 129. */ - if (test_facility(129)) + if (MACHINE_HAS_VX) elf_hwcap |= HWCAP_S390_VXRS; get_cpu_id(&cpu_id); add_device_randomness(&cpu_id, sizeof(cpu_id)); @@ -793,7 +796,9 @@ static void __init setup_hwcaps(void) strcpy(elf_platform, "z13"); break; } + return 0; } +arch_initcall(setup_hwcaps); /* * Add system information as device randomness @@ -879,11 +884,7 @@ void __init setup_arch(char **cmdline_p) setup_lowcore(); smp_fill_possible_mask(); cpu_init(); - - /* - * Setup capabilities (ELF_HWCAP & ELF_PLATFORM). - */ - setup_hwcaps(); + numa_setup(); /* * Create kernel page tables and switch to virtual addressing. diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index c551f22ce066..9549af102d75 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -105,32 +105,13 @@ struct rt_sigframe static void store_sigregs(void) { save_access_regs(current->thread.acrs); - save_fp_ctl(¤t->thread.fp_regs.fpc); - if (current->thread.vxrs) { - int i; - - save_vx_regs(current->thread.vxrs); - for (i = 0; i < __NUM_FPRS; i++) - current->thread.fp_regs.fprs[i] = - *(freg_t *)(current->thread.vxrs + i); - } else - save_fp_regs(current->thread.fp_regs.fprs); + save_fpu_regs(); } /* Load registers after signal return */ static void load_sigregs(void) { restore_access_regs(current->thread.acrs); - /* restore_fp_ctl is done in restore_sigregs */ - if (current->thread.vxrs) { - int i; - - for (i = 0; i < __NUM_FPRS; i++) - *(freg_t *)(current->thread.vxrs + i) = - current->thread.fp_regs.fprs[i]; - restore_vx_regs(current->thread.vxrs); - } else - restore_fp_regs(current->thread.fp_regs.fprs); } /* Returns non-zero on fault. */ @@ -146,8 +127,7 @@ static int save_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(&user_sregs.regs.gprs, ®s->gprs, sizeof(sregs->regs.gprs)); memcpy(&user_sregs.regs.acrs, current->thread.acrs, sizeof(user_sregs.regs.acrs)); - memcpy(&user_sregs.fpregs, ¤t->thread.fp_regs, - sizeof(user_sregs.fpregs)); + fpregs_store(&user_sregs.fpregs, ¤t->thread.fpu); if (__copy_to_user(sregs, &user_sregs, sizeof(_sigregs))) return -EFAULT; return 0; @@ -166,8 +146,8 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) if (!is_ri_task(current) && (user_sregs.regs.psw.mask & PSW_MASK_RI)) return -EINVAL; - /* Loading the floating-point-control word can fail. Do that first. */ - if (restore_fp_ctl(&user_sregs.fpregs.fpc)) + /* Test the floating-point-control word. */ + if (test_fp_ctl(user_sregs.fpregs.fpc)) return -EINVAL; /* Use regs->psw.mask instead of PSW_USER_BITS to preserve PER bit. */ @@ -185,8 +165,7 @@ static int restore_sigregs(struct pt_regs *regs, _sigregs __user *sregs) memcpy(¤t->thread.acrs, &user_sregs.regs.acrs, sizeof(current->thread.acrs)); - memcpy(¤t->thread.fp_regs, &user_sregs.fpregs, - sizeof(current->thread.fp_regs)); + fpregs_load(&user_sregs.fpregs, ¤t->thread.fpu); clear_pt_regs_flag(regs, PIF_SYSCALL); /* No longer in a system call */ return 0; @@ -200,13 +179,13 @@ static int save_sigregs_ext(struct pt_regs *regs, int i; /* Save vector registers to signal stack */ - if (current->thread.vxrs) { + if (is_vx_task(current)) { for (i = 0; i < __NUM_VXRS_LOW; i++) - vxrs[i] = *((__u64 *)(current->thread.vxrs + i) + 1); + vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1); if (__copy_to_user(&sregs_ext->vxrs_low, vxrs, sizeof(sregs_ext->vxrs_low)) || __copy_to_user(&sregs_ext->vxrs_high, - current->thread.vxrs + __NUM_VXRS_LOW, + current->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(sregs_ext->vxrs_high))) return -EFAULT; } @@ -220,15 +199,15 @@ static int restore_sigregs_ext(struct pt_regs *regs, int i; /* Restore vector registers from signal stack */ - if (current->thread.vxrs) { + if (is_vx_task(current)) { if (__copy_from_user(vxrs, &sregs_ext->vxrs_low, sizeof(sregs_ext->vxrs_low)) || - __copy_from_user(current->thread.vxrs + __NUM_VXRS_LOW, + __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW, &sregs_ext->vxrs_high, sizeof(sregs_ext->vxrs_high))) return -EFAULT; for (i = 0; i < __NUM_VXRS_LOW; i++) - *((__u64 *)(current->thread.vxrs + i) + 1) = vxrs[i]; + *((__u64 *)(current->thread.fpu.vxrs + i) + 1) = vxrs[i]; } return 0; } @@ -243,6 +222,7 @@ SYSCALL_DEFINE0(sigreturn) if (__copy_from_user(&set.sig, &frame->sc.oldmask, _SIGMASK_COPY_SIZE)) goto badframe; set_current_blocked(&set); + save_fpu_regs(); if (restore_sigregs(regs, &frame->sregs)) goto badframe; if (restore_sigregs_ext(regs, &frame->sregs_ext)) @@ -266,6 +246,7 @@ SYSCALL_DEFINE0(rt_sigreturn) set_current_blocked(&set); if (restore_altstack(&frame->uc.uc_stack)) goto badframe; + save_fpu_regs(); if (restore_sigregs(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_sigregs_ext(regs, &frame->uc.uc_mcontext_ext)) @@ -400,7 +381,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, uc_flags = 0; if (MACHINE_HAS_VX) { frame_size += sizeof(_sigregs_ext); - if (current->thread.vxrs) + if (is_vx_task(current)) uc_flags |= UC_VXRS; } frame = get_sigframe(&ksig->ka, regs, frame_size); diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 6f54c175f5c9..c6355e6f3fcc 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -532,8 +532,8 @@ EXPORT_SYMBOL(smp_ctl_clear_bit); #ifdef CONFIG_CRASH_DUMP -static void __smp_store_cpu_state(struct save_area_ext *sa_ext, u16 address, - int is_boot_cpu) +static void __init __smp_store_cpu_state(struct save_area_ext *sa_ext, + u16 address, int is_boot_cpu) { void *lc = (void *)(unsigned long) store_prefix(); unsigned long vx_sa; diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 1acad02681c4..f3f4a137aef6 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -276,9 +276,9 @@ SYSCALL(sys_ni_syscall,compat_sys_s390_fadvise64_64) SYSCALL(sys_statfs64,compat_sys_statfs64) SYSCALL(sys_fstatfs64,compat_sys_fstatfs64) SYSCALL(sys_remap_file_pages,compat_sys_remap_file_pages) -NI_SYSCALL /* 268 sys_mbind */ -NI_SYSCALL /* 269 sys_get_mempolicy */ -NI_SYSCALL /* 270 sys_set_mempolicy */ +SYSCALL(sys_mbind,compat_sys_mbind) +SYSCALL(sys_get_mempolicy,compat_sys_get_mempolicy) +SYSCALL(sys_set_mempolicy,compat_sys_set_mempolicy) SYSCALL(sys_mq_open,compat_sys_mq_open) SYSCALL(sys_mq_unlink,compat_sys_mq_unlink) SYSCALL(sys_mq_timedsend,compat_sys_mq_timedsend) @@ -295,7 +295,7 @@ SYSCALL(sys_ioprio_get,compat_sys_ioprio_get) SYSCALL(sys_inotify_init,sys_inotify_init) SYSCALL(sys_inotify_add_watch,compat_sys_inotify_add_watch) /* 285 */ SYSCALL(sys_inotify_rm_watch,compat_sys_inotify_rm_watch) -NI_SYSCALL /* 287 sys_migrate_pages */ +SYSCALL(sys_migrate_pages,compat_sys_migrate_pages) SYSCALL(sys_openat,compat_sys_openat) SYSCALL(sys_mkdirat,compat_sys_mkdirat) SYSCALL(sys_mknodat,compat_sys_mknodat) /* 290 */ @@ -318,7 +318,7 @@ SYSCALL(sys_splice,compat_sys_splice) SYSCALL(sys_sync_file_range,compat_sys_s390_sync_file_range) SYSCALL(sys_tee,compat_sys_tee) SYSCALL(sys_vmsplice,compat_sys_vmsplice) -NI_SYSCALL /* 310 sys_move_pages */ +SYSCALL(sys_move_pages,compat_sys_move_pages) SYSCALL(sys_getcpu,compat_sys_getcpu) SYSCALL(sys_epoll_pwait,compat_sys_epoll_pwait) SYSCALL(sys_utimes,compat_sys_utimes) diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index f5a0bd778ace..017c3a9bfc28 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -58,6 +58,9 @@ EXPORT_SYMBOL_GPL(sched_clock_base_cc); static DEFINE_PER_CPU(struct clock_event_device, comparators); +ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier); +EXPORT_SYMBOL(s390_epoch_delta_notifier); + /* * Scheduler clock - returns current time in nanosec units. */ @@ -117,11 +120,6 @@ static int s390_next_event(unsigned long delta, return 0; } -static void s390_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ -} - /* * Set up lowcore and control register of the current cpu to * enable TOD clock and clock comparator interrupts. @@ -145,7 +143,6 @@ void init_cpu_timer(void) cd->rating = 400; cd->cpumask = cpumask_of(cpu); cd->set_next_event = s390_next_event; - cd->set_mode = s390_set_mode; clockevents_register_device(cd); @@ -752,7 +749,7 @@ static void clock_sync_cpu(struct clock_sync_data *sync) static int etr_sync_clock(void *data) { static int first; - unsigned long long clock, old_clock, delay, delta; + unsigned long long clock, old_clock, clock_delta, delay, delta; struct clock_sync_data *etr_sync; struct etr_aib *sync_port, *aib; int port; @@ -789,6 +786,9 @@ static int etr_sync_clock(void *data) delay = (unsigned long long) (aib->edf2.etv - sync_port->edf2.etv) << 32; delta = adjust_time(old_clock, clock, delay); + clock_delta = clock - old_clock; + atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, + &clock_delta); etr_sync->fixup_cc = delta; fixup_clock_comparator(delta); /* Verify that the clock is properly set. */ @@ -1526,7 +1526,7 @@ void stp_island_check(void) static int stp_sync_clock(void *data) { static int first; - unsigned long long old_clock, delta; + unsigned long long old_clock, delta, new_clock, clock_delta; struct clock_sync_data *stp_sync; int rc; @@ -1551,7 +1551,11 @@ static int stp_sync_clock(void *data) old_clock = get_tod_clock(); rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0); if (rc == 0) { - delta = adjust_time(old_clock, get_tod_clock(), 0); + new_clock = get_tod_clock(); + delta = adjust_time(old_clock, new_clock, 0); + clock_delta = new_clock - old_clock; + atomic_notifier_call_chain(&s390_epoch_delta_notifier, + 0, &clock_delta); fixup_clock_comparator(delta); rc = chsc_sstpi(stp_page, &stp_info, sizeof(struct stp_sstpi)); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5728c5bd44a8..bf05e7fc3e70 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -18,7 +18,10 @@ #include <linux/cpu.h> #include <linux/smp.h> #include <linux/mm.h> +#include <linux/nodemask.h> +#include <linux/node.h> #include <asm/sysinfo.h> +#include <asm/numa.h> #define PTF_HORIZONTAL (0UL) #define PTF_VERTICAL (1UL) @@ -37,8 +40,10 @@ static struct sysinfo_15_1_x *tl_info; static int topology_enabled = 1; static DECLARE_WORK(topology_work, topology_work_fn); -/* topology_lock protects the socket and book linked lists */ -static DEFINE_SPINLOCK(topology_lock); +/* + * Socket/Book linked lists and per_cpu(cpu_topology) updates are + * protected by "sched_domains_mutex". + */ static struct mask_info socket_info; static struct mask_info book_info; @@ -188,7 +193,6 @@ static void tl_to_masks(struct sysinfo_15_1_x *info) { struct cpuid cpu_id; - spin_lock_irq(&topology_lock); get_cpu_id(&cpu_id); clear_masks(); switch (cpu_id.machine) { @@ -199,7 +203,6 @@ static void tl_to_masks(struct sysinfo_15_1_x *info) default: __tl_to_masks_generic(info); } - spin_unlock_irq(&topology_lock); } static void topology_update_polarization_simple(void) @@ -244,10 +247,8 @@ int topology_set_cpu_management(int fc) static void update_cpu_masks(void) { - unsigned long flags; int cpu; - spin_lock_irqsave(&topology_lock, flags); for_each_possible_cpu(cpu) { per_cpu(cpu_topology, cpu).thread_mask = cpu_thread_map(cpu); per_cpu(cpu_topology, cpu).core_mask = cpu_group_map(&socket_info, cpu); @@ -259,7 +260,7 @@ static void update_cpu_masks(void) per_cpu(cpu_topology, cpu).book_id = cpu; } } - spin_unlock_irqrestore(&topology_lock, flags); + numa_update_cpu_topology(); } void store_topology(struct sysinfo_15_1_x *info) @@ -274,21 +275,21 @@ int arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; struct device *dev; - int cpu; + int cpu, rc = 0; - if (!MACHINE_HAS_TOPOLOGY) { - update_cpu_masks(); - topology_update_polarization_simple(); - return 0; + if (MACHINE_HAS_TOPOLOGY) { + rc = 1; + store_topology(info); + tl_to_masks(info); } - store_topology(info); - tl_to_masks(info); update_cpu_masks(); + if (!MACHINE_HAS_TOPOLOGY) + topology_update_polarization_simple(); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); kobject_uevent(&dev->kobj, KOBJ_CHANGE); } - return 1; + return rc; } static void topology_work_fn(struct work_struct *work) diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c index 7bea81d8a363..9861613fb35a 100644 --- a/arch/s390/kernel/traps.c +++ b/arch/s390/kernel/traps.c @@ -19,7 +19,7 @@ #include <linux/sched.h> #include <linux/mm.h> #include <linux/slab.h> -#include <asm/switch_to.h> +#include <asm/fpu-internal.h> #include "entry.h" int show_unhandled_signals = 1; @@ -151,7 +151,7 @@ DO_ERROR_INFO(special_op_exception, SIGILL, ILL_ILLOPN, DO_ERROR_INFO(transaction_exception, SIGILL, ILL_ILLOPN, "transaction constraint exception") -static inline void do_fp_trap(struct pt_regs *regs, int fpc) +static inline void do_fp_trap(struct pt_regs *regs, __u32 fpc) { int si_code = 0; /* FPC[2] is Data Exception Code */ @@ -227,7 +227,7 @@ DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN, int alloc_vector_registers(struct task_struct *tsk) { __vector128 *vxrs; - int i; + freg_t *fprs; /* Allocate vector register save area. */ vxrs = kzalloc(sizeof(__vector128) * __NUM_VXRS, @@ -236,15 +236,13 @@ int alloc_vector_registers(struct task_struct *tsk) return -ENOMEM; preempt_disable(); if (tsk == current) - save_fp_regs(tsk->thread.fp_regs.fprs); + save_fpu_regs(); /* Copy the 16 floating point registers */ - for (i = 0; i < 16; i++) - *(freg_t *) &vxrs[i] = tsk->thread.fp_regs.fprs[i]; - tsk->thread.vxrs = vxrs; - if (tsk == current) { - __ctl_set_bit(0, 17); - restore_vx_regs(vxrs); - } + convert_fp_to_vx(vxrs, tsk->thread.fpu.fprs); + fprs = tsk->thread.fpu.fprs; + tsk->thread.fpu.vxrs = vxrs; + tsk->thread.fpu.flags |= FPU_USE_VX; + kfree(fprs); preempt_enable(); return 0; } @@ -259,8 +257,8 @@ void vector_exception(struct pt_regs *regs) } /* get vector interrupt code from fpc */ - asm volatile("stfpc %0" : "=Q" (current->thread.fp_regs.fpc)); - vic = (current->thread.fp_regs.fpc & 0xf00) >> 8; + save_fpu_regs(); + vic = (current->thread.fpu.fpc & 0xf00) >> 8; switch (vic) { case 1: /* invalid vector operation */ si_code = FPE_FLTINV; @@ -297,22 +295,22 @@ void data_exception(struct pt_regs *regs) location = get_trap_ip(regs); - asm volatile("stfpc %0" : "=Q" (current->thread.fp_regs.fpc)); + save_fpu_regs(); /* Check for vector register enablement */ - if (MACHINE_HAS_VX && !current->thread.vxrs && - (current->thread.fp_regs.fpc & FPC_DXC_MASK) == 0xfe00) { + if (MACHINE_HAS_VX && !is_vx_task(current) && + (current->thread.fpu.fpc & FPC_DXC_MASK) == 0xfe00) { alloc_vector_registers(current); /* Vector data exception is suppressing, rewind psw. */ regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16); clear_pt_regs_flag(regs, PIF_PER_TRAP); return; } - if (current->thread.fp_regs.fpc & FPC_DXC_MASK) + if (current->thread.fpu.fpc & FPC_DXC_MASK) signal = SIGFPE; else signal = SIGILL; if (signal == SIGFPE) - do_fp_trap(regs, current->thread.fp_regs.fpc); + do_fp_trap(regs, current->thread.fpu.fpc); else if (signal) do_trap(regs, signal, ILL_ILLOPN, "data exception"); } diff --git a/arch/s390/kernel/vdso32/Makefile b/arch/s390/kernel/vdso32/Makefile index 8ad2b34ad151..ee8a18e50a25 100644 --- a/arch/s390/kernel/vdso32/Makefile +++ b/arch/s390/kernel/vdso32/Makefile @@ -13,7 +13,7 @@ KBUILD_AFLAGS_31 += -m31 -s KBUILD_CFLAGS_31 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_31 += -m31 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_31 += -nostdlib -Wl,-soname=linux-vdso32.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) + $(call cc-ldoption, -Wl$(comma)--hash-style=both) $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_31) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_31) diff --git a/arch/s390/kernel/vdso64/Makefile b/arch/s390/kernel/vdso64/Makefile index 2a8ddfd12a5b..c4b03f9ed228 100644 --- a/arch/s390/kernel/vdso64/Makefile +++ b/arch/s390/kernel/vdso64/Makefile @@ -13,7 +13,7 @@ KBUILD_AFLAGS_64 += -m64 -s KBUILD_CFLAGS_64 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_64 += -m64 -fPIC -shared -fno-common -fno-builtin KBUILD_CFLAGS_64 += -nostdlib -Wl,-soname=linux-vdso64.so.1 \ - $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) + $(call cc-ldoption, -Wl$(comma)--hash-style=both) $(targets:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_64) $(targets:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_64) diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index e53d3595a7c8..b9ce650e9e99 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -28,6 +28,7 @@ static atomic64_t virt_timer_elapsed; static DEFINE_PER_CPU(u64, mt_cycles[32]); static DEFINE_PER_CPU(u64, mt_scaling_mult) = { 1 }; static DEFINE_PER_CPU(u64, mt_scaling_div) = { 1 }; +static DEFINE_PER_CPU(u64, mt_scaling_jiffies); static inline u64 get_vtimer(void) { @@ -85,7 +86,8 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock; /* Do MT utilization calculation */ - if (smp_cpu_mtid) { + if (smp_cpu_mtid && + time_after64(jiffies_64, __this_cpu_read(mt_scaling_jiffies))) { u64 cycles_new[32], *cycles_old; u64 delta, mult, div; @@ -105,6 +107,7 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) sizeof(u64) * (smp_cpu_mtid + 1)); } } + __this_cpu_write(mt_scaling_jiffies, jiffies_64); } user = S390_lowcore.user_timer - ti->user_timer; @@ -376,4 +379,11 @@ void vtime_init(void) { /* set initial cpu timer */ set_vtimer(VTIMER_MAX_SLICE); + /* Setup initial MT scaling values */ + if (smp_cpu_mtid) { + __this_cpu_write(mt_scaling_jiffies, jiffies); + __this_cpu_write(mt_scaling_mult, 1); + __this_cpu_write(mt_scaling_div, 1); + stcctm5(smp_cpu_mtid + 1, this_cpu_ptr(mt_cycles)); + } } diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c index fc7ec95848c3..5fbfb88f8477 100644 --- a/arch/s390/kvm/diag.c +++ b/arch/s390/kvm/diag.c @@ -27,13 +27,13 @@ static int diag_release_pages(struct kvm_vcpu *vcpu) start = vcpu->run->s.regs.gprs[(vcpu->arch.sie_block->ipa & 0xf0) >> 4]; end = vcpu->run->s.regs.gprs[vcpu->arch.sie_block->ipa & 0xf] + 4096; + vcpu->stat.diagnose_10++; if (start & ~PAGE_MASK || end & ~PAGE_MASK || start >= end || start < 2 * PAGE_SIZE) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); VCPU_EVENT(vcpu, 5, "diag release pages %lX %lX", start, end); - vcpu->stat.diagnose_10++; /* * We checked for start >= end above, so lets check for the @@ -75,6 +75,9 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) u16 rx = (vcpu->arch.sie_block->ipa & 0xf0) >> 4; u16 ry = (vcpu->arch.sie_block->ipa & 0x0f); + VCPU_EVENT(vcpu, 3, "diag page reference parameter block at 0x%llx", + vcpu->run->s.regs.gprs[rx]); + vcpu->stat.diagnose_258++; if (vcpu->run->s.regs.gprs[rx] & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm)); @@ -85,6 +88,9 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) switch (parm.subcode) { case 0: /* TOKEN */ + VCPU_EVENT(vcpu, 3, "pageref token addr 0x%llx " + "select mask 0x%llx compare mask 0x%llx", + parm.token_addr, parm.select_mask, parm.compare_mask); if (vcpu->arch.pfault_token != KVM_S390_PFAULT_TOKEN_INVALID) { /* * If the pagefault handshake is already activated, @@ -114,6 +120,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu) * the cancel, therefore to reduce code complexity, we assume * all outstanding tokens are already pending. */ + VCPU_EVENT(vcpu, 3, "pageref cancel addr 0x%llx", parm.token_addr); if (parm.token_addr || parm.select_mask || parm.compare_mask || parm.zarch) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); @@ -174,7 +181,8 @@ static int __diag_ipl_functions(struct kvm_vcpu *vcpu) unsigned int reg = vcpu->arch.sie_block->ipa & 0xf; unsigned long subcode = vcpu->run->s.regs.gprs[reg] & 0xffff; - VCPU_EVENT(vcpu, 5, "diag ipl functions, subcode %lx", subcode); + VCPU_EVENT(vcpu, 3, "diag ipl functions, subcode %lx", subcode); + vcpu->stat.diagnose_308++; switch (subcode) { case 3: vcpu->run->s390_reset_flags = KVM_S390_RESET_CLEAR; @@ -202,6 +210,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu) { int ret; + vcpu->stat.diagnose_500++; /* No virtio-ccw notification? Get out quickly. */ if (!vcpu->kvm->arch.css_support || (vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY)) diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c index e97b3455d7e6..47518a324d75 100644 --- a/arch/s390/kvm/guestdbg.c +++ b/arch/s390/kvm/guestdbg.c @@ -473,10 +473,45 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->iprcc &= ~PGM_PER; } +#define pssec(vcpu) (vcpu->arch.sie_block->gcr[1] & _ASCE_SPACE_SWITCH) +#define hssec(vcpu) (vcpu->arch.sie_block->gcr[13] & _ASCE_SPACE_SWITCH) +#define old_ssec(vcpu) ((vcpu->arch.sie_block->tecmc >> 31) & 0x1) +#define old_as_is_home(vcpu) !(vcpu->arch.sie_block->tecmc & 0xffff) + void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu) { + int new_as; + if (debug_exit_required(vcpu)) vcpu->guest_debug |= KVM_GUESTDBG_EXIT_PENDING; filter_guest_per_event(vcpu); + + /* + * Only RP, SAC, SACF, PT, PTI, PR, PC instructions can trigger + * a space-switch event. PER events enforce space-switch events + * for these instructions. So if no PER event for the guest is left, + * we might have to filter the space-switch element out, too. + */ + if (vcpu->arch.sie_block->iprcc == PGM_SPACE_SWITCH) { + vcpu->arch.sie_block->iprcc = 0; + new_as = psw_bits(vcpu->arch.sie_block->gpsw).as; + + /* + * If the AS changed from / to home, we had RP, SAC or SACF + * instruction. Check primary and home space-switch-event + * controls. (theoretically home -> home produced no event) + */ + if (((new_as == PSW_AS_HOME) ^ old_as_is_home(vcpu)) && + (pssec(vcpu) || hssec(vcpu))) + vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; + + /* + * PT, PTI, PR, PC instruction operate on primary AS only. Check + * if the primary-space-switch-event control was or got set. + */ + if (new_as == PSW_AS_PRIMARY && !old_as_is_home(vcpu) && + (pssec(vcpu) || old_ssec(vcpu))) + vcpu->arch.sie_block->iprcc = PGM_SPACE_SWITCH; + } } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 57309e9cdd80..5c2c169395c3 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -30,7 +30,6 @@ #define IOINT_SCHID_MASK 0x0000ffff #define IOINT_SSID_MASK 0x00030000 #define IOINT_CSSID_MASK 0x03fc0000 -#define IOINT_AI_MASK 0x04000000 #define PFAULT_INIT 0x0600 #define PFAULT_DONE 0x0680 #define VIRTIO_PARAM 0x0d00 @@ -72,9 +71,13 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu) static int ckc_irq_pending(struct kvm_vcpu *vcpu) { + preempt_disable(); if (!(vcpu->arch.sie_block->ckc < - get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) + get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) { + preempt_enable(); return 0; + } + preempt_enable(); return ckc_interrupts_enabled(vcpu); } @@ -311,8 +314,8 @@ static int __must_check __deliver_pfault_init(struct kvm_vcpu *vcpu) li->irq.ext.ext_params2 = 0; spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "interrupt: pfault init parm:%x,parm64:%llx", - 0, ext.ext_params2); + VCPU_EVENT(vcpu, 4, "deliver: pfault init token 0x%llx", + ext.ext_params2); trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT, 0, ext.ext_params2); @@ -368,7 +371,7 @@ static int __must_check __deliver_machine_check(struct kvm_vcpu *vcpu) spin_unlock(&fi->lock); if (deliver) { - VCPU_EVENT(vcpu, 4, "interrupt: machine check mcic=%llx", + VCPU_EVENT(vcpu, 3, "deliver: machine check mcic 0x%llx", mchk.mcic); trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_MCHK, @@ -403,7 +406,7 @@ static int __must_check __deliver_restart(struct kvm_vcpu *vcpu) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; int rc; - VCPU_EVENT(vcpu, 4, "%s", "interrupt: cpu restart"); + VCPU_EVENT(vcpu, 3, "%s", "deliver: cpu restart"); vcpu->stat.deliver_restart_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); @@ -427,7 +430,6 @@ static int __must_check __deliver_set_prefix(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_SET_PREFIX, &li->pending_irqs); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "interrupt: set prefix to %x", prefix.address); vcpu->stat.deliver_prefix_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, @@ -450,7 +452,7 @@ static int __must_check __deliver_emergency_signal(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp emerg"); + VCPU_EVENT(vcpu, 4, "%s", "deliver: sigp emerg"); vcpu->stat.deliver_emergency_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, cpu_addr, 0); @@ -477,7 +479,7 @@ static int __must_check __deliver_external_call(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_EXT_EXTERNAL, &li->pending_irqs); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "%s", "interrupt: sigp ext call"); + VCPU_EVENT(vcpu, 4, "%s", "deliver: sigp ext call"); vcpu->stat.deliver_external_call++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, @@ -506,7 +508,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu) memset(&li->irq.pgm, 0, sizeof(pgm_info)); spin_unlock(&li->lock); - VCPU_EVENT(vcpu, 4, "interrupt: pgm check code:%x, ilc:%x", + VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d", pgm_info.code, ilc); vcpu->stat.deliver_program_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, @@ -622,7 +624,7 @@ static int __must_check __deliver_service(struct kvm_vcpu *vcpu) clear_bit(IRQ_PEND_EXT_SERVICE, &fi->pending_irqs); spin_unlock(&fi->lock); - VCPU_EVENT(vcpu, 4, "interrupt: sclp parm:%x", + VCPU_EVENT(vcpu, 4, "deliver: sclp parameter 0x%x", ext.ext_params); vcpu->stat.deliver_service_signal++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_INT_SERVICE, @@ -651,9 +653,6 @@ static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) struct kvm_s390_interrupt_info, list); if (inti) { - trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, - KVM_S390_INT_PFAULT_DONE, 0, - inti->ext.ext_params2); list_del(&inti->list); fi->counters[FIRQ_CNTR_PFAULT] -= 1; } @@ -662,6 +661,12 @@ static int __must_check __deliver_pfault_done(struct kvm_vcpu *vcpu) spin_unlock(&fi->lock); if (inti) { + trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, + KVM_S390_INT_PFAULT_DONE, 0, + inti->ext.ext_params2); + VCPU_EVENT(vcpu, 4, "deliver: pfault done token 0x%llx", + inti->ext.ext_params2); + rc = put_guest_lc(vcpu, EXT_IRQ_CP_SERVICE, (u16 *)__LC_EXT_INT_CODE); rc |= put_guest_lc(vcpu, PFAULT_DONE, @@ -691,7 +696,7 @@ static int __must_check __deliver_virtio(struct kvm_vcpu *vcpu) list); if (inti) { VCPU_EVENT(vcpu, 4, - "interrupt: virtio parm:%x,parm64:%llx", + "deliver: virtio parm: 0x%x,parm64: 0x%llx", inti->ext.ext_params, inti->ext.ext_params2); vcpu->stat.deliver_virtio_interrupt++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, @@ -741,7 +746,7 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu, struct kvm_s390_interrupt_info, list); if (inti) { - VCPU_EVENT(vcpu, 4, "interrupt: I/O %llx", inti->type); + VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type); vcpu->stat.deliver_io_int++; trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, inti->type, @@ -855,7 +860,9 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) goto no_timer; } + preempt_disable(); now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; + preempt_enable(); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* underflow */ @@ -864,7 +871,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu) __set_cpu_idle(vcpu); hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL); - VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime); + VCPU_EVENT(vcpu, 4, "enabled wait via clock comparator: %llu ns", sltime); no_timer: srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); kvm_vcpu_block(vcpu); @@ -894,7 +901,9 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer) u64 now, sltime; vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer); + preempt_disable(); now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch; + preempt_enable(); sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now); /* @@ -968,6 +977,10 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; + VCPU_EVENT(vcpu, 3, "inject: program irq code 0x%x", irq->u.pgm.code); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, + irq->u.pgm.code, 0); + li->irq.pgm = irq->u.pgm; set_bit(IRQ_PEND_PROG, &li->pending_irqs); return 0; @@ -978,9 +991,6 @@ int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_irq irq; - VCPU_EVENT(vcpu, 3, "inject: program check %d (from kernel)", code); - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, code, - 0, 1); spin_lock(&li->lock); irq.u.pgm.code = code; __inject_prog(vcpu, &irq); @@ -996,10 +1006,6 @@ int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu, struct kvm_s390_irq irq; int rc; - VCPU_EVENT(vcpu, 3, "inject: prog irq %d (from kernel)", - pgm_info->code); - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT, - pgm_info->code, 0, 1); spin_lock(&li->lock); irq.u.pgm = *pgm_info; rc = __inject_prog(vcpu, &irq); @@ -1012,11 +1018,11 @@ static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: external irq params:%x, params2:%llx", - irq->u.ext.ext_params, irq->u.ext.ext_params2); + VCPU_EVENT(vcpu, 4, "inject: pfault init parameter block at 0x%llx", + irq->u.ext.ext_params2); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_PFAULT_INIT, irq->u.ext.ext_params, - irq->u.ext.ext_params2, 2); + irq->u.ext.ext_params2); li->irq.ext = irq->u.ext; set_bit(IRQ_PEND_PFAULT_INIT, &li->pending_irqs); @@ -1045,10 +1051,10 @@ static int __inject_extcall(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_extcall_info *extcall = &li->irq.extcall; uint16_t src_id = irq->u.extcall.code; - VCPU_EVENT(vcpu, 3, "inject: external call source-cpu:%u", + VCPU_EVENT(vcpu, 4, "inject: external call source-cpu:%u", src_id); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EXTERNAL_CALL, - src_id, 0, 2); + src_id, 0); /* sending vcpu invalid */ if (src_id >= KVM_MAX_VCPUS || @@ -1070,10 +1076,10 @@ static int __inject_set_prefix(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_prefix_info *prefix = &li->irq.prefix; - VCPU_EVENT(vcpu, 3, "inject: set prefix to %x (from user)", + VCPU_EVENT(vcpu, 3, "inject: set prefix to %x", irq->u.prefix.address); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_SET_PREFIX, - irq->u.prefix.address, 0, 2); + irq->u.prefix.address, 0); if (!is_vcpu_stopped(vcpu)) return -EBUSY; @@ -1090,7 +1096,7 @@ static int __inject_sigp_stop(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_stop_info *stop = &li->irq.stop; int rc = 0; - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0, 2); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_SIGP_STOP, 0, 0); if (irq->u.stop.flags & ~KVM_S390_STOP_SUPP_FLAGS) return -EINVAL; @@ -1114,8 +1120,8 @@ static int __inject_sigp_restart(struct kvm_vcpu *vcpu, { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: restart type %llx", irq->type); - trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0, 2); + VCPU_EVENT(vcpu, 3, "%s", "inject: restart int"); + trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_RESTART, 0, 0); set_bit(IRQ_PEND_RESTART, &li->pending_irqs); return 0; @@ -1126,10 +1132,10 @@ static int __inject_sigp_emergency(struct kvm_vcpu *vcpu, { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: emergency %u\n", + VCPU_EVENT(vcpu, 4, "inject: emergency from cpu %u", irq->u.emerg.code); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_EMERGENCY, - irq->u.emerg.code, 0, 2); + irq->u.emerg.code, 0); set_bit(irq->u.emerg.code, li->sigp_emerg_pending); set_bit(IRQ_PEND_EXT_EMERGENCY, &li->pending_irqs); @@ -1142,10 +1148,10 @@ static int __inject_mchk(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; struct kvm_s390_mchk_info *mchk = &li->irq.mchk; - VCPU_EVENT(vcpu, 5, "inject: machine check parm64:%llx", + VCPU_EVENT(vcpu, 3, "inject: machine check mcic 0x%llx", irq->u.mchk.mcic); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_MCHK, 0, - irq->u.mchk.mcic, 2); + irq->u.mchk.mcic); /* * Because repressible machine checks can be indicated along with @@ -1172,9 +1178,9 @@ static int __inject_ckc(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: type %x", KVM_S390_INT_CLOCK_COMP); + VCPU_EVENT(vcpu, 3, "%s", "inject: clock comparator external"); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CLOCK_COMP, - 0, 0, 2); + 0, 0); set_bit(IRQ_PEND_EXT_CLOCK_COMP, &li->pending_irqs); atomic_or(CPUSTAT_EXT_INT, li->cpuflags); @@ -1185,9 +1191,9 @@ static int __inject_cpu_timer(struct kvm_vcpu *vcpu) { struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int; - VCPU_EVENT(vcpu, 3, "inject: type %x", KVM_S390_INT_CPU_TIMER); + VCPU_EVENT(vcpu, 3, "%s", "inject: cpu timer external"); trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_INT_CPU_TIMER, - 0, 0, 2); + 0, 0); set_bit(IRQ_PEND_EXT_CPU_TIMER, &li->pending_irqs); atomic_or(CPUSTAT_EXT_INT, li->cpuflags); @@ -1435,20 +1441,20 @@ int kvm_s390_inject_vm(struct kvm *kvm, inti->ext.ext_params2 = s390int->parm64; break; case KVM_S390_INT_SERVICE: - VM_EVENT(kvm, 5, "inject: sclp parm:%x", s390int->parm); + VM_EVENT(kvm, 4, "inject: sclp parm:%x", s390int->parm); inti->ext.ext_params = s390int->parm; break; case KVM_S390_INT_PFAULT_DONE: inti->ext.ext_params2 = s390int->parm64; break; case KVM_S390_MCHK: - VM_EVENT(kvm, 5, "inject: machine check parm64:%llx", + VM_EVENT(kvm, 3, "inject: machine check mcic 0x%llx", s390int->parm64); inti->mchk.cr14 = s390int->parm; /* upper bits are not used */ inti->mchk.mcic = s390int->parm64; break; case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX: - if (inti->type & IOINT_AI_MASK) + if (inti->type & KVM_S390_INT_IO_AI_MASK) VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)"); else VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x", @@ -1535,8 +1541,6 @@ static int do_inject_vcpu(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq) switch (irq->type) { case KVM_S390_PROGRAM_INT: - VCPU_EVENT(vcpu, 3, "inject: program check %d (from user)", - irq->u.pgm.code); rc = __inject_prog(vcpu, irq); break; case KVM_S390_SIGP_SET_PREFIX: diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index b73302fb0507..c91eb941b444 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -28,6 +28,7 @@ #include <linux/vmalloc.h> #include <asm/asm-offsets.h> #include <asm/lowcore.h> +#include <asm/etr.h> #include <asm/pgtable.h> #include <asm/nmi.h> #include <asm/switch_to.h> @@ -108,6 +109,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "diagnose_10", VCPU_STAT(diagnose_10) }, { "diagnose_44", VCPU_STAT(diagnose_44) }, { "diagnose_9c", VCPU_STAT(diagnose_9c) }, + { "diagnose_258", VCPU_STAT(diagnose_258) }, + { "diagnose_308", VCPU_STAT(diagnose_308) }, + { "diagnose_500", VCPU_STAT(diagnose_500) }, { NULL } }; @@ -124,6 +128,7 @@ unsigned long kvm_s390_fac_list_mask_size(void) } static struct gmap_notifier gmap_notifier; +debug_info_t *kvm_s390_dbf; /* Section: not file related */ int kvm_arch_hardware_enable(void) @@ -134,24 +139,69 @@ int kvm_arch_hardware_enable(void) static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address); +/* + * This callback is executed during stop_machine(). All CPUs are therefore + * temporarily stopped. In order not to change guest behavior, we have to + * disable preemption whenever we touch the epoch of kvm and the VCPUs, + * so a CPU won't be stopped while calculating with the epoch. + */ +static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val, + void *v) +{ + struct kvm *kvm; + struct kvm_vcpu *vcpu; + int i; + unsigned long long *delta = v; + + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm->arch.epoch -= *delta; + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.sie_block->epoch -= *delta; + } + } + return NOTIFY_OK; +} + +static struct notifier_block kvm_clock_notifier = { + .notifier_call = kvm_clock_sync, +}; + int kvm_arch_hardware_setup(void) { gmap_notifier.notifier_call = kvm_gmap_notifier; gmap_register_ipte_notifier(&gmap_notifier); + atomic_notifier_chain_register(&s390_epoch_delta_notifier, + &kvm_clock_notifier); return 0; } void kvm_arch_hardware_unsetup(void) { gmap_unregister_ipte_notifier(&gmap_notifier); + atomic_notifier_chain_unregister(&s390_epoch_delta_notifier, + &kvm_clock_notifier); } int kvm_arch_init(void *opaque) { + kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long)); + if (!kvm_s390_dbf) + return -ENOMEM; + + if (debug_register_view(kvm_s390_dbf, &debug_sprintf_view)) { + debug_unregister(kvm_s390_dbf); + return -ENOMEM; + } + /* Register floating interrupt controller interface. */ return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC); } +void kvm_arch_exit(void) +{ + debug_unregister(kvm_s390_dbf); +} + /* Section: device related */ long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) @@ -281,10 +331,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) switch (cap->cap) { case KVM_CAP_S390_IRQCHIP: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_IRQCHIP"); kvm->arch.use_irqchip = 1; r = 0; break; case KVM_CAP_S390_USER_SIGP: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_SIGP"); kvm->arch.user_sigp = 1; r = 0; break; @@ -295,8 +347,11 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) r = 0; } else r = -EINVAL; + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_VECTOR_REGISTERS %s", + r ? "(not available)" : "(success)"); break; case KVM_CAP_S390_USER_STSI: + VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI"); kvm->arch.user_stsi = 1; r = 0; break; @@ -314,6 +369,8 @@ static int kvm_s390_get_mem_control(struct kvm *kvm, struct kvm_device_attr *att switch (attr->attr) { case KVM_S390_VM_MEM_LIMIT_SIZE: ret = 0; + VM_EVENT(kvm, 3, "QUERY: max guest memory: %lu bytes", + kvm->arch.gmap->asce_end); if (put_user(kvm->arch.gmap->asce_end, (u64 __user *)attr->addr)) ret = -EFAULT; break; @@ -330,7 +387,13 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att unsigned int idx; switch (attr->attr) { case KVM_S390_VM_MEM_ENABLE_CMMA: + /* enable CMMA only for z10 and later (EDAT_1) */ + ret = -EINVAL; + if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1) + break; + ret = -EBUSY; + VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support"); mutex_lock(&kvm->lock); if (atomic_read(&kvm->online_vcpus) == 0) { kvm->arch.use_cmma = 1; @@ -339,6 +402,11 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att mutex_unlock(&kvm->lock); break; case KVM_S390_VM_MEM_CLR_CMMA: + ret = -EINVAL; + if (!kvm->arch.use_cmma) + break; + + VM_EVENT(kvm, 3, "%s", "RESET: CMMA states"); mutex_lock(&kvm->lock); idx = srcu_read_lock(&kvm->srcu); s390_reset_cmma(kvm->arch.gmap->mm); @@ -374,6 +442,7 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att } } mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: max guest memory: %lu bytes", new_limit); break; } default: @@ -400,22 +469,26 @@ static int kvm_s390_vm_set_crypto(struct kvm *kvm, struct kvm_device_attr *attr) kvm->arch.crypto.crycb->aes_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); kvm->arch.crypto.aes_kw = 1; + VM_EVENT(kvm, 3, "%s", "ENABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_ENABLE_DEA_KW: get_random_bytes( kvm->arch.crypto.crycb->dea_wrapping_key_mask, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); kvm->arch.crypto.dea_kw = 1; + VM_EVENT(kvm, 3, "%s", "ENABLE: DEA keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_AES_KW: kvm->arch.crypto.aes_kw = 0; memset(kvm->arch.crypto.crycb->aes_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask)); + VM_EVENT(kvm, 3, "%s", "DISABLE: AES keywrapping support"); break; case KVM_S390_VM_CRYPTO_DISABLE_DEA_KW: kvm->arch.crypto.dea_kw = 0; memset(kvm->arch.crypto.crycb->dea_wrapping_key_mask, 0, sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask)); + VM_EVENT(kvm, 3, "%s", "DISABLE: DEA keywrapping support"); break; default: mutex_unlock(&kvm->lock); @@ -440,6 +513,7 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (gtod_high != 0) return -EINVAL; + VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high); return 0; } @@ -459,12 +533,15 @@ static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) return r; mutex_lock(&kvm->lock); + preempt_disable(); kvm->arch.epoch = gtod - host_tod; kvm_s390_vcpu_block_all(kvm); kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm) cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch; kvm_s390_vcpu_unblock_all(kvm); + preempt_enable(); mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod); return 0; } @@ -496,6 +573,7 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr) if (copy_to_user((void __user *)attr->addr, >od_high, sizeof(gtod_high))) return -EFAULT; + VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high); return 0; } @@ -509,9 +587,12 @@ static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr) if (r) return r; + preempt_disable(); gtod = host_tod + kvm->arch.epoch; + preempt_enable(); if (copy_to_user((void __user *)attr->addr, >od, sizeof(gtod))) return -EFAULT; + VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod); return 0; } @@ -821,7 +902,9 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args) } /* Enable storage key handling for the guest */ - s390_enable_skey(); + r = s390_enable_skey(); + if (r) + goto out; for (i = 0; i < args->count; i++) { hva = gfn_to_hva(kvm, args->start_gfn + i); @@ -879,8 +962,7 @@ long kvm_arch_vm_ioctl(struct file *filp, if (kvm->arch.use_irqchip) { /* Set up dummy routing. */ memset(&routing, 0, sizeof(routing)); - kvm_set_irq_routing(kvm, &routing, 0, 0); - r = 0; + r = kvm_set_irq_routing(kvm, &routing, 0, 0); } break; } @@ -1043,7 +1125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) sprintf(debug_name, "kvm-%u", current->pid); - kvm->arch.dbf = debug_register(debug_name, 8, 2, 8 * sizeof(long)); + kvm->arch.dbf = debug_register(debug_name, 32, 1, 7 * sizeof(long)); if (!kvm->arch.dbf) goto out_err; @@ -1086,7 +1168,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) mutex_init(&kvm->arch.ipte_mutex); debug_register_view(kvm->arch.dbf, &debug_sprintf_view); - VM_EVENT(kvm, 3, "%s", "vm created"); + VM_EVENT(kvm, 3, "vm created with type %lu", type); if (type & KVM_VM_S390_UCONTROL) { kvm->arch.gmap = NULL; @@ -1103,6 +1185,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm->arch.epoch = 0; spin_lock_init(&kvm->arch.start_stop_lock); + KVM_EVENT(3, "vm 0x%p created by pid %u", kvm, current->pid); return 0; out_err: @@ -1110,6 +1193,7 @@ out_err: free_page((unsigned long)kvm->arch.model.fac); debug_unregister(kvm->arch.dbf); free_page((unsigned long)(kvm->arch.sca)); + KVM_EVENT(3, "creation of vm failed: %d", rc); return rc; } @@ -1131,7 +1215,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) if (kvm_is_ucontrol(vcpu->kvm)) gmap_free(vcpu->arch.gmap); - if (kvm_s390_cmma_enabled(vcpu->kvm)) + if (vcpu->kvm->arch.use_cmma) kvm_s390_vcpu_unsetup_cmma(vcpu); free_page((unsigned long)(vcpu->arch.sie_block)); @@ -1166,6 +1250,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm) gmap_free(kvm->arch.gmap); kvm_s390_destroy_adapters(kvm); kvm_s390_clear_float_irqs(kvm); + KVM_EVENT(3, "vm 0x%p destroyed", kvm); } /* Section: vcpu related */ @@ -1198,21 +1283,54 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) return 0; } +/* + * Backs up the current FP/VX register save area on a particular + * destination. Used to switch between different register save + * areas. + */ +static inline void save_fpu_to(struct fpu *dst) +{ + dst->fpc = current->thread.fpu.fpc; + dst->flags = current->thread.fpu.flags; + dst->regs = current->thread.fpu.regs; +} + +/* + * Switches the FP/VX register save area from which to lazy + * restore register contents. + */ +static inline void load_fpu_from(struct fpu *from) +{ + current->thread.fpu.fpc = from->fpc; + current->thread.fpu.flags = from->flags; + current->thread.fpu.regs = from->regs; +} + void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - save_fp_ctl(&vcpu->arch.host_fpregs.fpc); - if (test_kvm_facility(vcpu->kvm, 129)) - save_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs); - else - save_fp_regs(vcpu->arch.host_fpregs.fprs); - save_access_regs(vcpu->arch.host_acrs); + /* Save host register state */ + save_fpu_regs(); + save_fpu_to(&vcpu->arch.host_fpregs); + if (test_kvm_facility(vcpu->kvm, 129)) { - restore_fp_ctl(&vcpu->run->s.regs.fpc); - restore_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs); - } else { - restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc); - restore_fp_regs(vcpu->arch.guest_fpregs.fprs); - } + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; + current->thread.fpu.flags = FPU_USE_VX; + /* + * Use the register save area in the SIE-control block + * for register restore and save in kvm_arch_vcpu_put() + */ + current->thread.fpu.vxrs = + (__vector128 *)&vcpu->run->s.regs.vrs; + /* Always enable the vector extension for KVM */ + __ctl_set_vx(); + } else + load_fpu_from(&vcpu->arch.guest_fpregs); + + if (test_fp_ctl(current->thread.fpu.fpc)) + /* User space provided an invalid FPC, let's clear it */ + current->thread.fpu.fpc = 0; + + save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); gmap_enable(vcpu->arch.gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); @@ -1222,19 +1340,22 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); gmap_disable(vcpu->arch.gmap); - if (test_kvm_facility(vcpu->kvm, 129)) { - save_fp_ctl(&vcpu->run->s.regs.fpc); - save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs); - } else { - save_fp_ctl(&vcpu->arch.guest_fpregs.fpc); - save_fp_regs(vcpu->arch.guest_fpregs.fprs); - } - save_access_regs(vcpu->run->s.regs.acrs); - restore_fp_ctl(&vcpu->arch.host_fpregs.fpc); + + save_fpu_regs(); + if (test_kvm_facility(vcpu->kvm, 129)) - restore_vx_regs((__vector128 *)&vcpu->arch.host_vregs->vrs); + /* + * kvm_arch_vcpu_load() set up the register save area to + * the &vcpu->run->s.regs.vrs and, thus, the vector registers + * are already saved. Only the floating-point control must be + * copied. + */ + vcpu->run->s.regs.fpc = current->thread.fpu.fpc; else - restore_fp_regs(vcpu->arch.host_fpregs.fprs); + save_fpu_to(&vcpu->arch.guest_fpregs); + load_fpu_from(&vcpu->arch.host_fpregs); + + save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); } @@ -1264,7 +1385,9 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) { mutex_lock(&vcpu->kvm->lock); + preempt_disable(); vcpu->arch.sie_block->epoch = vcpu->kvm->arch.epoch; + preempt_enable(); mutex_unlock(&vcpu->kvm->lock); if (!kvm_is_ucontrol(vcpu->kvm)) vcpu->arch.gmap = vcpu->kvm->arch.gmap; @@ -1342,7 +1465,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) } vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; - if (kvm_s390_cmma_enabled(vcpu->kvm)) { + if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); if (rc) return rc; @@ -1377,7 +1500,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.sie_block = &sie_page->sie_block; vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb; - vcpu->arch.host_vregs = &sie_page->vregs; vcpu->arch.sie_block->icpua = id; if (!kvm_is_ucontrol(kvm)) { @@ -1399,6 +1521,19 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, vcpu->arch.local_int.wq = &vcpu->wq; vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags; + /* + * Allocate a save area for floating-point registers. If the vector + * extension is available, register contents are saved in the SIE + * control block. The allocated save area is still required in + * particular places, for example, in kvm_s390_vcpu_store_status(). + */ + vcpu->arch.guest_fpregs.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS, + GFP_KERNEL); + if (!vcpu->arch.guest_fpregs.fprs) { + rc = -ENOMEM; + goto out_free_sie_block; + } + rc = kvm_vcpu_init(vcpu, kvm, id); if (rc) goto out_free_sie_block; @@ -1621,16 +1756,16 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { if (test_fp_ctl(fpu->fpc)) return -EINVAL; - memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); + memcpy(vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs)); vcpu->arch.guest_fpregs.fpc = fpu->fpc; - restore_fp_ctl(&vcpu->arch.guest_fpregs.fpc); - restore_fp_regs(vcpu->arch.guest_fpregs.fprs); + save_fpu_regs(); + load_fpu_from(&vcpu->arch.guest_fpregs); return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs)); + memcpy(&fpu->fprs, vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs)); fpu->fpc = vcpu->arch.guest_fpregs.fpc; return 0; } @@ -1723,18 +1858,6 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return rc; } -bool kvm_s390_cmma_enabled(struct kvm *kvm) -{ - if (!MACHINE_IS_LPAR) - return false; - /* only enable for z10 and later */ - if (!MACHINE_HAS_EDAT1) - return false; - if (!kvm->arch.use_cmma) - return false; - return true; -} - static bool ibs_enabled(struct kvm_vcpu *vcpu) { return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_IBS; @@ -1742,10 +1865,10 @@ static bool ibs_enabled(struct kvm_vcpu *vcpu) static int kvm_s390_handle_requests(struct kvm_vcpu *vcpu) { - if (!vcpu->requests) - return 0; retry: kvm_s390_vcpu_request_handled(vcpu); + if (!vcpu->requests) + return 0; /* * We use MMU_RELOAD just to re-arm the ipte notifier for the * guest prefix page. gmap_ipte_notify will wait on the ptl lock. @@ -2193,8 +2316,21 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) * copying in vcpu load/put. Lets update our copies before we save * it into the save area */ - save_fp_ctl(&vcpu->arch.guest_fpregs.fpc); - save_fp_regs(vcpu->arch.guest_fpregs.fprs); + save_fpu_regs(); + if (test_kvm_facility(vcpu->kvm, 129)) { + /* + * If the vector extension is available, the vector registers + * which overlaps with floating-point registers are saved in + * the SIE-control block. Hence, extract the floating-point + * registers and the FPC value and store them in the + * guest_fpregs structure. + */ + WARN_ON(!is_vx_task(current)); /* XXX remove later */ + vcpu->arch.guest_fpregs.fpc = current->thread.fpu.fpc; + convert_vx_to_fp(vcpu->arch.guest_fpregs.fprs, + current->thread.fpu.vxrs); + } else + save_fpu_to(&vcpu->arch.guest_fpregs); save_access_regs(vcpu->run->s.regs.acrs); return kvm_s390_store_status_unloaded(vcpu, addr); @@ -2221,10 +2357,13 @@ int kvm_s390_vcpu_store_adtl_status(struct kvm_vcpu *vcpu, unsigned long addr) /* * The guest VXRS are in the host VXRs due to the lazy - * copying in vcpu load/put. Let's update our copies before we save - * it into the save area. + * copying in vcpu load/put. We can simply call save_fpu_regs() + * to save the current register state because we are in the + * middle of a load/put cycle. + * + * Let's update our copies before we save it into the save area. */ - save_vx_regs((__vector128 *)&vcpu->run->s.regs.vrs); + save_fpu_regs(); return kvm_s390_store_adtl_status_unloaded(vcpu, addr); } @@ -2340,6 +2479,7 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, case KVM_CAP_S390_CSS_SUPPORT: if (!vcpu->kvm->arch.css_support) { vcpu->kvm->arch.css_support = 1; + VM_EVENT(vcpu->kvm, 3, "%s", "ENABLE: CSS support"); trace_kvm_s390_enable_css(vcpu->kvm); } r = 0; diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index c5704786e473..c446aabf60d3 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -27,6 +27,13 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); #define TDB_FORMAT1 1 #define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) +extern debug_info_t *kvm_s390_dbf; +#define KVM_EVENT(d_loglevel, d_string, d_args...)\ +do { \ + debug_sprintf_event(kvm_s390_dbf, d_loglevel, d_string "\n", \ + d_args); \ +} while (0) + #define VM_EVENT(d_kvm, d_loglevel, d_string, d_args...)\ do { \ debug_sprintf_event(d_kvm->arch.dbf, d_loglevel, d_string "\n", \ @@ -65,6 +72,8 @@ static inline u32 kvm_s390_get_prefix(struct kvm_vcpu *vcpu) static inline void kvm_s390_set_prefix(struct kvm_vcpu *vcpu, u32 prefix) { + VCPU_EVENT(vcpu, 3, "set prefix of cpu %03u to 0x%x", vcpu->vcpu_id, + prefix); vcpu->arch.sie_block->prefix = prefix >> GUEST_PREFIX_SHIFT; kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu); @@ -217,8 +226,6 @@ void exit_sie(struct kvm_vcpu *vcpu); void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu); int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu); void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu); -/* is cmma enabled */ -bool kvm_s390_cmma_enabled(struct kvm *kvm); unsigned long kvm_s390_fac_list_mask_size(void); extern unsigned long kvm_s390_fac_list_mask[]; diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index ad4242245771..4d21dc4d1a84 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -53,11 +53,14 @@ static int handle_set_clock(struct kvm_vcpu *vcpu) kvm_s390_set_psw_cc(vcpu, 3); return 0; } + VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val); val = (val - hostclk) & ~0x3fUL; mutex_lock(&vcpu->kvm->lock); + preempt_disable(); kvm_for_each_vcpu(i, cpup, vcpu->kvm) cpup->arch.sie_block->epoch = val; + preempt_enable(); mutex_unlock(&vcpu->kvm->lock); kvm_s390_set_psw_cc(vcpu, 0); @@ -98,8 +101,6 @@ static int handle_set_prefix(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING); kvm_s390_set_prefix(vcpu, address); - - VCPU_EVENT(vcpu, 5, "setting prefix to %x", address); trace_kvm_s390_handle_prefix(vcpu, 1, address); return 0; } @@ -129,7 +130,7 @@ static int handle_store_prefix(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - VCPU_EVENT(vcpu, 5, "storing prefix to %x", address); + VCPU_EVENT(vcpu, 3, "STPX: storing prefix 0x%x into 0x%llx", address, operand2); trace_kvm_s390_handle_prefix(vcpu, 0, address); return 0; } @@ -155,7 +156,7 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - VCPU_EVENT(vcpu, 5, "storing cpu address to %llx", ga); + VCPU_EVENT(vcpu, 3, "STAP: storing cpu address (%u) to 0x%llx", vcpu_id, ga); trace_kvm_s390_handle_stap(vcpu, ga); return 0; } @@ -167,6 +168,7 @@ static int __skey_check_enable(struct kvm_vcpu *vcpu) return rc; rc = s390_enable_skey(); + VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest"); trace_kvm_s390_skey_related_inst(vcpu); vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); return rc; @@ -370,7 +372,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu) &fac, sizeof(fac)); if (rc) return rc; - VCPU_EVENT(vcpu, 5, "store facility list value %x", fac); + VCPU_EVENT(vcpu, 3, "STFL: store facility list 0x%x", fac); trace_kvm_s390_handle_stfl(vcpu, fac); return 0; } @@ -468,7 +470,7 @@ static int handle_stidp(struct kvm_vcpu *vcpu) if (rc) return kvm_s390_inject_prog_cond(vcpu, rc); - VCPU_EVENT(vcpu, 5, "%s", "store cpu id"); + VCPU_EVENT(vcpu, 3, "STIDP: store cpu id 0x%llx", stidp_data); return 0; } @@ -521,7 +523,7 @@ static int handle_stsi(struct kvm_vcpu *vcpu) ar_t ar; vcpu->stat.instruction_stsi++; - VCPU_EVENT(vcpu, 4, "stsi: fc: %x sel1: %x sel2: %x", fc, sel1, sel2); + VCPU_EVENT(vcpu, 3, "STSI: fc: %u sel1: %u sel2: %u", fc, sel1, sel2); if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); @@ -758,10 +760,10 @@ static int handle_essa(struct kvm_vcpu *vcpu) struct gmap *gmap; int i; - VCPU_EVENT(vcpu, 5, "cmma release %d pages", entries); + VCPU_EVENT(vcpu, 4, "ESSA: release %d pages", entries); gmap = vcpu->arch.gmap; vcpu->stat.instruction_essa++; - if (!kvm_s390_cmma_enabled(vcpu->kvm)) + if (!vcpu->kvm->arch.use_cmma) return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE) @@ -829,7 +831,7 @@ int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu) if (ga & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "lctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "LCTL: r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_lctl(vcpu, 0, reg1, reg3, ga); nr_regs = ((reg3 - reg1) & 0xf) + 1; @@ -868,7 +870,7 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu) if (ga & 3) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "stctl r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "STCTL r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_stctl(vcpu, 0, reg1, reg3, ga); reg = reg1; @@ -902,7 +904,7 @@ static int handle_lctlg(struct kvm_vcpu *vcpu) if (ga & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "lctlg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "LCTLG: r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_lctl(vcpu, 1, reg1, reg3, ga); nr_regs = ((reg3 - reg1) & 0xf) + 1; @@ -940,7 +942,7 @@ static int handle_stctg(struct kvm_vcpu *vcpu) if (ga & 7) return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION); - VCPU_EVENT(vcpu, 5, "stctg r1:%x, r3:%x, addr:%llx", reg1, reg3, ga); + VCPU_EVENT(vcpu, 4, "STCTG r1:%d, r3:%d, addr: 0x%llx", reg1, reg3, ga); trace_kvm_s390_handle_stctl(vcpu, 1, reg1, reg3, ga); reg = reg1; diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c index 72e58bd2bee7..da690b69f9fe 100644 --- a/arch/s390/kvm/sigp.c +++ b/arch/s390/kvm/sigp.c @@ -205,9 +205,6 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, *reg &= 0xffffffff00000000UL; *reg |= SIGP_STATUS_INCORRECT_STATE; return SIGP_CC_STATUS_STORED; - } else if (rc == 0) { - VCPU_EVENT(vcpu, 4, "set prefix of cpu %02x to %x", - dst_vcpu->vcpu_id, irq.u.prefix.address); } return rc; @@ -371,7 +368,8 @@ static int handle_sigp_dst(struct kvm_vcpu *vcpu, u8 order_code, return rc; } -static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code) +static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code, + u16 cpu_addr) { if (!vcpu->kvm->arch.user_sigp) return 0; @@ -414,9 +412,8 @@ static int handle_sigp_order_in_user_space(struct kvm_vcpu *vcpu, u8 order_code) default: vcpu->stat.instruction_sigp_unknown++; } - - VCPU_EVENT(vcpu, 4, "sigp order %u: completely handled in user space", - order_code); + VCPU_EVENT(vcpu, 3, "SIGP: order %u for CPU %d handled in userspace", + order_code, cpu_addr); return 1; } @@ -435,7 +432,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu) return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP); order_code = kvm_s390_get_base_disp_rs(vcpu, NULL); - if (handle_sigp_order_in_user_space(vcpu, order_code)) + if (handle_sigp_order_in_user_space(vcpu, order_code, cpu_addr)) return -EOPNOTSUPP; if (r1 % 2) diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 3208d33a48cb..cc1d6c68356f 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -105,11 +105,22 @@ TRACE_EVENT(kvm_s390_vcpu_start_stop, {KVM_S390_PROGRAM_INT, "program interrupt"}, \ {KVM_S390_SIGP_SET_PREFIX, "sigp set prefix"}, \ {KVM_S390_RESTART, "sigp restart"}, \ + {KVM_S390_INT_PFAULT_INIT, "pfault init"}, \ + {KVM_S390_INT_PFAULT_DONE, "pfault done"}, \ + {KVM_S390_MCHK, "machine check"}, \ + {KVM_S390_INT_CLOCK_COMP, "clock comparator"}, \ + {KVM_S390_INT_CPU_TIMER, "cpu timer"}, \ {KVM_S390_INT_VIRTIO, "virtio interrupt"}, \ {KVM_S390_INT_SERVICE, "sclp interrupt"}, \ {KVM_S390_INT_EMERGENCY, "sigp emergency"}, \ {KVM_S390_INT_EXTERNAL_CALL, "sigp ext call"} +#define get_irq_name(__type) \ + (__type > KVM_S390_INT_IO_MAX ? \ + __print_symbolic(__type, kvm_s390_int_type) : \ + (__type & KVM_S390_INT_IO_AI_MASK ? \ + "adapter I/O interrupt" : "subchannel I/O interrupt")) + TRACE_EVENT(kvm_s390_inject_vm, TP_PROTO(__u64 type, __u32 parm, __u64 parm64, int who), TP_ARGS(type, parm, parm64, who), @@ -131,22 +142,19 @@ TRACE_EVENT(kvm_s390_inject_vm, TP_printk("inject%s: type:%x (%s) parm:%x parm64:%llx", (__entry->who == 1) ? " (from kernel)" : (__entry->who == 2) ? " (from user)" : "", - __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), + __entry->inttype, get_irq_name(__entry->inttype), __entry->parm, __entry->parm64) ); TRACE_EVENT(kvm_s390_inject_vcpu, - TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64, \ - int who), - TP_ARGS(id, type, parm, parm64, who), + TP_PROTO(unsigned int id, __u64 type, __u32 parm, __u64 parm64), + TP_ARGS(id, type, parm, parm64), TP_STRUCT__entry( __field(int, id) __field(__u32, inttype) __field(__u32, parm) __field(__u64, parm64) - __field(int, who) ), TP_fast_assign( @@ -154,15 +162,12 @@ TRACE_EVENT(kvm_s390_inject_vcpu, __entry->inttype = type & 0x00000000ffffffff; __entry->parm = parm; __entry->parm64 = parm64; - __entry->who = who; ), - TP_printk("inject%s (vcpu %d): type:%x (%s) parm:%x parm64:%llx", - (__entry->who == 1) ? " (from kernel)" : - (__entry->who == 2) ? " (from user)" : "", + TP_printk("inject (vcpu %d): type:%x (%s) parm:%x parm64:%llx", __entry->id, __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), - __entry->parm, __entry->parm64) + get_irq_name(__entry->inttype), __entry->parm, + __entry->parm64) ); /* @@ -189,8 +194,8 @@ TRACE_EVENT(kvm_s390_deliver_interrupt, TP_printk("deliver interrupt (vcpu %d): type:%x (%s) " \ "data:%08llx %016llx", __entry->id, __entry->inttype, - __print_symbolic(__entry->inttype, kvm_s390_int_type), - __entry->data0, __entry->data1) + get_irq_name(__entry->inttype), __entry->data0, + __entry->data1) ); /* diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c index 16dc42d83f93..246a7eb4b680 100644 --- a/arch/s390/lib/delay.c +++ b/arch/s390/lib/delay.c @@ -26,6 +26,7 @@ void __delay(unsigned long loops) */ asm volatile("0: brct %0,0b" : : "d" ((loops/2) + 1)); } +EXPORT_SYMBOL(__delay); static void __udelay_disabled(unsigned long long usecs) { diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index 93cb1d09493d..ae4de559e3a0 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -370,22 +370,9 @@ long __strncpy_from_user(char *dst, const char __user *src, long size) } EXPORT_SYMBOL(__strncpy_from_user); -/* - * The "old" uaccess variant without mvcos can be enforced with the - * uaccess_primary kernel parameter. This is mainly for debugging purposes. - */ -static int uaccess_primary __initdata; - -static int __init parse_uaccess_pt(char *__unused) -{ - uaccess_primary = 1; - return 0; -} -early_param("uaccess_primary", parse_uaccess_pt); - static int __init uaccess_init(void) { - if (!uaccess_primary && test_facility(27)) + if (test_facility(27)) static_branch_enable(&have_mvcos); return 0; } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 4c8f5d7f9c23..f985856a538b 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -646,7 +646,7 @@ static void pfault_interrupt(struct ext_code ext_code, return; inc_irq_stat(IRQEXT_PFL); /* Get the token (= pid of the affected task). */ - pid = sizeof(void *) == 4 ? param32 : param64; + pid = param64; rcu_read_lock(); tsk = find_task_by_pid_ns(pid, &init_pid_ns); if (tsk) diff --git a/arch/s390/mm/gup.c b/arch/s390/mm/gup.c index 1eb41bb3010c..12bbf0e8478f 100644 --- a/arch/s390/mm/gup.c +++ b/arch/s390/mm/gup.c @@ -30,6 +30,9 @@ static inline int gup_pte_range(pmd_t *pmdp, pmd_t pmd, unsigned long addr, do { pte = *ptep; barrier(); + /* Similar to the PMD case, NUMA hinting must take slow path */ + if (pte_protnone(pte)) + return 0; if ((pte_val(pte) & mask) != 0) return 0; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -125,6 +128,13 @@ static inline int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; if (unlikely(pmd_large(pmd))) { + /* + * NUMA hinting faults need to be handled in the GUP + * slowpath for accounting purposes and so that they + * can be serialised against THP migration. + */ + if (pmd_protnone(pmd)) + return 0; if (!gup_huge_pmd(pmdp, pmd, addr, next, write, pages, nr)) return 0; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 76e873748b56..2963b563621c 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -27,6 +27,7 @@ #include <linux/initrd.h> #include <linux/export.h> #include <linux/gfp.h> +#include <linux/memblock.h> #include <asm/processor.h> #include <asm/uaccess.h> #include <asm/pgtable.h> @@ -138,7 +139,7 @@ void __init mem_init(void) cpumask_set_cpu(0, mm_cpumask(&init_mm)); atomic_set(&init_mm.context.attach_count, 1); - max_mapnr = max_low_pfn; + set_max_mapnr(max_low_pfn); high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); /* Setup guest page hinting */ @@ -170,37 +171,36 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) #ifdef CONFIG_MEMORY_HOTPLUG int arch_add_memory(int nid, u64 start, u64 size) { - unsigned long zone_start_pfn, zone_end_pfn, nr_pages; + unsigned long normal_end_pfn = PFN_DOWN(memblock_end_of_DRAM()); + unsigned long dma_end_pfn = PFN_DOWN(MAX_DMA_ADDRESS); unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - struct zone *zone; - int rc; + unsigned long nr_pages; + int rc, zone_enum; rc = vmem_add_mapping(start, size); if (rc) return rc; - for_each_zone(zone) { - if (zone_idx(zone) != ZONE_MOVABLE) { - /* Add range within existing zone limits */ - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone->zone_start_pfn + - zone->spanned_pages; + + while (size_pages > 0) { + if (start_pfn < dma_end_pfn) { + nr_pages = (start_pfn + size_pages > dma_end_pfn) ? + dma_end_pfn - start_pfn : size_pages; + zone_enum = ZONE_DMA; + } else if (start_pfn < normal_end_pfn) { + nr_pages = (start_pfn + size_pages > normal_end_pfn) ? + normal_end_pfn - start_pfn : size_pages; + zone_enum = ZONE_NORMAL; } else { - /* Add remaining range to ZONE_MOVABLE */ - zone_start_pfn = start_pfn; - zone_end_pfn = start_pfn + size_pages; + nr_pages = size_pages; + zone_enum = ZONE_MOVABLE; } - if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) - continue; - nr_pages = (start_pfn + size_pages > zone_end_pfn) ? - zone_end_pfn - start_pfn : size_pages; - rc = __add_pages(nid, zone, start_pfn, nr_pages); + rc = __add_pages(nid, NODE_DATA(nid)->node_zones + zone_enum, + start_pfn, size_pages); if (rc) break; start_pfn += nr_pages; size_pages -= nr_pages; - if (!size_pages) - break; } if (rc) vmem_remove_mapping(start, size); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index b33f66110ca9..54ef3bc01b43 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -10,11 +10,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/smp.h> -#include <linux/highmem.h> -#include <linux/pagemap.h> #include <linux/spinlock.h> -#include <linux/module.h> -#include <linux/quicklist.h> #include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/swapops.h> @@ -28,12 +24,9 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> -#define ALLOC_ORDER 2 -#define FRAG_MASK 0x03 - unsigned long *crst_table_alloc(struct mm_struct *mm) { - struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); + struct page *page = alloc_pages(GFP_KERNEL, 2); if (!page) return NULL; @@ -42,7 +35,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) void crst_table_free(struct mm_struct *mm, unsigned long *table) { - free_pages((unsigned long) table, ALLOC_ORDER); + free_pages((unsigned long) table, 2); } static void __crst_table_upgrade(void *arg) @@ -176,7 +169,7 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit) INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC); spin_lock_init(&gmap->guest_table_lock); gmap->mm = mm; - page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); + page = alloc_pages(GFP_KERNEL, 2); if (!page) goto out_free; page->index = 0; @@ -247,7 +240,7 @@ void gmap_free(struct gmap *gmap) /* Free all segment & region tables. */ list_for_each_entry_safe(page, next, &gmap->crst_list, lru) - __free_pages(page, ALLOC_ORDER); + __free_pages(page, 2); gmap_radix_tree_free(&gmap->guest_to_host); gmap_radix_tree_free(&gmap->host_to_guest); down_write(&gmap->mm->mmap_sem); @@ -287,7 +280,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, unsigned long *new; /* since we dont free the gmap table until gmap_free we can unlock */ - page = alloc_pages(GFP_KERNEL, ALLOC_ORDER); + page = alloc_pages(GFP_KERNEL, 2); if (!page) return -ENOMEM; new = (unsigned long *) page_to_phys(page); @@ -302,7 +295,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table, } spin_unlock(&gmap->mm->page_table_lock); if (page) - __free_pages(page, ALLOC_ORDER); + __free_pages(page, 2); return 0; } @@ -795,40 +788,6 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte) } EXPORT_SYMBOL_GPL(gmap_do_ipte_notify); -static inline int page_table_with_pgste(struct page *page) -{ - return atomic_read(&page->_mapcount) == 0; -} - -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm) -{ - struct page *page; - unsigned long *table; - - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; - } - atomic_set(&page->_mapcount, 0); - table = (unsigned long *) page_to_phys(page); - clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); - clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); - return table; -} - -static inline void page_table_free_pgste(unsigned long *table) -{ - struct page *page; - - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); -} - int set_guest_storage_key(struct mm_struct *mm, unsigned long addr, unsigned long key, bool nq) { @@ -957,20 +916,6 @@ __initcall(page_table_register_sysctl); #else /* CONFIG_PGSTE */ -static inline int page_table_with_pgste(struct page *page) -{ - return 0; -} - -static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm) -{ - return NULL; -} - -static inline void page_table_free_pgste(unsigned long *table) -{ -} - static inline void gmap_unlink(struct mm_struct *mm, unsigned long *table, unsigned long vmaddr) { @@ -994,44 +939,55 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits) */ unsigned long *page_table_alloc(struct mm_struct *mm) { - unsigned long *uninitialized_var(table); - struct page *uninitialized_var(page); + unsigned long *table; + struct page *page; unsigned int mask, bit; - if (mm_alloc_pgste(mm)) - return page_table_alloc_pgste(mm); - /* Allocate fragments of a 4K page as 1K/2K page table */ - spin_lock_bh(&mm->context.list_lock); - mask = FRAG_MASK; - if (!list_empty(&mm->context.pgtable_list)) { - page = list_first_entry(&mm->context.pgtable_list, - struct page, lru); - table = (unsigned long *) page_to_phys(page); - mask = atomic_read(&page->_mapcount); - mask = mask | (mask >> 4); - } - if ((mask & FRAG_MASK) == FRAG_MASK) { - spin_unlock_bh(&mm->context.list_lock); - page = alloc_page(GFP_KERNEL|__GFP_REPEAT); - if (!page) - return NULL; - if (!pgtable_page_ctor(page)) { - __free_page(page); - return NULL; + /* Try to get a fragment of a 4K page as a 2K page table */ + if (!mm_alloc_pgste(mm)) { + table = NULL; + spin_lock_bh(&mm->context.list_lock); + if (!list_empty(&mm->context.pgtable_list)) { + page = list_first_entry(&mm->context.pgtable_list, + struct page, lru); + mask = atomic_read(&page->_mapcount); + mask = (mask | (mask >> 4)) & 3; + if (mask != 3) { + table = (unsigned long *) page_to_phys(page); + bit = mask & 1; /* =1 -> second 2K */ + if (bit) + table += PTRS_PER_PTE; + atomic_xor_bits(&page->_mapcount, 1U << bit); + list_del(&page->lru); + } } + spin_unlock_bh(&mm->context.list_lock); + if (table) + return table; + } + /* Allocate a fresh page */ + page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (!page) + return NULL; + if (!pgtable_page_ctor(page)) { + __free_page(page); + return NULL; + } + /* Initialize page table */ + table = (unsigned long *) page_to_phys(page); + if (mm_alloc_pgste(mm)) { + /* Return 4K page table with PGSTEs */ + atomic_set(&page->_mapcount, 3); + clear_table(table, _PAGE_INVALID, PAGE_SIZE/2); + clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2); + } else { + /* Return the first 2K fragment of the page */ atomic_set(&page->_mapcount, 1); - table = (unsigned long *) page_to_phys(page); clear_table(table, _PAGE_INVALID, PAGE_SIZE); spin_lock_bh(&mm->context.list_lock); list_add(&page->lru, &mm->context.pgtable_list); - } else { - for (bit = 1; mask & bit; bit <<= 1) - table += PTRS_PER_PTE; - mask = atomic_xor_bits(&page->_mapcount, bit); - if ((mask & FRAG_MASK) == FRAG_MASK) - list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); } - spin_unlock_bh(&mm->context.list_lock); return table; } @@ -1041,37 +997,23 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) unsigned int bit, mask; page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (page_table_with_pgste(page)) - return page_table_free_pgste(table); - /* Free 1K/2K page table fragment of a 4K page */ - bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t))); - spin_lock_bh(&mm->context.list_lock); - if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) - list_del(&page->lru); - mask = atomic_xor_bits(&page->_mapcount, bit); - if (mask & FRAG_MASK) - list_add(&page->lru, &mm->context.pgtable_list); - spin_unlock_bh(&mm->context.list_lock); - if (mask == 0) { - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); + if (!mm_alloc_pgste(mm)) { + /* Free 2K page table fragment of a 4K page */ + bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + spin_lock_bh(&mm->context.list_lock); + mask = atomic_xor_bits(&page->_mapcount, 1U << bit); + if (mask & 3) + list_add(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); + spin_unlock_bh(&mm->context.list_lock); + if (mask != 0) + return; } -} - -static void __page_table_free_rcu(void *table, unsigned bit) -{ - struct page *page; - if (bit == FRAG_MASK) - return page_table_free_pgste(table); - /* Free 1K/2K page table fragment of a 4K page */ - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (atomic_xor_bits(&page->_mapcount, bit) == 0) { - pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); - __free_page(page); - } + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); } void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, @@ -1083,34 +1025,45 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, mm = tlb->mm; page = pfn_to_page(__pa(table) >> PAGE_SHIFT); - if (page_table_with_pgste(page)) { + if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) (__pa(table) | FRAG_MASK); + table = (unsigned long *) (__pa(table) | 3); tlb_remove_table(tlb, table); return; } - bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t))); + bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.list_lock); - if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK) - list_del(&page->lru); - mask = atomic_xor_bits(&page->_mapcount, bit | (bit << 4)); - if (mask & FRAG_MASK) + mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); + if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); + else + list_del(&page->lru); spin_unlock_bh(&mm->context.list_lock); - table = (unsigned long *) (__pa(table) | (bit << 4)); + table = (unsigned long *) (__pa(table) | (1U << bit)); tlb_remove_table(tlb, table); } static void __tlb_remove_table(void *_table) { - const unsigned long mask = (FRAG_MASK << 4) | FRAG_MASK; - void *table = (void *)((unsigned long) _table & ~mask); - unsigned type = (unsigned long) _table & mask; - - if (type) - __page_table_free_rcu(table, type); - else - free_pages((unsigned long) table, ALLOC_ORDER); + unsigned int mask = (unsigned long) _table & 3; + void *table = (void *)((unsigned long) _table ^ mask); + struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + + switch (mask) { + case 0: /* pmd or pud */ + free_pages((unsigned long) table, 2); + break; + case 1: /* lower 2K of a 4K page table */ + case 2: /* higher 2K of a 4K page table */ + if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) + break; + /* fallthrough */ + case 3: /* 4K page table with pgstes */ + pgtable_page_dtor(page); + atomic_set(&page->_mapcount, -1); + __free_page(page); + break; + } } static void tlb_remove_table_smp_sync(void *arg) diff --git a/arch/s390/net/bpf_jit.h b/arch/s390/net/bpf_jit.h index f6498eec9ee1..f010c93a88b1 100644 --- a/arch/s390/net/bpf_jit.h +++ b/arch/s390/net/bpf_jit.h @@ -36,6 +36,8 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; * | BPF stack | | * | | | * +---------------+ | + * | 8 byte skbp | | + * R15+170 -> +---------------+ | * | 8 byte hlen | | * R15+168 -> +---------------+ | * | 4 byte align | | @@ -51,11 +53,12 @@ extern u8 sk_load_word[], sk_load_half[], sk_load_byte[]; * We get 160 bytes stack space from calling function, but only use * 12 * 8 byte for old backchain, r15..r6, and tail_call_cnt. */ -#define STK_SPACE (MAX_BPF_STACK + 8 + 4 + 4 + 160) +#define STK_SPACE (MAX_BPF_STACK + 8 + 8 + 4 + 4 + 160) #define STK_160_UNUSED (160 - 12 * 8) #define STK_OFF (STK_SPACE - STK_160_UNUSED) #define STK_OFF_TMP 160 /* Offset of tmp buffer on stack */ #define STK_OFF_HLEN 168 /* Offset of SKB header length on stack */ +#define STK_OFF_SKBP 170 /* Offset of SKB pointer on stack */ #define STK_OFF_R6 (160 - 11 * 8) /* Offset of r6 on stack */ #define STK_OFF_TCCNT (160 - 12 * 8) /* Offset of tail_call_cnt on stack */ diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 8d2e5165865f..eeda051442c3 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -45,7 +45,7 @@ struct bpf_jit { int labels[1]; /* Labels for local jumps */ }; -#define BPF_SIZE_MAX 4096 /* Max size for program */ +#define BPF_SIZE_MAX 0x7ffff /* Max size for program (20 bit signed displ) */ #define SEEN_SKB 1 /* skb access */ #define SEEN_MEM 2 /* use mem[] for temporary storage */ @@ -53,6 +53,7 @@ struct bpf_jit { #define SEEN_LITERAL 8 /* code uses literals */ #define SEEN_FUNC 16 /* calls C functions */ #define SEEN_TAIL_CALL 32 /* code uses tail calls */ +#define SEEN_SKB_CHANGE 64 /* code changes skb data */ #define SEEN_STACK (SEEN_FUNC | SEEN_MEM | SEEN_SKB) /* @@ -203,19 +204,11 @@ static inline void reg_set_seen(struct bpf_jit *jit, u32 b1) _EMIT6(op1 | __disp, op2); \ }) -#define EMIT6_DISP(op1, op2, b1, b2, b3, disp) \ -({ \ - _EMIT6_DISP(op1 | reg(b1, b2) << 16 | \ - reg_high(b3) << 8, op2, disp); \ - REG_SET_SEEN(b1); \ - REG_SET_SEEN(b2); \ - REG_SET_SEEN(b3); \ -}) - #define _EMIT6_DISP_LH(op1, op2, disp) \ ({ \ - unsigned int __disp_h = ((u32)disp) & 0xff000; \ - unsigned int __disp_l = ((u32)disp) & 0x00fff; \ + u32 _disp = (u32) disp; \ + unsigned int __disp_h = _disp & 0xff000; \ + unsigned int __disp_l = _disp & 0x00fff; \ _EMIT6(op1 | __disp_l, op2 | __disp_h >> 4); \ }) @@ -390,12 +383,32 @@ static void save_restore_regs(struct bpf_jit *jit, int op) } /* + * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S" + * we store the SKB header length on the stack and the SKB data + * pointer in REG_SKB_DATA. + */ +static void emit_load_skb_data_hlen(struct bpf_jit *jit) +{ + /* Header length: llgf %w1,<len>(%b1) */ + EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_1, + offsetof(struct sk_buff, len)); + /* s %w1,<data_len>(%b1) */ + EMIT4_DISP(0x5b000000, REG_W1, BPF_REG_1, + offsetof(struct sk_buff, data_len)); + /* stg %w1,ST_OFF_HLEN(%r0,%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, STK_OFF_HLEN); + /* lg %skb_data,data_off(%b1) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, + BPF_REG_1, offsetof(struct sk_buff, data)); +} + +/* * Emit function prologue * * Save registers and create stack frame if necessary. * See stack frame layout desription in "bpf_jit.h"! */ -static void bpf_jit_prologue(struct bpf_jit *jit) +static void bpf_jit_prologue(struct bpf_jit *jit, bool is_classic) { if (jit->seen & SEEN_TAIL_CALL) { /* xc STK_OFF_TCCNT(4,%r15),STK_OFF_TCCNT(%r15) */ @@ -429,32 +442,21 @@ static void bpf_jit_prologue(struct bpf_jit *jit) EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, 152); } - /* - * For SKB access %b1 contains the SKB pointer. For "bpf_jit.S" - * we store the SKB header length on the stack and the SKB data - * pointer in REG_SKB_DATA. - */ - if (jit->seen & SEEN_SKB) { - /* Header length: llgf %w1,<len>(%b1) */ - EMIT6_DISP_LH(0xe3000000, 0x0016, REG_W1, REG_0, BPF_REG_1, - offsetof(struct sk_buff, len)); - /* s %w1,<data_len>(%b1) */ - EMIT4_DISP(0x5b000000, REG_W1, BPF_REG_1, - offsetof(struct sk_buff, data_len)); - /* stg %w1,ST_OFF_HLEN(%r0,%r15) */ + if (jit->seen & SEEN_SKB) + emit_load_skb_data_hlen(jit); + if (jit->seen & SEEN_SKB_CHANGE) + /* stg %b1,ST_OFF_SKBP(%r0,%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0, REG_15, - STK_OFF_HLEN); - /* lg %skb_data,data_off(%b1) */ - EMIT6_DISP_LH(0xe3000000, 0x0004, REG_SKB_DATA, REG_0, - BPF_REG_1, offsetof(struct sk_buff, data)); + STK_OFF_SKBP); + /* Clear A (%b0) and X (%b7) registers for converted BPF programs */ + if (is_classic) { + if (REG_SEEN(BPF_REG_A)) + /* lghi %ba,0 */ + EMIT4_IMM(0xa7090000, BPF_REG_A, 0); + if (REG_SEEN(BPF_REG_X)) + /* lghi %bx,0 */ + EMIT4_IMM(0xa7090000, BPF_REG_X, 0); } - /* BPF compatibility: clear A (%b0) and X (%b7) registers */ - if (REG_SEEN(BPF_REG_A)) - /* lghi %ba,0 */ - EMIT4_IMM(0xa7090000, BPF_REG_A, 0); - if (REG_SEEN(BPF_REG_X)) - /* lghi %bx,0 */ - EMIT4_IMM(0xa7090000, BPF_REG_X, 0); } /* @@ -976,12 +978,19 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i REG_SET_SEEN(BPF_REG_5); jit->seen |= SEEN_FUNC; /* lg %w1,<d(imm)>(%l) */ - EMIT6_DISP(0xe3000000, 0x0004, REG_W1, REG_0, REG_L, - EMIT_CONST_U64(func)); + EMIT6_DISP_LH(0xe3000000, 0x0004, REG_W1, REG_0, REG_L, + EMIT_CONST_U64(func)); /* basr %r14,%w1 */ EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); + if (bpf_helper_changes_skb_data((void *)func)) { + jit->seen |= SEEN_SKB_CHANGE; + /* lg %b1,ST_OFF_SKBP(%r15) */ + EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, + REG_15, STK_OFF_SKBP); + emit_load_skb_data_hlen(jit); + } break; } case BPF_JMP | BPF_CALL | BPF_X: @@ -1023,7 +1032,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i MAX_TAIL_CALL_CNT, 0, 0x2); /* - * prog = array->prog[index]; + * prog = array->ptrs[index]; * if (prog == NULL) * goto out; */ @@ -1032,7 +1041,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT6_DISP_LH(0xeb000000, 0x000d, REG_1, BPF_REG_3, REG_0, 3); /* lg %r1,prog(%b2,%r1) */ EMIT6_DISP_LH(0xe3000000, 0x0004, REG_1, BPF_REG_2, - REG_1, offsetof(struct bpf_array, prog)); + REG_1, offsetof(struct bpf_array, ptrs)); /* clgij %r1,0,0x8,label0 */ EMIT6_PCREL_IMM_LABEL(0xec000000, 0x007d, REG_1, 0, 0, 0x8); @@ -1236,7 +1245,7 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) jit->lit = jit->lit_start; jit->prg = 0; - bpf_jit_prologue(jit); + bpf_jit_prologue(jit, bpf_prog_was_classic(fp)); for (i = 0; i < fp->len; i += insn_count) { insn_count = bpf_jit_insn(jit, fp, i); if (insn_count < 0) diff --git a/arch/s390/numa/Makefile b/arch/s390/numa/Makefile new file mode 100644 index 000000000000..f94ecaffa71b --- /dev/null +++ b/arch/s390/numa/Makefile @@ -0,0 +1,3 @@ +obj-y += numa.o +obj-y += toptree.o +obj-$(CONFIG_NUMA_EMU) += mode_emu.o diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c new file mode 100644 index 000000000000..7de4e2f780d7 --- /dev/null +++ b/arch/s390/numa/mode_emu.c @@ -0,0 +1,530 @@ +/* + * NUMA support for s390 + * + * NUMA emulation (aka fake NUMA) distributes the available memory to nodes + * without using real topology information about the physical memory of the + * machine. + * + * It distributes the available CPUs to nodes while respecting the original + * machine topology information. This is done by trying to avoid to separate + * CPUs which reside on the same book or even on the same MC. + * + * Because the current Linux scheduler code requires a stable cpu to node + * mapping, cores are pinned to nodes when the first CPU thread is set online. + * + * Copyright IBM Corp. 2015 + */ + +#define KMSG_COMPONENT "numa_emu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/cpumask.h> +#include <linux/memblock.h> +#include <linux/node.h> +#include <linux/memory.h> +#include <linux/slab.h> +#include <asm/smp.h> +#include <asm/topology.h> +#include "numa_mode.h" +#include "toptree.h" + +/* Distances between the different system components */ +#define DIST_EMPTY 0 +#define DIST_CORE 1 +#define DIST_MC 2 +#define DIST_BOOK 3 +#define DIST_MAX 4 + +/* Node distance reported to common code */ +#define EMU_NODE_DIST 10 + +/* Node ID for free (not yet pinned) cores */ +#define NODE_ID_FREE -1 + +/* Different levels of toptree */ +enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY}; + +/* The two toptree IDs */ +enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA}; + +/* Number of NUMA nodes */ +static int emu_nodes = 1; +/* NUMA stripe size */ +static unsigned long emu_size; + +/* + * Node to core pinning information updates are protected by + * "sched_domains_mutex". + */ +static struct { + s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */ + int total; /* Total number of pinned cores */ + int per_node_target; /* Cores per node without extra cores */ + int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */ +} *emu_cores; + +/* + * Pin a core to a node + */ +static void pin_core_to_node(int core_id, int node_id) +{ + if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) { + emu_cores->per_node[node_id]++; + emu_cores->to_node_id[core_id] = node_id; + emu_cores->total++; + } else { + WARN_ON(emu_cores->to_node_id[core_id] != node_id); + } +} + +/* + * Number of pinned cores of a node + */ +static int cores_pinned(struct toptree *node) +{ + return emu_cores->per_node[node->id]; +} + +/* + * ID of the node where the core is pinned (or NODE_ID_FREE) + */ +static int core_pinned_to_node_id(struct toptree *core) +{ + return emu_cores->to_node_id[core->id]; +} + +/* + * Number of cores in the tree that are not yet pinned + */ +static int cores_free(struct toptree *tree) +{ + struct toptree *core; + int count = 0; + + toptree_for_each(core, tree, CORE) { + if (core_pinned_to_node_id(core) == NODE_ID_FREE) + count++; + } + return count; +} + +/* + * Return node of core + */ +static struct toptree *core_node(struct toptree *core) +{ + return core->parent->parent->parent; +} + +/* + * Return book of core + */ +static struct toptree *core_book(struct toptree *core) +{ + return core->parent->parent; +} + +/* + * Return mc of core + */ +static struct toptree *core_mc(struct toptree *core) +{ + return core->parent; +} + +/* + * Distance between two cores + */ +static int dist_core_to_core(struct toptree *core1, struct toptree *core2) +{ + if (core_book(core1)->id != core_book(core2)->id) + return DIST_BOOK; + if (core_mc(core1)->id != core_mc(core2)->id) + return DIST_MC; + /* Same core or sibling on same MC */ + return DIST_CORE; +} + +/* + * Distance of a node to a core + */ +static int dist_node_to_core(struct toptree *node, struct toptree *core) +{ + struct toptree *core_node; + int dist_min = DIST_MAX; + + toptree_for_each(core_node, node, CORE) + dist_min = min(dist_min, dist_core_to_core(core_node, core)); + return dist_min == DIST_MAX ? DIST_EMPTY : dist_min; +} + +/* + * Unify will delete empty nodes, therefore recreate nodes. + */ +static void toptree_unify_tree(struct toptree *tree) +{ + int nid; + + toptree_unify(tree); + for (nid = 0; nid < emu_nodes; nid++) + toptree_get_child(tree, nid); +} + +/* + * Find the best/nearest node for a given core and ensure that no node + * gets more than "emu_cores->per_node_target + extra" cores. + */ +static struct toptree *node_for_core(struct toptree *numa, struct toptree *core, + int extra) +{ + struct toptree *node, *node_best = NULL; + int dist_cur, dist_best, cores_target; + + cores_target = emu_cores->per_node_target + extra; + dist_best = DIST_MAX; + node_best = NULL; + toptree_for_each(node, numa, NODE) { + /* Already pinned cores must use their nodes */ + if (core_pinned_to_node_id(core) == node->id) { + node_best = node; + break; + } + /* Skip nodes that already have enough cores */ + if (cores_pinned(node) >= cores_target) + continue; + dist_cur = dist_node_to_core(node, core); + if (dist_cur < dist_best) { + dist_best = dist_cur; + node_best = node; + } + } + return node_best; +} + +/* + * Find the best node for each core with respect to "extra" core count + */ +static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys, + int extra) +{ + struct toptree *node, *core, *tmp; + + toptree_for_each_safe(core, tmp, phys, CORE) { + node = node_for_core(numa, core, extra); + if (!node) + return; + toptree_move(core, node); + pin_core_to_node(core->id, node->id); + } +} + +/* + * Move structures of given level to specified NUMA node + */ +static void move_level_to_numa_node(struct toptree *node, struct toptree *phys, + enum toptree_level level, bool perfect) +{ + int cores_free, cores_target = emu_cores->per_node_target; + struct toptree *cur, *tmp; + + toptree_for_each_safe(cur, tmp, phys, level) { + cores_free = cores_target - toptree_count(node, CORE); + if (perfect) { + if (cores_free == toptree_count(cur, CORE)) + toptree_move(cur, node); + } else { + if (cores_free >= toptree_count(cur, CORE)) + toptree_move(cur, node); + } + } +} + +/* + * Move structures of a given level to NUMA nodes. If "perfect" is specified + * move only perfectly fitting structures. Otherwise move also smaller + * than needed structures. + */ +static void move_level_to_numa(struct toptree *numa, struct toptree *phys, + enum toptree_level level, bool perfect) +{ + struct toptree *node; + + toptree_for_each(node, numa, NODE) + move_level_to_numa_node(node, phys, level, perfect); +} + +/* + * For the first run try to move the big structures + */ +static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys) +{ + struct toptree *core; + + /* Always try to move perfectly fitting structures first */ + move_level_to_numa(numa, phys, BOOK, true); + move_level_to_numa(numa, phys, BOOK, false); + move_level_to_numa(numa, phys, MC, true); + move_level_to_numa(numa, phys, MC, false); + /* Now pin all the moved cores */ + toptree_for_each(core, numa, CORE) + pin_core_to_node(core->id, core_node(core)->id); +} + +/* + * Allocate new topology and create required nodes + */ +static struct toptree *toptree_new(int id, int nodes) +{ + struct toptree *tree; + int nid; + + tree = toptree_alloc(TOPOLOGY, id); + if (!tree) + goto fail; + for (nid = 0; nid < nodes; nid++) { + if (!toptree_get_child(tree, nid)) + goto fail; + } + return tree; +fail: + panic("NUMA emulation could not allocate topology"); +} + +/* + * Allocate and initialize core to node mapping + */ +static void create_core_to_node_map(void) +{ + int i; + + emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL); + if (emu_cores == NULL) + panic("Could not allocate cores to node memory"); + for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) + emu_cores->to_node_id[i] = NODE_ID_FREE; +} + +/* + * Move cores from physical topology into NUMA target topology + * and try to keep as much of the physical topology as possible. + */ +static struct toptree *toptree_to_numa(struct toptree *phys) +{ + static int first = 1; + struct toptree *numa; + int cores_total; + + cores_total = emu_cores->total + cores_free(phys); + emu_cores->per_node_target = cores_total / emu_nodes; + numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes); + if (first) { + toptree_to_numa_first(numa, phys); + first = 0; + } + toptree_to_numa_single(numa, phys, 0); + toptree_to_numa_single(numa, phys, 1); + toptree_unify_tree(numa); + + WARN_ON(cpumask_weight(&phys->mask)); + return numa; +} + +/* + * Create a toptree out of the physical topology that we got from the hypervisor + */ +static struct toptree *toptree_from_topology(void) +{ + struct toptree *phys, *node, *book, *mc, *core; + struct cpu_topology_s390 *top; + int cpu; + + phys = toptree_new(TOPTREE_ID_PHYS, 1); + + for_each_online_cpu(cpu) { + top = &per_cpu(cpu_topology, cpu); + node = toptree_get_child(phys, 0); + book = toptree_get_child(node, top->book_id); + mc = toptree_get_child(book, top->socket_id); + core = toptree_get_child(mc, top->core_id); + if (!book || !mc || !core) + panic("NUMA emulation could not allocate memory"); + cpumask_set_cpu(cpu, &core->mask); + toptree_update_mask(mc); + } + return phys; +} + +/* + * Add toptree core to topology and create correct CPU masks + */ +static void topology_add_core(struct toptree *core) +{ + struct cpu_topology_s390 *top; + int cpu; + + for_each_cpu(cpu, &core->mask) { + top = &per_cpu(cpu_topology, cpu); + cpumask_copy(&top->thread_mask, &core->mask); + cpumask_copy(&top->core_mask, &core_mc(core)->mask); + cpumask_copy(&top->book_mask, &core_book(core)->mask); + cpumask_set_cpu(cpu, node_to_cpumask_map[core_node(core)->id]); + top->node_id = core_node(core)->id; + } +} + +/* + * Apply toptree to topology and create CPU masks + */ +static void toptree_to_topology(struct toptree *numa) +{ + struct toptree *core; + int i; + + /* Clear all node masks */ + for (i = 0; i < MAX_NUMNODES; i++) + cpumask_clear(node_to_cpumask_map[i]); + + /* Rebuild all masks */ + toptree_for_each(core, numa, CORE) + topology_add_core(core); +} + +/* + * Show the node to core mapping + */ +static void print_node_to_core_map(void) +{ + int nid, cid; + + if (!numa_debug_enabled) + return; + printk(KERN_DEBUG "NUMA node to core mapping\n"); + for (nid = 0; nid < emu_nodes; nid++) { + printk(KERN_DEBUG " node %3d: ", nid); + for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) { + if (emu_cores->to_node_id[cid] == nid) + printk(KERN_CONT "%d ", cid); + } + printk(KERN_CONT "\n"); + } +} + +/* + * Transfer physical topology into a NUMA topology and modify CPU masks + * according to the NUMA topology. + * + * Must be called with "sched_domains_mutex" lock held. + */ +static void emu_update_cpu_topology(void) +{ + struct toptree *phys, *numa; + + if (emu_cores == NULL) + create_core_to_node_map(); + phys = toptree_from_topology(); + numa = toptree_to_numa(phys); + toptree_free(phys); + toptree_to_topology(numa); + toptree_free(numa); + print_node_to_core_map(); +} + +/* + * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum + * alignment (needed for memory hotplug). + */ +static unsigned long emu_setup_size_adjust(unsigned long size) +{ + size = size ? : CONFIG_EMU_SIZE; + size = roundup(size, memory_block_size_bytes()); + return size; +} + +/* + * If we have not enough memory for the specified nodes, reduce the node count. + */ +static int emu_setup_nodes_adjust(int nodes) +{ + int nodes_max; + + nodes_max = memblock.memory.total_size / emu_size; + nodes_max = max(nodes_max, 1); + if (nodes_max >= nodes) + return nodes; + pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes); + return nodes_max; +} + +/* + * Early emu setup + */ +static void emu_setup(void) +{ + emu_size = emu_setup_size_adjust(emu_size); + emu_nodes = emu_setup_nodes_adjust(emu_nodes); + pr_info("Creating %d nodes with memory stripe size %ld MB\n", + emu_nodes, emu_size >> 20); +} + +/* + * Return node id for given page number + */ +static int emu_pfn_to_nid(unsigned long pfn) +{ + return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes; +} + +/* + * Return stripe size + */ +static unsigned long emu_align(void) +{ + return emu_size; +} + +/* + * Return distance between two nodes + */ +static int emu_distance(int node1, int node2) +{ + return (node1 != node2) * EMU_NODE_DIST; +} + +/* + * Define callbacks for generic s390 NUMA infrastructure + */ +const struct numa_mode numa_mode_emu = { + .name = "emu", + .setup = emu_setup, + .update_cpu_topology = emu_update_cpu_topology, + .__pfn_to_nid = emu_pfn_to_nid, + .align = emu_align, + .distance = emu_distance, +}; + +/* + * Kernel parameter: emu_nodes=<n> + */ +static int __init early_parse_emu_nodes(char *p) +{ + int count; + + if (kstrtoint(p, 0, &count) != 0 || count <= 0) + return 0; + if (count <= 0) + return 0; + emu_nodes = min(count, MAX_NUMNODES); + return 0; +} +early_param("emu_nodes", early_parse_emu_nodes); + +/* + * Kernel parameter: emu_size=[<n>[k|M|G|T]] + */ +static int __init early_parse_emu_size(char *p) +{ + emu_size = memparse(p, NULL); + return 0; +} +early_param("emu_size", early_parse_emu_size); diff --git a/arch/s390/numa/numa.c b/arch/s390/numa/numa.c new file mode 100644 index 000000000000..09b1d2355bd9 --- /dev/null +++ b/arch/s390/numa/numa.c @@ -0,0 +1,184 @@ +/* + * NUMA support for s390 + * + * Implement NUMA core code. + * + * Copyright IBM Corp. 2015 + */ + +#define KMSG_COMPONENT "numa" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/mmzone.h> +#include <linux/cpumask.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/slab.h> +#include <linux/node.h> + +#include <asm/numa.h> +#include "numa_mode.h" + +pg_data_t *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +const struct numa_mode numa_mode_plain = { + .name = "plain", +}; + +static const struct numa_mode *mode = &numa_mode_plain; + +int numa_pfn_to_nid(unsigned long pfn) +{ + return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; +} + +void numa_update_cpu_topology(void) +{ + if (mode->update_cpu_topology) + mode->update_cpu_topology(); +} + +int __node_distance(int a, int b) +{ + return mode->distance ? mode->distance(a, b) : 0; +} + +int numa_debug_enabled; + +/* + * alloc_node_data() - Allocate node data + */ +static __init pg_data_t *alloc_node_data(void) +{ + pg_data_t *res; + + res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 1); + if (!res) + panic("Could not allocate memory for node data!\n"); + memset(res, 0, sizeof(pg_data_t)); + return res; +} + +/* + * numa_setup_memory() - Assign bootmem to nodes + * + * The memory is first added to memblock without any respect to nodes. + * This is fixed before remaining memblock memory is handed over to the + * buddy allocator. + * An important side effect is that large bootmem allocations might easily + * cross node boundaries, which can be needed for large allocations with + * smaller memory stripes in each node (i.e. when using NUMA emulation). + * + * Memory defines nodes: + * Therefore this routine also sets the nodes online with memory. + */ +static void __init numa_setup_memory(void) +{ + unsigned long cur_base, align, end_of_dram; + int nid = 0; + + end_of_dram = memblock_end_of_DRAM(); + align = mode->align ? mode->align() : ULONG_MAX; + + /* + * Step through all available memory and assign it to the nodes + * indicated by the mode implementation. + * All nodes which are seen here will be set online. + */ + cur_base = 0; + do { + nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); + node_set_online(nid); + memblock_set_node(cur_base, align, &memblock.memory, nid); + cur_base += align; + } while (cur_base < end_of_dram); + + /* Allocate and fill out node_data */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = alloc_node_data(); + + for_each_online_node(nid) { + unsigned long start_pfn, end_pfn; + unsigned long t_start, t_end; + int i; + + start_pfn = ULONG_MAX; + end_pfn = 0; + for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { + if (t_start < start_pfn) + start_pfn = t_start; + if (t_end > end_pfn) + end_pfn = t_end; + } + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; + NODE_DATA(nid)->node_id = nid; + } +} + +/* + * numa_setup() - Earliest initialization + * + * Assign the mode and call the mode's setup routine. + */ +void __init numa_setup(void) +{ + pr_info("NUMA mode: %s\n", mode->name); + if (mode->setup) + mode->setup(); + numa_setup_memory(); + memblock_dump_all(); +} + + +/* + * numa_init_early() - Initialization initcall + * + * This runs when only one CPU is online and before the first + * topology update is called for by the scheduler. + */ +static int __init numa_init_early(void) +{ + /* Attach all possible CPUs to node 0 for now. */ + cpumask_copy(node_to_cpumask_map[0], cpu_possible_mask); + return 0; +} +early_initcall(numa_init_early); + +/* + * numa_init_late() - Initialization initcall + * + * Register NUMA nodes. + */ +static int __init numa_init_late(void) +{ + int nid; + + for_each_online_node(nid) + register_one_node(nid); + return 0; +} +device_initcall(numa_init_late); + +static int __init parse_debug(char *parm) +{ + numa_debug_enabled = 1; + return 0; +} +early_param("numa_debug", parse_debug); + +static int __init parse_numa(char *parm) +{ + if (strcmp(parm, numa_mode_plain.name) == 0) + mode = &numa_mode_plain; +#ifdef CONFIG_NUMA_EMU + if (strcmp(parm, numa_mode_emu.name) == 0) + mode = &numa_mode_emu; +#endif + return 0; +} +early_param("numa", parse_numa); diff --git a/arch/s390/numa/numa_mode.h b/arch/s390/numa/numa_mode.h new file mode 100644 index 000000000000..08953b0b1c7f --- /dev/null +++ b/arch/s390/numa/numa_mode.h @@ -0,0 +1,24 @@ +/* + * NUMA support for s390 + * + * Define declarations used for communication between NUMA mode + * implementations and NUMA core functionality. + * + * Copyright IBM Corp. 2015 + */ +#ifndef __S390_NUMA_MODE_H +#define __S390_NUMA_MODE_H + +struct numa_mode { + char *name; /* Name of mode */ + void (*setup)(void); /* Initizalize mode */ + void (*update_cpu_topology)(void); /* Called by topology code */ + int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */ + unsigned long (*align)(void); /* Minimum node alignment */ + int (*distance)(int a, int b); /* Distance between two nodes */ +}; + +extern const struct numa_mode numa_mode_plain; +extern const struct numa_mode numa_mode_emu; + +#endif /* __S390_NUMA_MODE_H */ diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c new file mode 100644 index 000000000000..902d350d859a --- /dev/null +++ b/arch/s390/numa/toptree.c @@ -0,0 +1,342 @@ +/* + * NUMA support for s390 + * + * A tree structure used for machine topology mangling + * + * Copyright IBM Corp. 2015 + */ + +#include <linux/kernel.h> +#include <linux/cpumask.h> +#include <linux/list.h> +#include <linux/list_sort.h> +#include <linux/slab.h> +#include <asm/numa.h> + +#include "toptree.h" + +/** + * toptree_alloc - Allocate and initialize a new tree node. + * @level: The node's vertical level; level 0 contains the leaves. + * @id: ID number, explicitly not unique beyond scope of node's siblings + * + * Allocate a new tree node and initialize it. + * + * RETURNS: + * Pointer to the new tree node or NULL on error + */ +struct toptree *toptree_alloc(int level, int id) +{ + struct toptree *res = kzalloc(sizeof(struct toptree), GFP_KERNEL); + + if (!res) + return res; + + INIT_LIST_HEAD(&res->children); + INIT_LIST_HEAD(&res->sibling); + cpumask_clear(&res->mask); + res->level = level; + res->id = id; + return res; +} + +/** + * toptree_remove - Remove a tree node from a tree + * @cand: Pointer to the node to remove + * + * The node is detached from its parent node. The parent node's + * masks will be updated to reflect the loss of the child. + */ +static void toptree_remove(struct toptree *cand) +{ + struct toptree *oldparent; + + list_del_init(&cand->sibling); + oldparent = cand->parent; + cand->parent = NULL; + toptree_update_mask(oldparent); +} + +/** + * toptree_free - discard a tree node + * @cand: Pointer to the tree node to discard + * + * Checks if @cand is attached to a parent node. Detaches it + * cleanly using toptree_remove. Possible children are freed + * recursively. In the end @cand itself is freed. + */ +void toptree_free(struct toptree *cand) +{ + struct toptree *child, *tmp; + + if (cand->parent) + toptree_remove(cand); + toptree_for_each_child_safe(child, tmp, cand) + toptree_free(child); + kfree(cand); +} + +/** + * toptree_update_mask - Update node bitmasks + * @cand: Pointer to a tree node + * + * The node's cpumask will be updated by combining all children's + * masks. Then toptree_update_mask is called recursively for the + * parent if applicable. + * + * NOTE: + * This must not be called on leaves. If called on a leaf, its + * CPU mask is cleared and lost. + */ +void toptree_update_mask(struct toptree *cand) +{ + struct toptree *child; + + cpumask_clear(&cand->mask); + list_for_each_entry(child, &cand->children, sibling) + cpumask_or(&cand->mask, &cand->mask, &child->mask); + if (cand->parent) + toptree_update_mask(cand->parent); +} + +/** + * toptree_insert - Insert a tree node into tree + * @cand: Pointer to the node to insert + * @target: Pointer to the node to which @cand will added as a child + * + * Insert a tree node into a tree. Masks will be updated automatically. + * + * RETURNS: + * 0 on success, -1 if NULL is passed as argument or the node levels + * don't fit. + */ +static int toptree_insert(struct toptree *cand, struct toptree *target) +{ + if (!cand || !target) + return -1; + if (target->level != (cand->level + 1)) + return -1; + list_add_tail(&cand->sibling, &target->children); + cand->parent = target; + toptree_update_mask(target); + return 0; +} + +/** + * toptree_move_children - Move all child nodes of a node to a new place + * @cand: Pointer to the node whose children are to be moved + * @target: Pointer to the node to which @cand's children will be attached + * + * Take all child nodes of @cand and move them using toptree_move. + */ +static void toptree_move_children(struct toptree *cand, struct toptree *target) +{ + struct toptree *child, *tmp; + + toptree_for_each_child_safe(child, tmp, cand) + toptree_move(child, target); +} + +/** + * toptree_unify - Merge children with same ID + * @cand: Pointer to node whose direct children should be made unique + * + * When mangling the tree it is possible that a node has two or more children + * which have the same ID. This routine merges these children into one and + * moves all children of the merged nodes into the unified node. + */ +void toptree_unify(struct toptree *cand) +{ + struct toptree *child, *tmp, *cand_copy; + + /* Threads cannot be split, cores are not split */ + if (cand->level < 2) + return; + + cand_copy = toptree_alloc(cand->level, 0); + toptree_for_each_child_safe(child, tmp, cand) { + struct toptree *tmpchild; + + if (!cpumask_empty(&child->mask)) { + tmpchild = toptree_get_child(cand_copy, child->id); + toptree_move_children(child, tmpchild); + } + toptree_free(child); + } + toptree_move_children(cand_copy, cand); + toptree_free(cand_copy); + + toptree_for_each_child(child, cand) + toptree_unify(child); +} + +/** + * toptree_move - Move a node to another context + * @cand: Pointer to the node to move + * @target: Pointer to the node where @cand should go + * + * In the easiest case @cand is exactly on the level below @target + * and will be immediately moved to the target. + * + * If @target's level is not the direct parent level of @cand, + * nodes for the missing levels are created and put between + * @cand and @target. The "stacking" nodes' IDs are taken from + * @cand's parents. + * + * After this it is likely to have redundant nodes in the tree + * which are addressed by means of toptree_unify. + */ +void toptree_move(struct toptree *cand, struct toptree *target) +{ + struct toptree *stack_target, *real_insert_point, *ptr, *tmp; + + if (cand->level + 1 == target->level) { + toptree_remove(cand); + toptree_insert(cand, target); + return; + } + + real_insert_point = NULL; + ptr = cand; + stack_target = NULL; + + do { + tmp = stack_target; + stack_target = toptree_alloc(ptr->level + 1, + ptr->parent->id); + toptree_insert(tmp, stack_target); + if (!real_insert_point) + real_insert_point = stack_target; + ptr = ptr->parent; + } while (stack_target->level < (target->level - 1)); + + toptree_remove(cand); + toptree_insert(cand, real_insert_point); + toptree_insert(stack_target, target); +} + +/** + * toptree_get_child - Access a tree node's child by its ID + * @cand: Pointer to tree node whose child is to access + * @id: The desired child's ID + * + * @cand's children are searched for a child with matching ID. + * If no match can be found, a new child with the desired ID + * is created and returned. + */ +struct toptree *toptree_get_child(struct toptree *cand, int id) +{ + struct toptree *child; + + toptree_for_each_child(child, cand) + if (child->id == id) + return child; + child = toptree_alloc(cand->level-1, id); + toptree_insert(child, cand); + return child; +} + +/** + * toptree_first - Find the first descendant on specified level + * @context: Pointer to tree node whose descendants are to be used + * @level: The level of interest + * + * RETURNS: + * @context's first descendant on the specified level, or NULL + * if there is no matching descendant + */ +struct toptree *toptree_first(struct toptree *context, int level) +{ + struct toptree *child, *tmp; + + if (context->level == level) + return context; + + if (!list_empty(&context->children)) { + list_for_each_entry(child, &context->children, sibling) { + tmp = toptree_first(child, level); + if (tmp) + return tmp; + } + } + return NULL; +} + +/** + * toptree_next_sibling - Return next sibling + * @cur: Pointer to a tree node + * + * RETURNS: + * If @cur has a parent and is not the last in the parent's children list, + * the next sibling is returned. Or NULL when there are no siblings left. + */ +static struct toptree *toptree_next_sibling(struct toptree *cur) +{ + if (cur->parent == NULL) + return NULL; + + if (cur == list_last_entry(&cur->parent->children, + struct toptree, sibling)) + return NULL; + return (struct toptree *) list_next_entry(cur, sibling); +} + +/** + * toptree_next - Tree traversal function + * @cur: Pointer to current element + * @context: Pointer to the root node of the tree or subtree to + * be traversed. + * @level: The level of interest. + * + * RETURNS: + * Pointer to the next node on level @level + * or NULL when there is no next node. + */ +struct toptree *toptree_next(struct toptree *cur, struct toptree *context, + int level) +{ + struct toptree *cur_context, *tmp; + + if (!cur) + return NULL; + + if (context->level == level) + return NULL; + + tmp = toptree_next_sibling(cur); + if (tmp != NULL) + return tmp; + + cur_context = cur; + while (cur_context->level < context->level - 1) { + /* Step up */ + cur_context = cur_context->parent; + /* Step aside */ + tmp = toptree_next_sibling(cur_context); + if (tmp != NULL) { + /* Step down */ + tmp = toptree_first(tmp, level); + if (tmp != NULL) + return tmp; + } + } + return NULL; +} + +/** + * toptree_count - Count descendants on specified level + * @context: Pointer to node whose descendants are to be considered + * @level: Only descendants on the specified level will be counted + * + * RETURNS: + * Number of descendants on the specified level + */ +int toptree_count(struct toptree *context, int level) +{ + struct toptree *cur; + int cnt = 0; + + toptree_for_each(cur, context, level) + cnt++; + return cnt; +} diff --git a/arch/s390/numa/toptree.h b/arch/s390/numa/toptree.h new file mode 100644 index 000000000000..bdf502027af4 --- /dev/null +++ b/arch/s390/numa/toptree.h @@ -0,0 +1,60 @@ +/* + * NUMA support for s390 + * + * A tree structure used for machine topology mangling + * + * Copyright IBM Corp. 2015 + */ +#ifndef S390_TOPTREE_H +#define S390_TOPTREE_H + +#include <linux/cpumask.h> +#include <linux/list.h> + +struct toptree { + int level; + int id; + cpumask_t mask; + struct toptree *parent; + struct list_head sibling; + struct list_head children; +}; + +struct toptree *toptree_alloc(int level, int id); +void toptree_free(struct toptree *cand); +void toptree_update_mask(struct toptree *cand); +void toptree_unify(struct toptree *cand); +struct toptree *toptree_get_child(struct toptree *cand, int id); +void toptree_move(struct toptree *cand, struct toptree *target); +int toptree_count(struct toptree *context, int level); + +struct toptree *toptree_first(struct toptree *context, int level); +struct toptree *toptree_next(struct toptree *cur, struct toptree *context, + int level); + +#define toptree_for_each_child(child, ptree) \ + list_for_each_entry(child, &ptree->children, sibling) + +#define toptree_for_each_child_safe(child, ptmp, ptree) \ + list_for_each_entry_safe(child, ptmp, &ptree->children, sibling) + +#define toptree_is_last(ptree) \ + ((ptree->parent == NULL) || \ + (ptree->parent->children.prev == &ptree->sibling)) + +#define toptree_for_each(ptree, cont, ttype) \ + for (ptree = toptree_first(cont, ttype); \ + ptree != NULL; \ + ptree = toptree_next(ptree, cont, ttype)) + +#define toptree_for_each_safe(ptree, tmp, cont, ttype) \ + for (ptree = toptree_first(cont, ttype), \ + tmp = toptree_next(ptree, cont, ttype); \ + ptree != NULL; \ + ptree = tmp, \ + tmp = toptree_next(ptree, cont, ttype)) + +#define toptree_for_each_sibling(ptree, start) \ + toptree_for_each(ptree, start->parent, start->level) + +#endif /* S390_TOPTREE_H */ diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 598f023cf8a6..7ef12a3ace3a 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -76,11 +76,6 @@ EXPORT_SYMBOL_GPL(zpci_iomap_start); static struct kmem_cache *zdev_fmb_cache; -struct zpci_dev *get_zdev(struct pci_dev *pdev) -{ - return (struct zpci_dev *) pdev->sysdata; -} - struct zpci_dev *get_zdev_by_fid(u32 fid) { struct zpci_dev *tmp, *zdev = NULL; @@ -269,7 +264,7 @@ void __iomem *pci_iomap_range(struct pci_dev *pdev, unsigned long offset, unsigned long max) { - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); u64 addr; int idx; @@ -385,7 +380,7 @@ static void zpci_irq_handler(struct airq_struct *airq) int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) { - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); unsigned int hwirq, msi_vecs; unsigned long aisb; struct msi_desc *msi; @@ -414,7 +409,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) /* Request MSI interrupts */ hwirq = 0; - list_for_each_entry(msi, &pdev->msi_list, list) { + for_each_pci_msi_entry(msi, pdev) { rc = -EIO; irq = irq_alloc_desc(0); /* Alloc irq on node 0 */ if (irq < 0) @@ -440,7 +435,7 @@ int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) return (msi_vecs == nvec) ? 0 : msi_vecs; out_msi: - list_for_each_entry(msi, &pdev->msi_list, list) { + for_each_pci_msi_entry(msi, pdev) { if (hwirq-- == 0) break; irq_set_msi_desc(msi->irq, NULL); @@ -460,7 +455,7 @@ out: void arch_teardown_msi_irqs(struct pci_dev *pdev) { - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); struct msi_desc *msi; int rc; @@ -470,7 +465,7 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) return; /* Release MSI interrupts */ - list_for_each_entry(msi, &pdev->msi_list, list) { + for_each_pci_msi_entry(msi, pdev) { if (msi->msi_attrib.is_msix) __pci_msix_desc_mask_irq(msi, 1); else @@ -637,7 +632,7 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) int i; for (i = 0; i < PCI_BAR_COUNT; i++) { - if (!zdev->bars[i].size) + if (!zdev->bars[i].size || !zdev->bars[i].res) continue; zpci_free_iomap(zdev, zdev->bars[i].map_idx); @@ -648,7 +643,7 @@ static void zpci_cleanup_bus_resources(struct zpci_dev *zdev) int pcibios_add_device(struct pci_dev *pdev) { - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); struct resource *res; int i; @@ -673,7 +668,7 @@ void pcibios_release_device(struct pci_dev *pdev) int pcibios_enable_device(struct pci_dev *pdev, int mask) { - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); zdev->pdev = pdev; zpci_debug_init_device(zdev); @@ -684,7 +679,7 @@ int pcibios_enable_device(struct pci_dev *pdev, int mask) void pcibios_disable_device(struct pci_dev *pdev) { - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); zpci_fmb_disable_device(zdev); zpci_debug_exit_device(zdev); @@ -695,7 +690,7 @@ void pcibios_disable_device(struct pci_dev *pdev) static int zpci_restore(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); int ret = 0; if (zdev->state != ZPCI_FN_STATE_ONLINE) @@ -717,7 +712,7 @@ out: static int zpci_freeze(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); if (zdev->state != ZPCI_FN_STATE_ONLINE) return 0; @@ -777,17 +772,22 @@ static int zpci_scan_bus(struct zpci_dev *zdev) ret = zpci_setup_bus_resources(zdev, &resources); if (ret) - return ret; + goto error; zdev->bus = pci_scan_root_bus(NULL, ZPCI_BUS_NR, &pci_root_ops, zdev, &resources); if (!zdev->bus) { - zpci_cleanup_bus_resources(zdev); - return -EIO; + ret = -EIO; + goto error; } zdev->bus->max_bus_speed = zdev->max_bus_speed; pci_bus_add_devices(zdev->bus); return 0; + +error: + zpci_cleanup_bus_resources(zdev); + pci_free_resource_list(&resources); + return ret; } int zpci_enable_device(struct zpci_dev *zdev) diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 6fd8d5836138..42b76580c8b8 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -277,7 +277,7 @@ static dma_addr_t s390_dma_map_pages(struct device *dev, struct page *page, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct zpci_dev *zdev = get_zdev(to_pci_dev(dev)); + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); unsigned long nr_pages, iommu_page_index; unsigned long pa = page_to_phys(page) + offset; int flags = ZPCI_PTE_VALID; @@ -316,7 +316,7 @@ static void s390_dma_unmap_pages(struct device *dev, dma_addr_t dma_addr, size_t size, enum dma_data_direction direction, struct dma_attrs *attrs) { - struct zpci_dev *zdev = get_zdev(to_pci_dev(dev)); + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); unsigned long iommu_page_index; int npages; @@ -337,7 +337,7 @@ static void *s390_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag, struct dma_attrs *attrs) { - struct zpci_dev *zdev = get_zdev(to_pci_dev(dev)); + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); struct page *page; unsigned long pa; dma_addr_t map; @@ -367,7 +367,7 @@ static void s390_dma_free(struct device *dev, size_t size, void *pa, dma_addr_t dma_handle, struct dma_attrs *attrs) { - struct zpci_dev *zdev = get_zdev(to_pci_dev(dev)); + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); size = PAGE_ALIGN(size); atomic64_sub(size / PAGE_SIZE, &zdev->allocated_pages); diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index ed2394dd14e9..369a3e05d468 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -46,15 +46,13 @@ struct zpci_ccdf_avail { static void __zpci_event_error(struct zpci_ccdf_err *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); + struct pci_dev *pdev = zdev ? zdev->pdev : NULL; zpci_err("error CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); - if (!zdev) - return; - pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n", - pci_name(zdev->pdev), ccdf->pec, ccdf->fid); + pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); } void zpci_event_error(void *data) @@ -89,7 +87,9 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) ret = zpci_enable_device(zdev); if (ret) break; + pci_lock_rescan_remove(); pci_rescan_bus(zdev->bus); + pci_unlock_rescan_remove(); break; case 0x0302: /* Reserved -> Standby */ if (!zdev) @@ -97,7 +97,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) break; case 0x0303: /* Deconfiguration requested */ if (pdev) - pci_stop_and_remove_bus_device(pdev); + pci_stop_and_remove_bus_device_locked(pdev); ret = zpci_disable_device(zdev); if (ret) @@ -114,7 +114,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) /* Give the driver a hint that the function is * already unusable. */ pdev->error_state = pci_channel_io_perm_failure; - pci_stop_and_remove_bus_device(pdev); + pci_stop_and_remove_bus_device_locked(pdev); } zdev->fh = ccdf->fh; diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 85267c058af8..dcc2634ccbe2 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -8,10 +8,23 @@ #include <linux/errno.h> #include <linux/delay.h> #include <asm/pci_insn.h> +#include <asm/pci_debug.h> #include <asm/processor.h> #define ZPCI_INSN_BUSY_DELAY 1 /* 1 microsecond */ +static inline void zpci_err_insn(u8 cc, u8 status, u64 req, u64 offset) +{ + struct { + u8 cc; + u8 status; + u64 req; + u64 offset; + } data = {cc, status, req, offset}; + + zpci_err_hex(&data, sizeof(data)); +} + /* Modify PCI Function Controls */ static inline u8 __mpcifc(u64 req, struct zpci_fib *fib, u8 *status) { @@ -38,8 +51,8 @@ int zpci_mod_fc(u64 req, struct zpci_fib *fib) } while (cc == 2); if (cc) - printk_once(KERN_ERR "%s: error cc: %d status: %d\n", - __func__, cc, status); + zpci_err_insn(cc, status, req, 0); + return (cc) ? -EIO : 0; } @@ -72,8 +85,8 @@ int zpci_refresh_trans(u64 fn, u64 addr, u64 range) } while (cc == 2); if (cc) - printk_once(KERN_ERR "%s: error cc: %d status: %d dma_addr: %Lx size: %Lx\n", - __func__, cc, status, addr, range); + zpci_err_insn(cc, status, addr, range); + return (cc) ? -EIO : 0; } @@ -121,8 +134,8 @@ int zpci_load(u64 *data, u64 req, u64 offset) } while (cc == 2); if (cc) - printk_once(KERN_ERR "%s: error cc: %d status: %d req: %Lx offset: %Lx\n", - __func__, cc, status, req, offset); + zpci_err_insn(cc, status, req, offset); + return (cc > 0) ? -EIO : cc; } EXPORT_SYMBOL_GPL(zpci_load); @@ -159,8 +172,8 @@ int zpci_store(u64 data, u64 req, u64 offset) } while (cc == 2); if (cc) - printk_once(KERN_ERR "%s: error cc: %d status: %d req: %Lx offset: %Lx\n", - __func__, cc, status, req, offset); + zpci_err_insn(cc, status, req, offset); + return (cc > 0) ? -EIO : cc; } EXPORT_SYMBOL_GPL(zpci_store); @@ -195,8 +208,8 @@ int zpci_store_block(const u64 *data, u64 req, u64 offset) } while (cc == 2); if (cc) - printk_once(KERN_ERR "%s: error cc: %d status: %d req: %Lx offset: %Lx\n", - __func__, cc, status, req, offset); + zpci_err_insn(cc, status, req, offset); + return (cc > 0) ? -EIO : cc; } EXPORT_SYMBOL_GPL(zpci_store_block); diff --git a/arch/s390/pci/pci_sysfs.c b/arch/s390/pci/pci_sysfs.c index fa3ce891e597..f37a5808883d 100644 --- a/arch/s390/pci/pci_sysfs.c +++ b/arch/s390/pci/pci_sysfs.c @@ -16,7 +16,7 @@ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ - struct zpci_dev *zdev = get_zdev(to_pci_dev(dev)); \ + struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); \ \ return sprintf(buf, fmt, zdev->member); \ } \ @@ -38,23 +38,30 @@ static ssize_t recover_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); int ret; if (!device_remove_file_self(dev, attr)) return count; + pci_lock_rescan_remove(); pci_stop_and_remove_bus_device(pdev); ret = zpci_disable_device(zdev); if (ret) - return ret; + goto error; ret = zpci_enable_device(zdev); if (ret) - return ret; + goto error; pci_rescan_bus(zdev->bus); + pci_unlock_rescan_remove(); + return count; + +error: + pci_unlock_rescan_remove(); + return ret; } static DEVICE_ATTR_WO(recover); @@ -64,7 +71,7 @@ static ssize_t util_string_read(struct file *filp, struct kobject *kobj, { struct device *dev = kobj_to_dev(kobj); struct pci_dev *pdev = to_pci_dev(dev); - struct zpci_dev *zdev = get_zdev(pdev); + struct zpci_dev *zdev = to_zpci(pdev); return memory_read_from_buffer(buf, count, &off, zdev->util_str, sizeof(zdev->util_str)); |