diff options
author | Ingo Molnar <mingo@kernel.org> | 2018-02-06 21:12:31 +0100 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2018-02-06 21:12:31 +0100 |
commit | 82845079160817cc6ac64e5321bbd935e0a47b3a (patch) | |
tree | 0886d1d52428e9db14536cae4b37db896e7c360a /arch/arm64/kernel | |
parent | 32e839dda3ba576943365f0f5817ce5c843137dc (diff) | |
parent | 68c5735eaa5e680e701c9a2d1e3c7880bdf5ab66 (diff) | |
download | linux-82845079160817cc6ac64e5321bbd935e0a47b3a.tar.bz2 |
Merge branch 'linus' into sched/urgent, to resolve conflicts
Conflicts:
arch/arm64/kernel/entry.S
arch/x86/Kconfig
include/linux/sched/mm.h
kernel/fork.c
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'arch/arm64/kernel')
26 files changed, 1345 insertions, 302 deletions
diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 067baace74a0..b87541360f43 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -52,6 +52,11 @@ arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ arm64-obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +arm64-obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o + +ifeq ($(CONFIG_KVM),y) +arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o +endif obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c index b3162715ed78..252396a96c78 100644 --- a/arch/arm64/kernel/acpi.c +++ b/arch/arm64/kernel/acpi.c @@ -117,7 +117,7 @@ bool __init acpi_psci_present(void) } /* Whether HVC must be used instead of SMC as the PSCI conduit */ -bool __init acpi_psci_use_hvc(void) +bool acpi_psci_use_hvc(void) { return acpi_gbl_FADT.arm_boot_flags & ACPI_FADT_PSCI_USE_HVC; } diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 6dd0a3a3e5c9..414288a558c8 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -32,6 +32,8 @@ #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) +int alternatives_applied; + struct alt_region { struct alt_instr *begin; struct alt_instr *end; @@ -143,7 +145,6 @@ static void __apply_alternatives(void *alt_region, bool use_linear_alias) */ static int __apply_alternatives_multi_stop(void *unused) { - static int patched = 0; struct alt_region region = { .begin = (struct alt_instr *)__alt_instructions, .end = (struct alt_instr *)__alt_instructions_end, @@ -151,14 +152,14 @@ static int __apply_alternatives_multi_stop(void *unused) /* We always have a CPU 0 at this point (__init) */ if (smp_processor_id()) { - while (!READ_ONCE(patched)) + while (!READ_ONCE(alternatives_applied)) cpu_relax(); isb(); } else { - BUG_ON(patched); + BUG_ON(alternatives_applied); __apply_alternatives(®ion, true); /* Barriers provided by the cache flushing */ - WRITE_ONCE(patched, 1); + WRITE_ONCE(alternatives_applied, 1); } return 0; diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 71bf088f1e4b..1303e04110cd 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -18,12 +18,14 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ +#include <linux/arm_sdei.h> #include <linux/sched.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/kvm_host.h> #include <linux/suspend.h> #include <asm/cpufeature.h> +#include <asm/fixmap.h> #include <asm/thread_info.h> #include <asm/memory.h> #include <asm/smp_plat.h> @@ -130,6 +132,7 @@ int main(void) BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); + DEFINE(VCPU_FAULT_DISR, offsetof(struct kvm_vcpu, arch.fault.disr_el1)); DEFINE(CPU_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs)); DEFINE(CPU_USER_PT_REGS, offsetof(struct kvm_regs, regs)); DEFINE(CPU_FP_REGS, offsetof(struct kvm_regs, fp_regs)); @@ -148,11 +151,18 @@ int main(void) DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); - BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); DEFINE(ARM64_FTR_SYSVAL, offsetof(struct arm64_ftr_reg, sys_val)); + BLANK(); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + DEFINE(TRAMP_VALIAS, TRAMP_VALIAS); +#endif +#ifdef CONFIG_ARM_SDE_INTERFACE + DEFINE(SDEI_EVENT_INTREGS, offsetof(struct sdei_registered_event, interrupted_regs)); + DEFINE(SDEI_EVENT_PRIORITY, offsetof(struct sdei_registered_event, priority)); +#endif return 0; } diff --git a/arch/arm64/kernel/bpi.S b/arch/arm64/kernel/bpi.S new file mode 100644 index 000000000000..76225c2611ea --- /dev/null +++ b/arch/arm64/kernel/bpi.S @@ -0,0 +1,87 @@ +/* + * Contains CPU specific branch predictor invalidation sequences + * + * Copyright (C) 2018 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/linkage.h> + +.macro ventry target + .rept 31 + nop + .endr + b \target +.endm + +.macro vectors target + ventry \target + 0x000 + ventry \target + 0x080 + ventry \target + 0x100 + ventry \target + 0x180 + + ventry \target + 0x200 + ventry \target + 0x280 + ventry \target + 0x300 + ventry \target + 0x380 + + ventry \target + 0x400 + ventry \target + 0x480 + ventry \target + 0x500 + ventry \target + 0x580 + + ventry \target + 0x600 + ventry \target + 0x680 + ventry \target + 0x700 + ventry \target + 0x780 +.endm + + .align 11 +ENTRY(__bp_harden_hyp_vecs_start) + .rept 4 + vectors __kvm_hyp_vector + .endr +ENTRY(__bp_harden_hyp_vecs_end) +ENTRY(__psci_hyp_bp_inval_start) + sub sp, sp, #(8 * 18) + stp x16, x17, [sp, #(16 * 0)] + stp x14, x15, [sp, #(16 * 1)] + stp x12, x13, [sp, #(16 * 2)] + stp x10, x11, [sp, #(16 * 3)] + stp x8, x9, [sp, #(16 * 4)] + stp x6, x7, [sp, #(16 * 5)] + stp x4, x5, [sp, #(16 * 6)] + stp x2, x3, [sp, #(16 * 7)] + stp x0, x1, [sp, #(16 * 8)] + mov x0, #0x84000000 + smc #0 + ldp x16, x17, [sp, #(16 * 0)] + ldp x14, x15, [sp, #(16 * 1)] + ldp x12, x13, [sp, #(16 * 2)] + ldp x10, x11, [sp, #(16 * 3)] + ldp x8, x9, [sp, #(16 * 4)] + ldp x6, x7, [sp, #(16 * 5)] + ldp x4, x5, [sp, #(16 * 6)] + ldp x2, x3, [sp, #(16 * 7)] + ldp x0, x1, [sp, #(16 * 8)] + add sp, sp, #(8 * 18) +ENTRY(__psci_hyp_bp_inval_end) + +ENTRY(__qcom_hyp_sanitize_link_stack_start) + stp x29, x30, [sp, #-16]! + .rept 16 + bl . + 4 + .endr + ldp x29, x30, [sp], #16 +ENTRY(__qcom_hyp_sanitize_link_stack_end) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 0e27f86ee709..ed6881882231 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -30,6 +30,20 @@ is_affected_midr_range(const struct arm64_cpu_capabilities *entry, int scope) entry->midr_range_max); } +static bool __maybe_unused +is_kryo_midr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u32 model; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + model = read_cpuid_id(); + model &= MIDR_IMPLEMENTOR_MASK | (0xf00 << MIDR_PARTNUM_SHIFT) | + MIDR_ARCHITECTURE_MASK; + + return model == entry->midr_model; +} + static bool has_mismatched_cache_line_size(const struct arm64_cpu_capabilities *entry, int scope) @@ -46,6 +60,127 @@ static int cpu_enable_trap_ctr_access(void *__unused) return 0; } +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +#include <asm/mmu_context.h> +#include <asm/cacheflush.h> + +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +#ifdef CONFIG_KVM +extern char __psci_hyp_bp_inval_start[], __psci_hyp_bp_inval_end[]; +extern char __qcom_hyp_sanitize_link_stack_start[]; +extern char __qcom_hyp_sanitize_link_stack_end[]; + +static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + void *dst = lm_alias(__bp_harden_hyp_vecs_start + slot * SZ_2K); + int i; + + for (i = 0; i < SZ_2K; i += 0x80) + memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); + + flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); +} + +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + static int last_slot = -1; + static DEFINE_SPINLOCK(bp_lock); + int cpu, slot = -1; + + spin_lock(&bp_lock); + for_each_possible_cpu(cpu) { + if (per_cpu(bp_hardening_data.fn, cpu) == fn) { + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); + break; + } + } + + if (slot == -1) { + last_slot++; + BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start) + / SZ_2K) <= last_slot); + slot = last_slot; + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); + } + + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); + __this_cpu_write(bp_hardening_data.fn, fn); + spin_unlock(&bp_lock); +} +#else +#define __psci_hyp_bp_inval_start NULL +#define __psci_hyp_bp_inval_end NULL +#define __qcom_hyp_sanitize_link_stack_start NULL +#define __qcom_hyp_sanitize_link_stack_end NULL + +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + __this_cpu_write(bp_hardening_data.fn, fn); +} +#endif /* CONFIG_KVM */ + +static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, + bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + u64 pfr0; + + if (!entry->matches(entry, SCOPE_LOCAL_CPU)) + return; + + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) + return; + + __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); +} + +#include <linux/psci.h> + +static int enable_psci_bp_hardening(void *data) +{ + const struct arm64_cpu_capabilities *entry = data; + + if (psci_ops.get_version) + install_bp_hardening_cb(entry, + (bp_hardening_cb_t)psci_ops.get_version, + __psci_hyp_bp_inval_start, + __psci_hyp_bp_inval_end); + + return 0; +} + +static void qcom_link_stack_sanitization(void) +{ + u64 tmp; + + asm volatile("mov %0, x30 \n" + ".rept 16 \n" + "bl . + 4 \n" + ".endr \n" + "mov x30, %0 \n" + : "=&r" (tmp)); +} + +static int qcom_enable_link_stack_sanitization(void *data) +{ + const struct arm64_cpu_capabilities *entry = data; + + install_bp_hardening_cb(entry, qcom_link_stack_sanitization, + __qcom_hyp_sanitize_link_stack_start, + __qcom_hyp_sanitize_link_stack_end); + + return 0; +} +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + #define MIDR_RANGE(model, min, max) \ .def_scope = SCOPE_LOCAL_CPU, \ .matches = is_affected_midr_range, \ @@ -169,6 +304,13 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_CPU_VAR_REV(0, 0), MIDR_CPU_VAR_REV(0, 0)), }, + { + .desc = "Qualcomm Technologies Kryo erratum 1003", + .capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003, + .def_scope = SCOPE_LOCAL_CPU, + .midr_model = MIDR_QCOM_KRYO, + .matches = is_kryo_midr, + }, #endif #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 { @@ -187,6 +329,47 @@ const struct arm64_cpu_capabilities arm64_errata[] = { MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), }, #endif +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), + .enable = qcom_enable_link_stack_sanitization, + }, + { + .capability = ARM64_HARDEN_BP_POST_GUEST_EXIT, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + .enable = enable_psci_bp_hardening, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), + .enable = enable_psci_bp_hardening, + }, +#endif { } }; @@ -200,15 +383,18 @@ void verify_local_cpu_errata_workarounds(void) { const struct arm64_cpu_capabilities *caps = arm64_errata; - for (; caps->matches; caps++) - if (!cpus_have_cap(caps->capability) && - caps->matches(caps, SCOPE_LOCAL_CPU)) { + for (; caps->matches; caps++) { + if (cpus_have_cap(caps->capability)) { + if (caps->enable) + caps->enable((void *)caps); + } else if (caps->matches(caps, SCOPE_LOCAL_CPU)) { pr_crit("CPU%d: Requires work around for %s, not detected" " at boot time\n", smp_processor_id(), caps->desc ? : "an erratum"); cpu_die_early(); } + } } void update_cpu_errata_workarounds(void) diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index a73a5928f09b..0fb6a3151443 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -123,6 +123,7 @@ cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) * sync with the documentation of the CPU feature register ABI. */ static const struct arm64_ftr_bits ftr_id_aa64isar0[] = { + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_FHM_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_DP_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM4_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64ISAR0_SM3_SHIFT, 4, 0), @@ -145,8 +146,11 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = { }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE), FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_RAS_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), @@ -846,6 +850,67 @@ static bool has_no_fpsimd(const struct arm64_cpu_capabilities *entry, int __unus ID_AA64PFR0_FP_SHIFT) < 0; } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ + +static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + /* Forced on command line? */ + if (__kpti_forced) { + pr_info_once("kernel page table isolation forced %s by command line option\n", + __kpti_forced > 0 ? "ON" : "OFF"); + return __kpti_forced > 0; + } + + /* Useful for KASLR robustness */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return true; + + /* Don't force KPTI for CPUs that are not vulnerable */ + switch (read_cpuid_id() & MIDR_CPU_MODEL_MASK) { + case MIDR_CAVIUM_THUNDERX2: + case MIDR_BRCM_VULCAN: + return false; + } + + /* Defer to CPU feature registers */ + return !cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV3_SHIFT); +} + +static int __init parse_kpti(char *str) +{ + bool enabled; + int ret = strtobool(str, &enabled); + + if (ret) + return ret; + + __kpti_forced = enabled ? 1 : -1; + return 0; +} +__setup("kpti=", parse_kpti); +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + +static int cpu_copy_el2regs(void *__unused) +{ + /* + * Copy register values that aren't redirected by hardware. + * + * Before code patching, we only set tpidr_el1, all CPUs need to copy + * this value to tpidr_el2 before we patch the code. Once we've done + * that, freshly-onlined CPUs will set tpidr_el2, so we don't need to + * do anything here. + */ + if (!alternatives_applied) + write_sysreg(read_sysreg(tpidr_el1), tpidr_el2); + + return 0; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -915,6 +980,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .capability = ARM64_HAS_VIRT_HOST_EXTN, .def_scope = SCOPE_SYSTEM, .matches = runs_at_el2, + .enable = cpu_copy_el2regs, }, { .desc = "32-bit EL0 Support", @@ -932,6 +998,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + { + .desc = "Kernel page table isolation (KPTI)", + .capability = ARM64_UNMAP_KERNEL_AT_EL0, + .def_scope = SCOPE_SYSTEM, + .matches = unmap_kernel_at_el0, + }, +#endif { /* FP/SIMD is not implemented */ .capability = ARM64_HAS_NO_FPSIMD, @@ -963,6 +1037,19 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .enable = sve_kernel_enable, }, #endif /* CONFIG_ARM64_SVE */ +#ifdef CONFIG_ARM64_RAS_EXTN + { + .desc = "RAS Extension Support", + .capability = ARM64_HAS_RAS_EXTN, + .def_scope = SCOPE_SYSTEM, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64PFR0_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64PFR0_RAS_SHIFT, + .min_field_value = ID_AA64PFR0_RAS_V1, + .enable = cpu_clear_disr, + }, +#endif /* CONFIG_ARM64_RAS_EXTN */ {}, }; @@ -992,6 +1079,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = { HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM3_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM3), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_SM4_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SM4), HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_DP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDDP), + HWCAP_CAP(SYS_ID_AA64ISAR0_EL1, ID_AA64ISAR0_FHM_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_ASIMDFHM), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_FP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), @@ -1071,6 +1159,25 @@ static void __init setup_elf_hwcaps(const struct arm64_cpu_capabilities *hwcaps) cap_set_elf_hwcap(hwcaps); } +/* + * Check if the current CPU has a given feature capability. + * Should be called from non-preemptible context. + */ +static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, + unsigned int cap) +{ + const struct arm64_cpu_capabilities *caps; + + if (WARN_ON(preemptible())) + return false; + + for (caps = cap_array; caps->matches; caps++) + if (caps->capability == cap && + caps->matches(caps, SCOPE_LOCAL_CPU)) + return true; + return false; +} + void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info) { @@ -1106,7 +1213,7 @@ void __init enable_cpu_capabilities(const struct arm64_cpu_capabilities *caps) * uses an IPI, giving us a PSTATE that disappears when * we return. */ - stop_machine(caps->enable, NULL, cpu_online_mask); + stop_machine(caps->enable, (void *)caps, cpu_online_mask); } } } @@ -1134,8 +1241,9 @@ verify_local_elf_hwcaps(const struct arm64_cpu_capabilities *caps) } static void -verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) +verify_local_cpu_features(const struct arm64_cpu_capabilities *caps_list) { + const struct arm64_cpu_capabilities *caps = caps_list; for (; caps->matches; caps++) { if (!cpus_have_cap(caps->capability)) continue; @@ -1143,13 +1251,13 @@ verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) * If the new CPU misses an advertised feature, we cannot proceed * further, park the cpu. */ - if (!caps->matches(caps, SCOPE_LOCAL_CPU)) { + if (!__this_cpu_has_cap(caps_list, caps->capability)) { pr_crit("CPU%d: missing feature: %s\n", smp_processor_id(), caps->desc); cpu_die_early(); } if (caps->enable) - caps->enable(NULL); + caps->enable((void *)caps); } } @@ -1189,6 +1297,9 @@ static void verify_local_cpu_capabilities(void) if (system_supports_sve()) verify_sve_features(); + + if (system_uses_ttbr0_pan()) + pr_info("Emulating Privileged Access Never (PAN) using TTBR0_EL1 switching\n"); } void check_local_cpu_capabilities(void) @@ -1225,25 +1336,6 @@ static void __init mark_const_caps_ready(void) static_branch_enable(&arm64_const_caps_ready); } -/* - * Check if the current CPU has a given feature capability. - * Should be called from non-preemptible context. - */ -static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, - unsigned int cap) -{ - const struct arm64_cpu_capabilities *caps; - - if (WARN_ON(preemptible())) - return false; - - for (caps = cap_array; caps->desc; caps++) - if (caps->capability == cap && caps->matches) - return caps->matches(caps, SCOPE_LOCAL_CPU); - - return false; -} - extern const struct arm64_cpu_capabilities arm64_errata[]; bool this_cpu_has_cap(unsigned int cap) @@ -1387,3 +1479,11 @@ static int __init enable_mrs_emulation(void) } core_initcall(enable_mrs_emulation); + +int cpu_clear_disr(void *__unused) +{ + /* Firmware may have left a deferred SError in this register. */ + write_sysreg_s(0, SYS_DISR_EL1); + + return 0; +} diff --git a/arch/arm64/kernel/cpuidle.c b/arch/arm64/kernel/cpuidle.c index fd691087dc9a..f2d13810daa8 100644 --- a/arch/arm64/kernel/cpuidle.c +++ b/arch/arm64/kernel/cpuidle.c @@ -47,6 +47,8 @@ int arm_cpuidle_suspend(int index) #include <acpi/processor.h> +#define ARM64_LPI_IS_RETENTION_STATE(arch_flags) (!(arch_flags)) + int acpi_processor_ffh_lpi_probe(unsigned int cpu) { return arm_cpuidle_init(cpu); @@ -54,6 +56,10 @@ int acpi_processor_ffh_lpi_probe(unsigned int cpu) int acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi) { - return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index); + if (ARM64_LPI_IS_RETENTION_STATE(lpi->arch_flags)) + return CPU_PM_CPU_IDLE_ENTER_RETENTION(arm_cpuidle_suspend, + lpi->index); + else + return CPU_PM_CPU_IDLE_ENTER(arm_cpuidle_suspend, lpi->index); } #endif diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 1e2554543506..7f94623df8a5 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -76,6 +76,7 @@ static const char *const hwcap_str[] = { "asimddp", "sha512", "sve", + "asimdfhm", NULL }; diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index a88b6ccebbb4..53781f5687c5 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -209,12 +209,13 @@ NOKPROBE_SYMBOL(call_step_hook); static void send_user_sigtrap(int si_code) { struct pt_regs *regs = current_pt_regs(); - siginfo_t info = { - .si_signo = SIGTRAP, - .si_errno = 0, - .si_code = si_code, - .si_addr = (void __user *)instruction_pointer(regs), - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *)instruction_pointer(regs); if (WARN_ON(!user_mode(regs))) return; diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 5edde1c2e93e..cccd2788e631 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -28,6 +28,8 @@ #include <asm/errno.h> #include <asm/esr.h> #include <asm/irq.h> +#include <asm/memory.h> +#include <asm/mmu.h> #include <asm/processor.h> #include <asm/ptrace.h> #include <asm/thread_info.h> @@ -69,8 +71,21 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_ventry label + .macro kernel_ventry, el, label, regsize = 64 .align 7 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +alternative_if ARM64_UNMAP_KERNEL_AT_EL0 + .if \el == 0 + .if \regsize == 64 + mrs x30, tpidrro_el0 + msr tpidrro_el0, xzr + .else + mov x30, xzr + .endif + .endif +alternative_else_nop_endif +#endif + sub sp, sp, #S_FRAME_SIZE #ifdef CONFIG_VMAP_STACK /* @@ -82,7 +97,7 @@ tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp - b \label + b el\()\el\()_\label 0: /* @@ -114,7 +129,12 @@ sub sp, sp, x0 mrs x0, tpidrro_el0 #endif - b \label + b el\()\el\()_\label + .endm + + .macro tramp_alias, dst, sym + mov_q \dst, TRAMP_VALIAS + add \dst, \dst, #(\sym - .entry.tramp.text) .endm .macro kernel_entry, el, regsize = 64 @@ -185,7 +205,7 @@ alternative_else_nop_endif .if \el != 0 mrs x21, ttbr0_el1 - tst x21, #0xffff << 48 // Check for the reserved ASID + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR @@ -248,7 +268,7 @@ alternative_else_nop_endif tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set .endif - __uaccess_ttbr0_enable x0 + __uaccess_ttbr0_enable x0, x1 .if \el == 0 /* @@ -257,7 +277,7 @@ alternative_else_nop_endif * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache * corruption). */ - post_ttbr0_update_workaround + bl post_ttbr_update_workaround .endif 1: .if \el != 0 @@ -269,18 +289,20 @@ alternative_else_nop_endif .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 + tst x22, #PSR_MODE32_BIT // native task? + b.eq 3f + #ifdef CONFIG_ARM64_ERRATUM_845719 alternative_if ARM64_WORKAROUND_845719 - tbz x22, #4, 1f #ifdef CONFIG_PID_IN_CONTEXTIDR mrs x29, contextidr_el1 msr contextidr_el1, x29 #else msr contextidr_el1, xzr #endif -1: alternative_else_nop_endif #endif +3: .endif msr elr_el1, x21 // set up the return data @@ -306,7 +328,21 @@ alternative_else_nop_endif * ARCH_HAS_MEMBARRIER_SYNC_CORE rely on eret context synchronization * when returning from IPI handler, and when returning to user-space. */ - eret // return to kernel + + .if \el == 0 +alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + bne 4f + msr far_el1, x30 + tramp_alias x30, tramp_exit_native + br x30 +4: + tramp_alias x30, tramp_exit_compat + br x30 +#endif + .else + eret + .endif .endm .macro irq_stack_entry @@ -371,31 +407,31 @@ tsk .req x28 // current thread_info .align 11 ENTRY(vectors) - kernel_ventry el1_sync_invalid // Synchronous EL1t - kernel_ventry el1_irq_invalid // IRQ EL1t - kernel_ventry el1_fiq_invalid // FIQ EL1t - kernel_ventry el1_error_invalid // Error EL1t + kernel_ventry 1, sync_invalid // Synchronous EL1t + kernel_ventry 1, irq_invalid // IRQ EL1t + kernel_ventry 1, fiq_invalid // FIQ EL1t + kernel_ventry 1, error_invalid // Error EL1t - kernel_ventry el1_sync // Synchronous EL1h - kernel_ventry el1_irq // IRQ EL1h - kernel_ventry el1_fiq_invalid // FIQ EL1h - kernel_ventry el1_error // Error EL1h + kernel_ventry 1, sync // Synchronous EL1h + kernel_ventry 1, irq // IRQ EL1h + kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, error // Error EL1h - kernel_ventry el0_sync // Synchronous 64-bit EL0 - kernel_ventry el0_irq // IRQ 64-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 64-bit EL0 - kernel_ventry el0_error // Error 64-bit EL0 + kernel_ventry 0, sync // Synchronous 64-bit EL0 + kernel_ventry 0, irq // IRQ 64-bit EL0 + kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 + kernel_ventry 0, error // Error 64-bit EL0 #ifdef CONFIG_COMPAT - kernel_ventry el0_sync_compat // Synchronous 32-bit EL0 - kernel_ventry el0_irq_compat // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - kernel_ventry el0_error_compat // Error 32-bit EL0 + kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_compat, 32 // Error 32-bit EL0 #else - kernel_ventry el0_sync_invalid // Synchronous 32-bit EL0 - kernel_ventry el0_irq_invalid // IRQ 32-bit EL0 - kernel_ventry el0_fiq_invalid // FIQ 32-bit EL0 - kernel_ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 #endif END(vectors) @@ -689,12 +725,15 @@ el0_ia: * Instruction abort handling */ mrs x26, far_el1 - enable_daif + enable_da_f +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif ct_user_exit mov x0, x26 mov x1, x25 mov x2, sp - bl do_mem_abort + bl do_el0_ia_bp_hardening b ret_to_user el0_fpsimd_acc: /* @@ -947,6 +986,124 @@ __ni_sys_trace: .popsection // .entry.text +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" + + .macro tramp_map_kernel, tmp + mrs \tmp, ttbr1_el1 + add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + bic \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 +alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003 + /* ASID already in \tmp[63:48] */ + movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12) + movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12) + /* 2MB boundary containing the vectors, so we nobble the walk cache */ + movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12) + isb + tlbi vae1, \tmp + dsb nsh +alternative_else_nop_endif +#endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */ + .endm + + .macro tramp_unmap_kernel, tmp + mrs \tmp, ttbr1_el1 + sub \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + orr \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + /* + * We avoid running the post_ttbr_update_workaround here because the + * user and kernel ASIDs don't have conflicting mappings, so any + * "blessing" as described in: + * + * http://lkml.kernel.org/r/56BB848A.6060603@caviumnetworks.com + * + * will not hurt correctness. Whilst this may partially defeat the + * point of using split ASIDs in the first place, it avoids + * the hit of invalidating the entire I-cache on every return to + * userspace. + */ + .endm + + .macro tramp_ventry, regsize = 64 + .align 7 +1: + .if \regsize == 64 + msr tpidrro_el0, x30 // Restored in kernel_ventry + .endif + /* + * Defend against branch aliasing attacks by pushing a dummy + * entry onto the return stack and using a RET instruction to + * enter the full-fat kernel vectors. + */ + bl 2f + b . +2: + tramp_map_kernel x30 +#ifdef CONFIG_RANDOMIZE_BASE + adr x30, tramp_vectors + PAGE_SIZE +alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 + ldr x30, [x30] +#else + ldr x30, =vectors +#endif + prfm plil1strm, [x30, #(1b - tramp_vectors)] + msr vbar_el1, x30 + add x30, x30, #(1b - tramp_vectors) + isb + ret + .endm + + .macro tramp_exit, regsize = 64 + adr x30, tramp_vectors + msr vbar_el1, x30 + tramp_unmap_kernel x30 + .if \regsize == 64 + mrs x30, far_el1 + .endif + eret + .endm + + .align 11 +ENTRY(tramp_vectors) + .space 0x400 + + tramp_ventry + tramp_ventry + tramp_ventry + tramp_ventry + + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 +END(tramp_vectors) + +ENTRY(tramp_exit_native) + tramp_exit +END(tramp_exit_native) + +ENTRY(tramp_exit_compat) + tramp_exit 32 +END(tramp_exit_compat) + + .ltorg + .popsection // .entry.tramp.text +#ifdef CONFIG_RANDOMIZE_BASE + .pushsection ".rodata", "a" + .align PAGE_SHIFT + .globl __entry_tramp_data_start +__entry_tramp_data_start: + .quad vectors + .popsection // .rodata +#endif /* CONFIG_RANDOMIZE_BASE */ +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + /* * Special system call wrappers. */ @@ -1000,3 +1157,180 @@ ENTRY(ret_from_fork) b ret_to_user ENDPROC(ret_from_fork) NOKPROBE(ret_from_fork) + +#ifdef CONFIG_ARM_SDE_INTERFACE + +#include <asm/sdei.h> +#include <uapi/linux/arm_sdei.h> + +.macro sdei_handler_exit exit_mode + /* On success, this call never returns... */ + cmp \exit_mode, #SDEI_EXIT_SMC + b.ne 99f + smc #0 + b . +99: hvc #0 + b . +.endm + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * The regular SDEI entry point may have been unmapped along with the rest of + * the kernel. This trampoline restores the kernel mapping to make the x1 memory + * argument accessible. + * + * This clobbers x4, __sdei_handler() will restore this from firmware's + * copy. + */ +.ltorg +.pushsection ".entry.tramp.text", "ax" +ENTRY(__sdei_asm_entry_trampoline) + mrs x4, ttbr1_el1 + tbz x4, #USER_ASID_BIT, 1f + + tramp_map_kernel tmp=x4 + isb + mov x4, xzr + + /* + * Use reg->interrupted_regs.addr_limit to remember whether to unmap + * the kernel on exit. + */ +1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] + +#ifdef CONFIG_RANDOMIZE_BASE + adr x4, tramp_vectors + PAGE_SIZE + add x4, x4, #:lo12:__sdei_asm_trampoline_next_handler + ldr x4, [x4] +#else + ldr x4, =__sdei_asm_handler +#endif + br x4 +ENDPROC(__sdei_asm_entry_trampoline) +NOKPROBE(__sdei_asm_entry_trampoline) + +/* + * Make the exit call and restore the original ttbr1_el1 + * + * x0 & x1: setup for the exit API call + * x2: exit_mode + * x4: struct sdei_registered_event argument from registration time. + */ +ENTRY(__sdei_asm_exit_trampoline) + ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] + cbnz x4, 1f + + tramp_unmap_kernel tmp=x4 + +1: sdei_handler_exit exit_mode=x2 +ENDPROC(__sdei_asm_exit_trampoline) +NOKPROBE(__sdei_asm_exit_trampoline) + .ltorg +.popsection // .entry.tramp.text +#ifdef CONFIG_RANDOMIZE_BASE +.pushsection ".rodata", "a" +__sdei_asm_trampoline_next_handler: + .quad __sdei_asm_handler +.popsection // .rodata +#endif /* CONFIG_RANDOMIZE_BASE */ +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + +/* + * Software Delegated Exception entry point. + * + * x0: Event number + * x1: struct sdei_registered_event argument from registration time. + * x2: interrupted PC + * x3: interrupted PSTATE + * x4: maybe clobbered by the trampoline + * + * Firmware has preserved x0->x17 for us, we must save/restore the rest to + * follow SMC-CC. We save (or retrieve) all the registers as the handler may + * want them. + */ +ENTRY(__sdei_asm_handler) + stp x2, x3, [x1, #SDEI_EVENT_INTREGS + S_PC] + stp x4, x5, [x1, #SDEI_EVENT_INTREGS + 16 * 2] + stp x6, x7, [x1, #SDEI_EVENT_INTREGS + 16 * 3] + stp x8, x9, [x1, #SDEI_EVENT_INTREGS + 16 * 4] + stp x10, x11, [x1, #SDEI_EVENT_INTREGS + 16 * 5] + stp x12, x13, [x1, #SDEI_EVENT_INTREGS + 16 * 6] + stp x14, x15, [x1, #SDEI_EVENT_INTREGS + 16 * 7] + stp x16, x17, [x1, #SDEI_EVENT_INTREGS + 16 * 8] + stp x18, x19, [x1, #SDEI_EVENT_INTREGS + 16 * 9] + stp x20, x21, [x1, #SDEI_EVENT_INTREGS + 16 * 10] + stp x22, x23, [x1, #SDEI_EVENT_INTREGS + 16 * 11] + stp x24, x25, [x1, #SDEI_EVENT_INTREGS + 16 * 12] + stp x26, x27, [x1, #SDEI_EVENT_INTREGS + 16 * 13] + stp x28, x29, [x1, #SDEI_EVENT_INTREGS + 16 * 14] + mov x4, sp + stp lr, x4, [x1, #SDEI_EVENT_INTREGS + S_LR] + + mov x19, x1 + +#ifdef CONFIG_VMAP_STACK + /* + * entry.S may have been using sp as a scratch register, find whether + * this is a normal or critical event and switch to the appropriate + * stack for this CPU. + */ + ldrb w4, [x19, #SDEI_EVENT_PRIORITY] + cbnz w4, 1f + ldr_this_cpu dst=x5, sym=sdei_stack_normal_ptr, tmp=x6 + b 2f +1: ldr_this_cpu dst=x5, sym=sdei_stack_critical_ptr, tmp=x6 +2: mov x6, #SDEI_STACK_SIZE + add x5, x5, x6 + mov sp, x5 +#endif + + /* + * We may have interrupted userspace, or a guest, or exit-from or + * return-to either of these. We can't trust sp_el0, restore it. + */ + mrs x28, sp_el0 + ldr_this_cpu dst=x0, sym=__entry_task, tmp=x1 + msr sp_el0, x0 + + /* If we interrupted the kernel point to the previous stack/frame. */ + and x0, x3, #0xc + mrs x1, CurrentEL + cmp x0, x1 + csel x29, x29, xzr, eq // fp, or zero + csel x4, x2, xzr, eq // elr, or zero + + stp x29, x4, [sp, #-16]! + mov x29, sp + + add x0, x19, #SDEI_EVENT_INTREGS + mov x1, x19 + bl __sdei_handler + + msr sp_el0, x28 + /* restore regs >x17 that we clobbered */ + mov x4, x19 // keep x4 for __sdei_asm_exit_trampoline + ldp x28, x29, [x4, #SDEI_EVENT_INTREGS + 16 * 14] + ldp x18, x19, [x4, #SDEI_EVENT_INTREGS + 16 * 9] + ldp lr, x1, [x4, #SDEI_EVENT_INTREGS + S_LR] + mov sp, x1 + + mov x1, x0 // address to complete_and_resume + /* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */ + cmp x0, #1 + mov_q x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE + mov_q x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME + csel x0, x2, x3, ls + + ldr_l x2, sdei_exit_mode + +alternative_if_not ARM64_UNMAP_KERNEL_AT_EL0 + sdei_handler_exit exit_mode=x2 +alternative_else_nop_endif + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_alias dst=x5, sym=__sdei_asm_exit_trampoline + br x5 +#endif +ENDPROC(__sdei_asm_handler) +NOKPROBE(__sdei_asm_handler) +#endif /* CONFIG_ARM_SDE_INTERFACE */ diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index fae81f7964b4..e7226c4c7493 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -867,7 +867,7 @@ asmlinkage void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) { siginfo_t info; - unsigned int si_code = 0; + unsigned int si_code = FPE_FIXME; if (esr & FPEXC_IOF) si_code = FPE_FLTINV; @@ -1036,14 +1036,14 @@ void fpsimd_restore_current_state(void) * flag that indicates that the FPSIMD register contents are the most recent * FPSIMD state of 'current' */ -void fpsimd_update_current_state(struct fpsimd_state *state) +void fpsimd_update_current_state(struct user_fpsimd_state const *state) { if (!system_supports_fpsimd()) return; local_bh_disable(); - current->thread.fpsimd_state.user_fpsimd = state->user_fpsimd; + current->thread.fpsimd_state.user_fpsimd = *state; if (system_supports_sve() && test_thread_flag(TIF_SVE)) fpsimd_to_sve(current); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index e3cb9fbf96b6..ba3ab04788dc 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -148,6 +148,26 @@ preserve_boot_args: ENDPROC(preserve_boot_args) /* + * Macro to arrange a physical address in a page table entry, taking care of + * 52-bit addresses. + * + * Preserves: phys + * Returns: pte + */ + .macro phys_to_pte, phys, pte +#ifdef CONFIG_ARM64_PA_BITS_52 + /* + * We assume \phys is 64K aligned and this is guaranteed by only + * supporting this configuration with 64K pages. + */ + orr \pte, \phys, \phys, lsr #36 + and \pte, \pte, #PTE_ADDR_MASK +#else + mov \pte, \phys +#endif + .endm + +/* * Macro to create a table entry to the next page. * * tbl: page table address @@ -156,54 +176,124 @@ ENDPROC(preserve_boot_args) * ptrs: #imm pointers per table page * * Preserves: virt - * Corrupts: tmp1, tmp2 + * Corrupts: ptrs, tmp1, tmp2 * Returns: tbl -> next level table page address */ .macro create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2 - lsr \tmp1, \virt, #\shift - and \tmp1, \tmp1, #\ptrs - 1 // table index - add \tmp2, \tbl, #PAGE_SIZE + add \tmp1, \tbl, #PAGE_SIZE + phys_to_pte \tmp1, \tmp2 orr \tmp2, \tmp2, #PMD_TYPE_TABLE // address of next table and entry type + lsr \tmp1, \virt, #\shift + sub \ptrs, \ptrs, #1 + and \tmp1, \tmp1, \ptrs // table index str \tmp2, [\tbl, \tmp1, lsl #3] add \tbl, \tbl, #PAGE_SIZE // next level table page .endm /* - * Macro to populate the PGD (and possibily PUD) for the corresponding - * block entry in the next level (tbl) for the given virtual address. + * Macro to populate page table entries, these entries can be pointers to the next level + * or last level entries pointing to physical memory. + * + * tbl: page table address + * rtbl: pointer to page table or physical memory + * index: start index to write + * eindex: end index to write - [index, eindex] written to + * flags: flags for pagetable entry to or in + * inc: increment to rtbl between each entry + * tmp1: temporary variable * - * Preserves: tbl, next, virt - * Corrupts: tmp1, tmp2 + * Preserves: tbl, eindex, flags, inc + * Corrupts: index, tmp1 + * Returns: rtbl */ - .macro create_pgd_entry, tbl, virt, tmp1, tmp2 - create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2 -#if SWAPPER_PGTABLE_LEVELS > 3 - create_table_entry \tbl, \virt, PUD_SHIFT, PTRS_PER_PUD, \tmp1, \tmp2 -#endif -#if SWAPPER_PGTABLE_LEVELS > 2 - create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2 -#endif + .macro populate_entries, tbl, rtbl, index, eindex, flags, inc, tmp1 +.Lpe\@: phys_to_pte \rtbl, \tmp1 + orr \tmp1, \tmp1, \flags // tmp1 = table entry + str \tmp1, [\tbl, \index, lsl #3] + add \rtbl, \rtbl, \inc // rtbl = pa next level + add \index, \index, #1 + cmp \index, \eindex + b.ls .Lpe\@ + .endm + +/* + * Compute indices of table entries from virtual address range. If multiple entries + * were needed in the previous page table level then the next page table level is assumed + * to be composed of multiple pages. (This effectively scales the end index). + * + * vstart: virtual address of start of range + * vend: virtual address of end of range + * shift: shift used to transform virtual address into index + * ptrs: number of entries in page table + * istart: index in table corresponding to vstart + * iend: index in table corresponding to vend + * count: On entry: how many extra entries were required in previous level, scales + * our end index. + * On exit: returns how many extra entries required for next page table level + * + * Preserves: vstart, vend, shift, ptrs + * Returns: istart, iend, count + */ + .macro compute_indices, vstart, vend, shift, ptrs, istart, iend, count + lsr \iend, \vend, \shift + mov \istart, \ptrs + sub \istart, \istart, #1 + and \iend, \iend, \istart // iend = (vend >> shift) & (ptrs - 1) + mov \istart, \ptrs + mul \istart, \istart, \count + add \iend, \iend, \istart // iend += (count - 1) * ptrs + // our entries span multiple tables + + lsr \istart, \vstart, \shift + mov \count, \ptrs + sub \count, \count, #1 + and \istart, \istart, \count + + sub \count, \iend, \istart .endm /* - * Macro to populate block entries in the page table for the start..end - * virtual range (inclusive). + * Map memory for specified virtual address range. Each level of page table needed supports + * multiple entries. If a level requires n entries the next page table level is assumed to be + * formed from n pages. + * + * tbl: location of page table + * rtbl: address to be used for first level page table entry (typically tbl + PAGE_SIZE) + * vstart: start address to map + * vend: end address to map - we map [vstart, vend] + * flags: flags to use to map last level entries + * phys: physical address corresponding to vstart - physical memory is contiguous + * pgds: the number of pgd entries * - * Preserves: tbl, flags - * Corrupts: phys, start, end, pstate + * Temporaries: istart, iend, tmp, count, sv - these need to be different registers + * Preserves: vstart, vend, flags + * Corrupts: tbl, rtbl, istart, iend, tmp, count, sv */ - .macro create_block_map, tbl, flags, phys, start, end - lsr \phys, \phys, #SWAPPER_BLOCK_SHIFT - lsr \start, \start, #SWAPPER_BLOCK_SHIFT - and \start, \start, #PTRS_PER_PTE - 1 // table index - orr \phys, \flags, \phys, lsl #SWAPPER_BLOCK_SHIFT // table entry - lsr \end, \end, #SWAPPER_BLOCK_SHIFT - and \end, \end, #PTRS_PER_PTE - 1 // table end index -9999: str \phys, [\tbl, \start, lsl #3] // store the entry - add \start, \start, #1 // next entry - add \phys, \phys, #SWAPPER_BLOCK_SIZE // next block - cmp \start, \end - b.ls 9999b + .macro map_memory, tbl, rtbl, vstart, vend, flags, phys, pgds, istart, iend, tmp, count, sv + add \rtbl, \tbl, #PAGE_SIZE + mov \sv, \rtbl + mov \count, #0 + compute_indices \vstart, \vend, #PGDIR_SHIFT, \pgds, \istart, \iend, \count + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv + mov \sv, \rtbl + +#if SWAPPER_PGTABLE_LEVELS > 3 + compute_indices \vstart, \vend, #PUD_SHIFT, #PTRS_PER_PUD, \istart, \iend, \count + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv + mov \sv, \rtbl +#endif + +#if SWAPPER_PGTABLE_LEVELS > 2 + compute_indices \vstart, \vend, #SWAPPER_TABLE_SHIFT, #PTRS_PER_PMD, \istart, \iend, \count + populate_entries \tbl, \rtbl, \istart, \iend, #PMD_TYPE_TABLE, #PAGE_SIZE, \tmp + mov \tbl, \sv +#endif + + compute_indices \vstart, \vend, #SWAPPER_BLOCK_SHIFT, #PTRS_PER_PTE, \istart, \iend, \count + bic \count, \phys, #SWAPPER_BLOCK_SIZE - 1 + populate_entries \tbl, \count, \istart, \iend, \flags, #SWAPPER_BLOCK_SIZE, \tmp .endm /* @@ -221,14 +311,16 @@ __create_page_tables: * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + adrp x1, swapper_pg_end + sub x1, x1, x0 bl __inval_dcache_area /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + adrp x1, swapper_pg_end + sub x1, x1, x0 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -244,26 +336,13 @@ __create_page_tables: adrp x0, idmap_pg_dir adrp x3, __idmap_text_start // __pa(__idmap_text_start) -#ifndef CONFIG_ARM64_VA_BITS_48 -#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) -#define EXTRA_PTRS (1 << (48 - EXTRA_SHIFT)) - - /* - * If VA_BITS < 48, it may be too small to allow for an ID mapping to be - * created that covers system RAM if that is located sufficiently high - * in the physical address space. So for the ID map, use an extended - * virtual range in that case, by configuring an additional translation - * level. - * First, we have to verify our assumption that the current value of - * VA_BITS was chosen such that all translation levels are fully - * utilised, and that lowering T0SZ will always result in an additional - * translation level to be configured. - */ -#if VA_BITS != EXTRA_SHIFT -#error "Mismatch between VA_BITS and page size/number of translation levels" -#endif - /* + * VA_BITS may be too small to allow for an ID mapping to be created + * that covers system RAM if that is located sufficiently high in the + * physical address space. So for the ID map, use an extended virtual + * range in that case, and configure an additional translation level + * if needed. + * * Calculate the maximum allowed value for TCR_EL1.T0SZ so that the * entire ID map region can be mapped. As T0SZ == (64 - #bits used), * this number conveniently equals the number of leading zeroes in @@ -272,21 +351,44 @@ __create_page_tables: adrp x5, __idmap_text_end clz x5, x5 cmp x5, TCR_T0SZ(VA_BITS) // default T0SZ small enough? - b.ge 1f // .. then skip additional level + b.ge 1f // .. then skip VA range extension adr_l x6, idmap_t0sz str x5, [x6] dmb sy dc ivac, x6 // Invalidate potentially stale cache line - create_table_entry x0, x3, EXTRA_SHIFT, EXTRA_PTRS, x5, x6 -1: +#if (VA_BITS < 48) +#define EXTRA_SHIFT (PGDIR_SHIFT + PAGE_SHIFT - 3) +#define EXTRA_PTRS (1 << (PHYS_MASK_SHIFT - EXTRA_SHIFT)) + + /* + * If VA_BITS < 48, we have to configure an additional table level. + * First, we have to verify our assumption that the current value of + * VA_BITS was chosen such that all translation levels are fully + * utilised, and that lowering T0SZ will always result in an additional + * translation level to be configured. + */ +#if VA_BITS != EXTRA_SHIFT +#error "Mismatch between VA_BITS and page size/number of translation levels" #endif - create_pgd_entry x0, x3, x5, x6 + mov x4, EXTRA_PTRS + create_table_entry x0, x3, EXTRA_SHIFT, x4, x5, x6 +#else + /* + * If VA_BITS == 48, we don't have to configure an additional + * translation level, but the top-level table has more entries. + */ + mov x4, #1 << (PHYS_MASK_SHIFT - PGDIR_SHIFT) + str_l x4, idmap_ptrs_per_pgd, x5 +#endif +1: + ldr_l x4, idmap_ptrs_per_pgd mov x5, x3 // __pa(__idmap_text_start) adr_l x6, __idmap_text_end // __pa(__idmap_text_end) - create_block_map x0, x7, x3, x5, x6 + + map_memory x0, x1, x3, x6, x7, x3, x4, x10, x11, x12, x13, x14 /* * Map the kernel image (starting with PHYS_OFFSET). @@ -294,12 +396,13 @@ __create_page_tables: adrp x0, swapper_pg_dir mov_q x5, KIMAGE_VADDR + TEXT_OFFSET // compile time __va(_text) add x5, x5, x23 // add KASLR displacement - create_pgd_entry x0, x5, x3, x6 + mov x4, PTRS_PER_PGD adrp x6, _end // runtime __pa(_end) adrp x3, _text // runtime __pa(_text) sub x6, x6, x3 // _end - _text add x6, x6, x5 // runtime __va(_end) - create_block_map x0, x7, x3, x5, x6 + + map_memory x0, x1, x5, x6, x7, x3, x4, x10, x11, x12, x13, x14 /* * Since the page tables have been populated with non-cacheable @@ -307,7 +410,8 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + adrp x1, swapper_pg_end + sub x1, x1, x0 dmb sy bl __inval_dcache_area @@ -388,17 +492,13 @@ ENTRY(el2_setup) mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq 1f - mrs x0, sctlr_el1 -CPU_BE( orr x0, x0, #(3 << 24) ) // Set the EE and E0E bits for EL1 -CPU_LE( bic x0, x0, #(3 << 24) ) // Clear the EE and E0E bits for EL1 + mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) msr sctlr_el1, x0 mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 isb ret -1: mrs x0, sctlr_el2 -CPU_BE( orr x0, x0, #(1 << 25) ) // Set the EE bit for EL2 -CPU_LE( bic x0, x0, #(1 << 25) ) // Clear the EE bit for EL2 +1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) msr sctlr_el2, x0 #ifdef CONFIG_ARM64_VHE @@ -514,10 +614,7 @@ install_el2_stub: * requires no configuration, and all non-hyp-specific EL2 setup * will be done via the _EL1 system register aliases in __cpu_setup. */ - /* sctlr_el1 */ - mov x0, #0x0800 // Set/clear RES{1,0} bits -CPU_BE( movk x0, #0x33d0, lsl #16 ) // Set EE and E0E on BE systems -CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems + mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) msr sctlr_el1, x0 /* Coprocessor traps. */ @@ -679,8 +776,10 @@ ENTRY(__enable_mmu) update_early_cpu_boot_status 0, x1, x2 adrp x1, idmap_pg_dir adrp x2, swapper_pg_dir - msr ttbr0_el1, x1 // load TTBR0 - msr ttbr1_el1, x2 // load TTBR1 + phys_to_ttbr x1, x3 + phys_to_ttbr x2, x4 + msr ttbr0_el1, x3 // load TTBR0 + msr ttbr1_el1, x4 // load TTBR1 isb msr sctlr_el1, x0 isb diff --git a/arch/arm64/kernel/hibernate-asm.S b/arch/arm64/kernel/hibernate-asm.S index e56d848b6466..84f5d52fddda 100644 --- a/arch/arm64/kernel/hibernate-asm.S +++ b/arch/arm64/kernel/hibernate-asm.S @@ -33,12 +33,14 @@ * Even switching to our copied tables will cause a changed output address at * each stage of the walk. */ -.macro break_before_make_ttbr_switch zero_page, page_table - msr ttbr1_el1, \zero_page +.macro break_before_make_ttbr_switch zero_page, page_table, tmp + phys_to_ttbr \zero_page, \tmp + msr ttbr1_el1, \tmp isb tlbi vmalle1 dsb nsh - msr ttbr1_el1, \page_table + phys_to_ttbr \page_table, \tmp + msr ttbr1_el1, \tmp isb .endm @@ -78,7 +80,7 @@ ENTRY(swsusp_arch_suspend_exit) * We execute from ttbr0, change ttbr1 to our copied linear map tables * with a break-before-make via the zero page */ - break_before_make_ttbr_switch x5, x0 + break_before_make_ttbr_switch x5, x0, x6 mov x21, x1 mov x30, x2 @@ -109,7 +111,7 @@ ENTRY(swsusp_arch_suspend_exit) dsb ish /* wait for PoU cleaning to finish */ /* switch to the restored kernels page tables */ - break_before_make_ttbr_switch x25, x21 + break_before_make_ttbr_switch x25, x21, x6 ic ialluis dsb ish diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 3009b8b80f08..f20cf7e99249 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -247,8 +247,7 @@ static int create_safe_exec_page(void *src_start, size_t length, } pte = pte_offset_kernel(pmd, dst_addr); - set_pte(pte, __pte(virt_to_phys((void *)dst) | - pgprot_val(PAGE_KERNEL_EXEC))); + set_pte(pte, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC)); /* * Load our new page tables. A strict BBM approach requires that we @@ -264,7 +263,7 @@ static int create_safe_exec_page(void *src_start, size_t length, */ cpu_set_reserved_ttbr0(); local_flush_tlb_all(); - write_sysreg(virt_to_phys(pgd), ttbr0_el1); + write_sysreg(phys_to_ttbr(virt_to_phys(pgd)), ttbr0_el1); isb(); *phys_dst_addr = virt_to_phys((void *)dst); diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 713561e5bcab..60e5fc661f74 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -29,6 +29,7 @@ #include <linux/irqchip.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <asm/vmap_stack.h> unsigned long irq_err_count; @@ -58,17 +59,7 @@ static void init_irq_stacks(void) unsigned long *p; for_each_possible_cpu(cpu) { - /* - * To ensure that VMAP'd stack overflow detection works - * correctly, the IRQ stacks need to have the same - * alignment as other stacks. - */ - p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP, PAGE_KERNEL, - 0, cpu_to_node(cpu), - __builtin_return_address(0)); - + p = arch_alloc_vmap_stack(IRQ_STACK_SIZE, cpu_to_node(cpu)); per_cpu(irq_stack_ptr, cpu) = p; } } diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 6b7dcf4310ac..ad8aeb098b31 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -35,7 +35,6 @@ #include <linux/delay.h> #include <linux/reboot.h> #include <linux/interrupt.h> -#include <linux/kallsyms.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/elfcore.h> @@ -221,8 +220,8 @@ void __show_regs(struct pt_regs *regs) show_regs_print_info(KERN_DEFAULT); print_pstate(regs); - print_symbol("pc : %s\n", regs->pc); - print_symbol("lr : %s\n", lr); + printk("pc : %pS\n", (void *)regs->pc); + printk("lr : %pS\n", (void *)lr); printk("sp : %016llx\n", sp); i = top_reg; @@ -370,16 +369,14 @@ void tls_preserve_current_state(void) static void tls_thread_switch(struct task_struct *next) { - unsigned long tpidr, tpidrro; - tls_preserve_current_state(); - tpidr = *task_user_tls(next); - tpidrro = is_compat_thread(task_thread_info(next)) ? - next->thread.tp_value : 0; + if (is_compat_thread(task_thread_info(next))) + write_sysreg(next->thread.tp_value, tpidrro_el0); + else if (!arm64_kernel_unmapped_at_el0()) + write_sysreg(0, tpidrro_el0); - write_sysreg(tpidr, tpidr_el0); - write_sysreg(tpidrro, tpidrro_el0); + write_sysreg(*task_user_tls(next), tpidr_el0); } /* Restore the UAO state depending on next's addr_limit */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 7c44658b316d..6618036ae6d4 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -180,34 +180,34 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct pt_regs *regs) { struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); - siginfo_t info = { - .si_signo = SIGTRAP, - .si_errno = 0, - .si_code = TRAP_HWBKPT, - .si_addr = (void __user *)(bkpt->trigger), - }; + siginfo_t info; -#ifdef CONFIG_COMPAT - int i; + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_HWBKPT; + info.si_addr = (void __user *)(bkpt->trigger); - if (!is_compat_task()) - goto send_sig; +#ifdef CONFIG_COMPAT + if (is_compat_task()) { + int si_errno = 0; + int i; - for (i = 0; i < ARM_MAX_BRP; ++i) { - if (current->thread.debug.hbp_break[i] == bp) { - info.si_errno = (i << 1) + 1; - break; + for (i = 0; i < ARM_MAX_BRP; ++i) { + if (current->thread.debug.hbp_break[i] == bp) { + si_errno = (i << 1) + 1; + break; + } } - } - for (i = 0; i < ARM_MAX_WRP; ++i) { - if (current->thread.debug.hbp_watch[i] == bp) { - info.si_errno = -((i << 1) + 1); - break; + for (i = 0; i < ARM_MAX_WRP; ++i) { + if (current->thread.debug.hbp_watch[i] == bp) { + si_errno = -((i << 1) + 1); + break; + } } + force_sig_ptrace_errno_trap(si_errno, (void __user *)bkpt->trigger); } - -send_sig: #endif force_sig_info(SIGTRAP, &info, current); } diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c new file mode 100644 index 000000000000..6b8d90d5ceae --- /dev/null +++ b/arch/arm64/kernel/sdei.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (C) 2017 Arm Ltd. +#define pr_fmt(fmt) "sdei: " fmt + +#include <linux/arm_sdei.h> +#include <linux/hardirq.h> +#include <linux/irqflags.h> +#include <linux/sched/task_stack.h> +#include <linux/uaccess.h> + +#include <asm/alternative.h> +#include <asm/kprobes.h> +#include <asm/mmu.h> +#include <asm/ptrace.h> +#include <asm/sections.h> +#include <asm/sysreg.h> +#include <asm/vmap_stack.h> + +unsigned long sdei_exit_mode; + +/* + * VMAP'd stacks checking for stack overflow on exception using sp as a scratch + * register, meaning SDEI has to switch to its own stack. We need two stacks as + * a critical event may interrupt a normal event that has just taken a + * synchronous exception, and is using sp as scratch register. For a critical + * event interrupting a normal event, we can't reliably tell if we were on the + * sdei stack. + * For now, we allocate stacks when the driver is probed. + */ +DECLARE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); + +#ifdef CONFIG_VMAP_STACK +DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); +#endif + +static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu) +{ + unsigned long *p; + + p = per_cpu(*ptr, cpu); + if (p) { + per_cpu(*ptr, cpu) = NULL; + vfree(p); + } +} + +static void free_sdei_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + _free_sdei_stack(&sdei_stack_normal_ptr, cpu); + _free_sdei_stack(&sdei_stack_critical_ptr, cpu); + } +} + +static int _init_sdei_stack(unsigned long * __percpu *ptr, int cpu) +{ + unsigned long *p; + + p = arch_alloc_vmap_stack(SDEI_STACK_SIZE, cpu_to_node(cpu)); + if (!p) + return -ENOMEM; + per_cpu(*ptr, cpu) = p; + + return 0; +} + +static int init_sdei_stacks(void) +{ + int cpu; + int err = 0; + + for_each_possible_cpu(cpu) { + err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_stack(&sdei_stack_critical_ptr, cpu); + if (err) + break; + } + + if (err) + free_sdei_stacks(); + + return err; +} + +bool _on_sdei_stack(unsigned long sp) +{ + unsigned long low, high; + + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return false; + + low = (unsigned long)raw_cpu_read(sdei_stack_critical_ptr); + high = low + SDEI_STACK_SIZE; + + if (low <= sp && sp < high) + return true; + + low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); + high = low + SDEI_STACK_SIZE; + + return (low <= sp && sp < high); +} + +unsigned long sdei_arch_get_entry_point(int conduit) +{ + /* + * SDEI works between adjacent exception levels. If we booted at EL1 we + * assume a hypervisor is marshalling events. If we booted at EL2 and + * dropped to EL1 because we don't support VHE, then we can't support + * SDEI. + */ + if (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) { + pr_err("Not supported on this hardware/boot configuration\n"); + return 0; + } + + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + if (init_sdei_stacks()) + return 0; + } + + sdei_exit_mode = (conduit == CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + if (arm64_kernel_unmapped_at_el0()) { + unsigned long offset; + + offset = (unsigned long)__sdei_asm_entry_trampoline - + (unsigned long)__entry_tramp_text_start; + return TRAMP_VALIAS + offset; + } else +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + return (unsigned long)__sdei_asm_handler; + +} + +/* + * __sdei_handler() returns one of: + * SDEI_EV_HANDLED - success, return to the interrupted context. + * SDEI_EV_FAILED - failure, return this error code to firmare. + * virtual-address - success, return to this address. + */ +static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, + struct sdei_registered_event *arg) +{ + u32 mode; + int i, err = 0; + int clobbered_registers = 4; + u64 elr = read_sysreg(elr_el1); + u32 kernel_mode = read_sysreg(CurrentEL) | 1; /* +SPSel */ + unsigned long vbar = read_sysreg(vbar_el1); + + if (arm64_kernel_unmapped_at_el0()) + clobbered_registers++; + + /* Retrieve the missing registers values */ + for (i = 0; i < clobbered_registers; i++) { + /* from within the handler, this call always succeeds */ + sdei_api_event_context(i, ®s->regs[i]); + } + + /* + * We didn't take an exception to get here, set PAN. UAO will be cleared + * by sdei_event_handler()s set_fs(USER_DS) call. + */ + __uaccess_enable_hw_pan(); + + err = sdei_event_handler(regs, arg); + if (err) + return SDEI_EV_FAILED; + + if (elr != read_sysreg(elr_el1)) { + /* + * We took a synchronous exception from the SDEI handler. + * This could deadlock, and if you interrupt KVM it will + * hyp-panic instead. + */ + pr_warn("unsafe: exception during handler\n"); + } + + mode = regs->pstate & (PSR_MODE32_BIT | PSR_MODE_MASK); + + /* + * If we interrupted the kernel with interrupts masked, we always go + * back to wherever we came from. + */ + if (mode == kernel_mode && !interrupts_enabled(regs)) + return SDEI_EV_HANDLED; + + /* + * Otherwise, we pretend this was an IRQ. This lets user space tasks + * receive signals before we return to them, and KVM to invoke it's + * world switch to do the same. + * + * See DDI0487B.a Table D1-7 'Vector offsets from vector table base + * address'. + */ + if (mode == kernel_mode) + return vbar + 0x280; + else if (mode & PSR_MODE32_BIT) + return vbar + 0x680; + + return vbar + 0x480; +} + + +asmlinkage __kprobes notrace unsigned long +__sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) +{ + unsigned long ret; + bool do_nmi_exit = false; + + /* + * nmi_enter() deals with printk() re-entrance and use of RCU when + * RCU believed this CPU was idle. Because critical events can + * interrupt normal events, we may already be in_nmi(). + */ + if (!in_nmi()) { + nmi_enter(); + do_nmi_exit = true; + } + + ret = _sdei_handler(regs, arg); + + if (do_nmi_exit) + nmi_exit(); + + return ret; +} diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index b120111a46be..f60c052e8d1c 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -178,7 +178,8 @@ static void __user *apply_user_offset( static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) { - struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; + struct user_fpsimd_state const *fpsimd = + ¤t->thread.fpsimd_state.user_fpsimd; int err; /* copy the FP and status/control registers */ @@ -195,7 +196,7 @@ static int preserve_fpsimd_context(struct fpsimd_context __user *ctx) static int restore_fpsimd_context(struct fpsimd_context __user *ctx) { - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; __u32 magic, size; int err = 0; @@ -266,7 +267,7 @@ static int restore_sve_fpsimd_context(struct user_ctxs *user) { int err; unsigned int vq; - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; struct sve_context sve; if (__copy_from_user(&sve, user->sve, sizeof(sve))) diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 22711ee8e36c..79feb861929b 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -125,86 +125,6 @@ static inline int get_sigset_t(sigset_t *set, return 0; } -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err; - - if (!access_ok(VERIFY_WRITE, to, sizeof(*to))) - return -EFAULT; - - /* If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. - * This routine must convert siginfo from 64bit to 32bit as well - * at the same time. - */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - if (from->si_code < 0) - err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, - SI_PAD_SIZE); - else switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_FAULT: - err |= __put_user((compat_uptr_t)(unsigned long)from->si_addr, - &to->si_addr); -#ifdef BUS_MCEERR_AO - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. - */ - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif - break; - case SIL_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_SYS: - err |= __put_user((compat_uptr_t)(unsigned long) - from->si_call_addr, &to->si_call_addr); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - return err; -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - if (copy_from_user(to, from, __ARCH_SI_PREAMBLE_SIZE) || - copy_from_user(to->_sifields._pad, - from->_sifields._pad, SI_PAD_SIZE)) - return -EFAULT; - - return 0; -} - /* * VFP save/restore code. * @@ -228,7 +148,8 @@ union __fpsimd_vreg { static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) { - struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; + struct user_fpsimd_state const *fpsimd = + ¤t->thread.fpsimd_state.user_fpsimd; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr, fpexc; @@ -277,7 +198,7 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) { - struct fpsimd_state fpsimd; + struct user_fpsimd_state fpsimd; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr; diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 551eb07c53b6..3b8ad7be9c33 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -18,6 +18,7 @@ */ #include <linux/acpi.h> +#include <linux/arm_sdei.h> #include <linux/delay.h> #include <linux/init.h> #include <linux/spinlock.h> @@ -836,6 +837,7 @@ static void ipi_cpu_stop(unsigned int cpu) set_cpu_online(cpu, false); local_daif_mask(); + sdei_mask_local_cpu(); while (1) cpu_relax(); @@ -853,6 +855,7 @@ static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) atomic_dec(&waiting_for_crash_ipi); local_irq_disable(); + sdei_mask_local_cpu(); #ifdef CONFIG_HOTPLUG_CPU if (cpu_ops[cpu]->cpu_die) @@ -972,6 +975,8 @@ void smp_send_stop(void) if (num_online_cpus() > 1) pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", cpumask_pr_args(cpu_online_mask)); + + sdei_mask_local_cpu(); } #ifdef CONFIG_KEXEC_CORE @@ -990,8 +995,10 @@ void crash_smp_send_stop(void) cpus_stopped = 1; - if (num_online_cpus() == 1) + if (num_online_cpus() == 1) { + sdei_mask_local_cpu(); return; + } cpumask_copy(&mask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &mask); @@ -1009,6 +1016,8 @@ void crash_smp_send_stop(void) if (atomic_read(&waiting_for_crash_ipi) > 0) pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", cpumask_pr_args(&mask)); + + sdei_mask_local_cpu(); } bool smp_crash_stop_failed(void) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 3fe5ad884418..a307b9e13392 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -2,6 +2,7 @@ #include <linux/ftrace.h> #include <linux/percpu.h> #include <linux/slab.h> +#include <linux/uaccess.h> #include <asm/alternative.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> @@ -51,8 +52,7 @@ void notrace __cpu_suspend_exit(void) * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. */ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, - CONFIG_ARM64_PAN)); + __uaccess_enable_hw_pan(); uao_thread_switch(current); /* diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 8d48b233e6ce..21868530018e 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -37,18 +37,14 @@ static int __init get_cpu_for_node(struct device_node *node) if (!cpu_node) return -1; - for_each_possible_cpu(cpu) { - if (of_get_cpu_node(cpu, NULL) == cpu_node) { - topology_parse_cpu_capacity(cpu_node, cpu); - of_node_put(cpu_node); - return cpu; - } - } - - pr_crit("Unable to find CPU node for %pOF\n", cpu_node); + cpu = of_cpu_node_to_id(cpu_node); + if (cpu >= 0) + topology_parse_cpu_capacity(cpu_node, cpu); + else + pr_crit("Unable to find CPU node for %pOF\n", cpu_node); of_node_put(cpu_node); - return -1; + return cpu; } static int __init parse_core(struct device_node *core, int cluster_id, diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 3d3588fcd1c7..bbb0fde2780e 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -662,17 +662,58 @@ asmlinkage void handle_bad_stack(struct pt_regs *regs) } #endif -asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) +void __noreturn arm64_serror_panic(struct pt_regs *regs, u32 esr) { - nmi_enter(); - console_verbose(); pr_crit("SError Interrupt on CPU%d, code 0x%08x -- %s\n", smp_processor_id(), esr, esr_get_class_string(esr)); - __show_regs(regs); + if (regs) + __show_regs(regs); + + nmi_panic(regs, "Asynchronous SError Interrupt"); + + cpu_park_loop(); + unreachable(); +} + +bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned int esr) +{ + u32 aet = arm64_ras_serror_get_severity(esr); + + switch (aet) { + case ESR_ELx_AET_CE: /* corrected error */ + case ESR_ELx_AET_UEO: /* restartable, not yet consumed */ + /* + * The CPU can make progress. We may take UEO again as + * a more severe error. + */ + return false; + + case ESR_ELx_AET_UEU: /* Uncorrected Unrecoverable */ + case ESR_ELx_AET_UER: /* Uncorrected Recoverable */ + /* + * The CPU can't make progress. The exception may have + * been imprecise. + */ + return true; + + case ESR_ELx_AET_UC: /* Uncontainable or Uncategorized error */ + default: + /* Error has been silently propagated */ + arm64_serror_panic(regs, esr); + } +} + +asmlinkage void do_serror(struct pt_regs *regs, unsigned int esr) +{ + nmi_enter(); + + /* non-RAS errors are not containable */ + if (!arm64_is_ras_serror(esr) || arm64_is_fatal_ras_serror(regs, esr)) + arm64_serror_panic(regs, esr); - panic("Asynchronous SError Interrupt"); + nmi_exit(); } void __pte_error(const char *file, int line, unsigned long val) diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 7da3e5c366a0..0221aca6493d 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -57,6 +57,17 @@ jiffies = jiffies_64; #define HIBERNATE_TEXT #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_TEXT \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_start) = .; \ + *(.entry.tramp.text) \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_end) = .; +#else +#define TRAMP_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which * runs from stext to _edata, must be a round multiple of the PE/COFF @@ -113,6 +124,7 @@ SECTIONS HYPERVISOR_TEXT IDMAP_TEXT HIBERNATE_TEXT + TRAMP_TEXT *(.fixup) *(.gnu.warning) . = ALIGN(16); @@ -206,13 +218,19 @@ SECTIONS . = ALIGN(PAGE_SIZE); idmap_pg_dir = .; . += IDMAP_DIR_SIZE; - swapper_pg_dir = .; - . += SWAPPER_DIR_SIZE; + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_pg_dir = .; + . += PAGE_SIZE; +#endif #ifdef CONFIG_ARM64_SW_TTBR0_PAN reserved_ttbr0 = .; . += RESERVED_TTBR0_SIZE; #endif + swapper_pg_dir = .; + . += SWAPPER_DIR_SIZE; + swapper_pg_end = .; __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; @@ -234,7 +252,10 @@ ASSERT(__idmap_text_end - (__idmap_text_start & ~(SZ_4K - 1)) <= SZ_4K, ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif - +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, + "Entry trampoline text too big") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ |